using System;
using System.Collections.Generic;
using System.Configuration;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using MySql.Data.MySqlClient;
namespace DOSApp1
{
    /// <summary>
    /// Syllable grabber: reads words from the <c>all_word</c> table in batches, POSTs them to
    /// <see cref="TARGET_URL"/>, extracts the hyphenated forms from the returned HTML and writes
    /// them back to the <c>syllable</c> column.
    /// </summary>
    public class Grabber
    {
        /// <summary>
        /// URL the word list is POSTed to.
        /// NOTE(review): the host part of this URL looks incomplete ("www.") - confirm the real endpoint.
        /// </summary>
        private const String TARGET_URL = "http://www./syllables/";

        /// <summary>
        /// Selects the words whose id lies in the inclusive range [@startID, @overID].
        /// </summary>
        private const String SQL_GetWordList = "SELECT `id`, `english` FROM `all_word` WHERE `id` >= @startID AND `id` <= @overID ORDER BY `id` ASC";

        /// <summary>
        /// Stores the syllable text of a single word.
        /// </summary>
        private const String SQL_UpdateWordList = "UPDATE `all_word` SET `syllable` = @syllable WHERE `id` = @id";

        /// <summary>
        /// Number of ids covered by one database read / one HTTP request.
        /// </summary>
        private const int BatchSize = 20;

        /// <summary>
        /// Matches any run of characters that are neither word characters nor literal parentheses
        /// (inside a character class the parentheses are literals). A word that matches is treated
        /// as a phrase and is not posted.
        /// NOTE(review): parentheses are therefore allowed in "single words" - confirm this is intended.
        /// </summary>
        private static readonly Regex PhraseRegex = new Regex(@"[^(\w)]+");

        /// <summary>
        /// Locates the result textarea (including its tags) in the response HTML.
        /// </summary>
        private static readonly Regex SyllableRegex = new Regex(@"<textarea cols=""48"" rows=""10"" id=""inputText"" name=""inputText"">.*?<\/textarea>", RegexOptions.Singleline);

        /// <summary>
        /// Matches any HTML tag; used to strip the textarea tags from the match.
        /// </summary>
        private static readonly Regex TagRegex = new Regex(@"<[^>]*>");

        #region Constructors
        /// <summary>
        /// Creates a grabber for the inclusive id range [<paramref name="startID"/>, <paramref name="overID"/>].
        /// </summary>
        /// <param name="startID">First word id to process.</param>
        /// <param name="overID">Last word id to process.</param>
        public Grabber(int startID, int overID)
        {
            this.StartID = startID;
            this.OverID = overID;
        }
        #endregion

        /// <summary>
        /// Gets the first word id to process.
        /// </summary>
        public int StartID
        {
            get;
            protected set;
        }

        /// <summary>
        /// Gets the last word id to process.
        /// </summary>
        public int OverID
        {
            get;
            protected set;
        }

        /// <summary>
        /// Processes the configured id range in batches of <see cref="BatchSize"/> ids:
        /// load the words, post them, parse the response, store the syllables and print the outcome.
        /// Batches that contain no words or yield an empty response are skipped.
        /// </summary>
        public void StartGrab()
        {
            // The cursor is a long so that "i += BatchSize" cannot wrap around when OverID is
            // close to Int32.MaxValue.
            for (long i = this.StartID; i <= this.OverID; i += Grabber.BatchSize)
            {
                int startID = (int)i;
                // The SQL range is inclusive on both ends, so a batch is [i, i + BatchSize - 1];
                // it is also clamped so the last batch never reads past OverID.
                int overID = (int)Math.Min(i + Grabber.BatchSize - 1, (long)this.OverID);

                IList<Word> wordList = this.GetWordList(startID, overID);
                if (wordList == null || wordList.Count <= 0)
                {
                    continue;
                }

                String responseText = this.PostWord(wordList);
                if (String.IsNullOrEmpty(responseText))
                {
                    continue;
                }

                IList<String> syllableList = this.ExtractSyllable(responseText);
                this.PutWordSyllable(wordList, syllableList);
                this.UpdateWordList(wordList);
                this.PrintResult(wordList);
            }
        }

        /// <summary>
        /// Loads the words whose id lies in the inclusive range
        /// [<paramref name="startID"/>, <paramref name="overID"/>], ordered by id.
        /// </summary>
        /// <param name="startID">Lowest id to load.</param>
        /// <param name="overID">Highest id to load.</param>
        /// <returns>The loaded words (with ID and English set); empty when no row matches.</returns>
        private IList<Word> GetWordList(int startID, int overID)
        {
            List<Word> wordList = new List<Word>();

            // "using" closes the connection and releases the command/reader even when an
            // exception escapes; exceptions still propagate to the caller unchanged.
            using (MySqlConnection sqlConn = new MySqlConnection(ConfigurationManager.ConnectionStrings["MySQL5"].ConnectionString))
            using (MySqlCommand sqlCmd = new MySqlCommand(Grabber.SQL_GetWordList, sqlConn))
            {
                sqlCmd.Parameters.AddWithValue("@startID", startID);
                sqlCmd.Parameters.AddWithValue("@overID", overID);

                sqlConn.Open();

                using (MySqlDataReader dr = sqlCmd.ExecuteReader())
                {
                    while (dr.Read())
                    {
                        Word w = new Word();
                        w.ID = Convert.ToInt32(dr["id"]);
                        w.English = Convert.ToString(dr["english"]);
                        wordList.Add(w);
                    }
                }
            }

            return wordList;
        }

        /// <summary>
        /// POSTs the single words of <paramref name="wordList"/> (one per line, phrases excluded)
        /// to <see cref="TARGET_URL"/> as the form field <c>inputText</c> and returns the response body.
        /// </summary>
        /// <param name="wordList">Words of the current batch.</param>
        /// <returns>The response body as text; an empty string when the list is null or empty.</returns>
        private String PostWord(IList<Word> wordList)
        {
            if (wordList == null || wordList.Count <= 0)
            {
                return "";
            }

            StringBuilder text = new StringBuilder();

            foreach (Word w in wordList)
            {
                // Nothing to look up for a missing word; a null would also make the regex throw.
                if (w == null || String.IsNullOrEmpty(w.English))
                {
                    continue;
                }

                // Only single words are posted; phrases are left out.
                if (!Grabber.PhraseRegex.IsMatch(w.English))
                {
                    text.Append(w.English).Append("\r\n");
                }
            }

            // NOTE(review): the request is still sent when no word survived the filter above.

            // The body is declared as application/x-www-form-urlencoded, so the value must be
            // percent-encoded (line breaks, parentheses and non-ASCII letters included).
            byte[] postContent = Encoding.UTF8.GetBytes(String.Format("inputText={0}", Uri.EscapeDataString(text.ToString())));

            WebRequest request = WebRequest.Create(Grabber.TARGET_URL);
            request.Method = "POST";
            request.ContentType = "application/x-www-form-urlencoded";
            request.ContentLength = postContent.LongLength;

            using (Stream requestStream = request.GetRequestStream())
            {
                requestStream.Write(postContent, 0, postContent.Length);
            }

            // Response, stream and reader are all disposed so the underlying connection is
            // released after every batch.
            using (WebResponse response = request.GetResponse())
            using (Stream responseStream = response.GetResponseStream())
            using (StreamReader sr = new StreamReader(responseStream))
            {
                return sr.ReadToEnd();
            }
        }

        /// <summary>
        /// Extracts the content of the result textarea from the response HTML and splits it
        /// into one entry per line.
        /// </summary>
        /// <param name="src">Response HTML.</param>
        /// <returns>
        /// The textarea lines (empty entries are kept), or null when <paramref name="src"/> is
        /// null/empty or does not contain the textarea.
        /// </returns>
        private IList<String> ExtractSyllable(String src)
        {
            if (String.IsNullOrEmpty(src))
            {
                return null;
            }

            // Regex.Match never returns null; a failed search is reported through Success.
            Match syllableMatch = Grabber.SyllableRegex.Match(src);
            if (!syllableMatch.Success)
            {
                return null;
            }

            // Drop the surrounding <textarea> tags (and any other tag inside the match).
            String syllable = Grabber.TagRegex.Replace(syllableMatch.Value, "");

            // Entries are separated by line breaks; a literal ';' is treated as a separator too,
            // as before. The string[] overload is used because it exists on every .NET version.
            return syllable.Split(new String[] { "\r\n", "\n", ";" }, StringSplitOptions.None);
        }

        /// <summary>
        /// Assigns each syllable string to the word(s) it belongs to. A syllable belongs to a word
        /// when removing its hyphens yields exactly the word's English text.
        /// </summary>
        /// <param name="wordList">Words of the current batch; their Syllable is updated in place.</param>
        /// <param name="syllableList">Hyphenated forms extracted from the response.</param>
        private void PutWordSyllable(IList<Word> wordList, IList<String> syllableList)
        {
            if (wordList == null || wordList.Count <= 0)
            {
                return;
            }
            if (syllableList == null || syllableList.Count <= 0)
            {
                return;
            }

            // Index the words by spelling. Every word with the same spelling is kept so that
            // duplicate rows in a batch all receive the syllable, not just the first one.
            Dictionary<String, List<Word>> wordsByEnglish = new Dictionary<String, List<Word>>();
            foreach (Word w in wordList)
            {
                if (w == null || w.English == null)
                {
                    continue;
                }

                List<Word> sameSpelling;
                if (!wordsByEnglish.TryGetValue(w.English, out sameSpelling))
                {
                    sameSpelling = new List<Word>();
                    wordsByEnglish.Add(w.English, sameSpelling);
                }
                sameSpelling.Add(w);
            }

            foreach (String syllable in syllableList)
            {
                if (syllable == null)
                {
                    continue;
                }

                // "syl-la-ble" -> "syllable"
                String english = syllable.Replace("-", "");

                List<Word> matches;
                if (wordsByEnglish.TryGetValue(english, out matches))
                {
                    foreach (Word match in matches)
                    {
                        match.Syllable = syllable;
                    }
                }
            }
        }

        /// <summary>
        /// Writes the Syllable of every word in <paramref name="wordList"/> back to the database.
        /// </summary>
        /// <param name="wordList">Words of the current batch.</param>
        private void UpdateWordList(IList<Word> wordList)
        {
            if (wordList == null || wordList.Count <= 0)
            {
                return;
            }

            using (MySqlConnection sqlConn = new MySqlConnection(ConfigurationManager.ConnectionStrings["MySQL5"].ConnectionString))
            using (MySqlCommand sqlCmd = new MySqlCommand(Grabber.SQL_UpdateWordList, sqlConn))
            {
                // Placeholders; the real values are assigned per word inside the loop.
                sqlCmd.Parameters.AddWithValue("@syllable", "");
                sqlCmd.Parameters.AddWithValue("@id", "");

                sqlConn.Open();

                foreach (Word w in wordList)
                {
                    sqlCmd.Parameters["@id"].Value = w.ID;
                    // NOTE(review): words for which no syllable was found are written as well
                    // (Syllable may be unset here) - confirm that overwriting is intended.
                    sqlCmd.Parameters["@syllable"].Value = w.Syllable;
                    sqlCmd.ExecuteNonQuery();
                }
            }
        }

        /// <summary>
        /// Prints one line per word to the console: "Yes" when a syllable was found, "No" otherwise.
        /// </summary>
        /// <param name="wordList">Words of the current batch.</param>
        private void PrintResult(IList<Word> wordList)
        {
            if (wordList == null || wordList.Count <= 0)
            {
                return;
            }

            foreach (Word w in wordList)
            {
                Console.WriteLine("{0} => {1}", w.English, String.IsNullOrEmpty(w.Syllable) ? "No" : "Yes");
            }
        }
    }
}