using System;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Windows.Forms;

namespace SpiderMan
{
    /// <summary>
    /// Main window of the scraper: downloads a fixed range of listing pages on
    /// thread-pool threads, extracts the table rows with a regular expression
    /// and appends them to the log text box.
    /// </summary>
    public partial class Form1 : Form
    {
        /// <summary>
        /// Matches one table row and captures its nine cells
        /// (id, prov, city, city2, py, qh, yb, dj, bw).
        /// Built once and shared; Regex instances are safe for concurrent matching.
        /// </summary>
        private static readonly Regex rowPattern = new Regex(
            @"<tr[^<]*<td[^>]*>(?<id>\d*?)</td>[^>]*>(?<prov>\w*)</td>[^>]*>[^>]*>(?<city>\w*)</a></td>[^>]*>(?<city2>\w*)</td>[^>]*>(?<py>\w*)</td>[^>]*>(?<qh>\w*)</td>[^>]*>(?<yb>\w*)</td>[^>]*>(?<dj>[\d\.]*)</td>[^>]*>(?<bw>[\d\.]*)</td>[^>]*>",
            RegexOptions.Compiled);

        /// <summary>
        /// Shared source for the politeness delay. Random is not thread-safe,
        /// so every access takes a lock on this instance.
        /// </summary>
        private static readonly Random delayRandom = new Random();

        /// <summary>
        /// Number of queued page jobs that have not finished yet.
        /// </summary>
        private static int threadCount = 0;

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Queues one download/parse job per page, then keeps the UI responsive
        /// until every job has finished.
        /// </summary>
        /// <param name="sender">The control that raised the event; it is disabled while the run is active.</param>
        /// <param name="e">Unused.</param>
        private void button1_Click(object sender, EventArgs e)
        {
            const string urlPattern = "http://www.3464.com/data/zhongguochengshijingweidu/?PageNo={0}";
            const int pageFirstIndex = 1;
            const int pageLastIndex = 125;
            const int pollIntervalMs = 15;

            // Application.DoEvents() below lets the user click again; disabling the
            // trigger prevents a second, overlapping run on the shared counter.
            Control trigger = sender as Control;
            string originalTitle = this.Text;
            if (trigger != null)
            {
                trigger.Enabled = false;
            }

            try
            {
                for (int pageIndex = pageFirstIndex; pageIndex <= pageLastIndex; pageIndex++)
                {
                    string url = string.Format(urlPattern, pageIndex);
                    Log("開始讀取url:" + url);

                    // Count the job before it can possibly run, so the counter
                    // never dips below zero.
                    Interlocked.Increment(ref threadCount);
                    ThreadPool.QueueUserWorkItem(FetchAndParsePage, url);
                }

                // Workers report through Control.Invoke, so this thread must keep
                // pumping messages. The short sleep stops the loop from pinning a core.
                while (Interlocked.CompareExchange(ref threadCount, 0, 0) > 0)
                {
                    Application.DoEvents();
                    if (this.IsDisposed)
                    {
                        // The form was closed while jobs were still running.
                        return;
                    }

                    Loading();
                    Thread.Sleep(pollIntervalMs);
                }

                Log("數據採集結束");
            }
            finally
            {
                if (!this.IsDisposed)
                {
                    this.Text = originalTitle;
                }

                if (trigger != null && !trigger.IsDisposed)
                {
                    trigger.Enabled = true;
                }
            }
        }

        /// <summary>
        /// Thread-pool entry point: downloads one page, parses it, and always
        /// marks the job as finished.
        /// </summary>
        /// <param name="state">The page URL as a string.</param>
        private void FetchAndParsePage(object state)
        {
            string pageUrl = (string)state;
            try
            {
                string html = GetHttpSource(pageUrl);
                ParseHtml(html);
            }
            catch (Exception ex)
            {
                // An exception escaping a thread-pool callback terminates the
                // process, so failures are reported to the log instead.
                Log("讀取失敗:" + pageUrl + " (" + ex.Message + ")");
            }
            finally
            {
                // Must run even on failure, otherwise the wait loop never ends.
                Interlocked.Decrement(ref threadCount);
            }
        }

        #region ParseHtml

        /// <summary>
        /// Extracts the table rows from a downloaded page and logs them, one
        /// comma-separated line per row.
        /// </summary>
        /// <param name="html">Full page source; an empty page or one without the table markers is logged as a parse error.</param>
        private void ParseHtml(string html)
        {
            if (string.IsNullOrEmpty(html))
            {
                Log("解析錯誤:頁面內容為空");
                return;
            }

            // The table of interest runs from the header cell text to the next </table>.
            int beginPos = html.IndexOf("編號", StringComparison.Ordinal);
            int endPos = beginPos < 0
                ? -1
                : html.IndexOf("</table>", beginPos, StringComparison.Ordinal);
            if (endPos < 0)
            {
                Log("解析錯誤:找不到數據表格");
                return;
            }

            string partHtml = html.Substring(beginPos, endPos - beginPos);

            // Rows are batched so each page costs a single cross-thread Invoke.
            // Regex.Matches only yields successful matches, so no per-match check is needed.
            StringBuilder rows = new StringBuilder();
            foreach (Match m in rowPattern.Matches(partHtml))
            {
                if (rows.Length > 0)
                {
                    rows.Append(Environment.NewLine);
                }

                rows.AppendFormat(
                    "{0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}",
                    m.Groups["id"].Value,
                    m.Groups["prov"].Value,
                    m.Groups["city"].Value,
                    m.Groups["city2"].Value,
                    m.Groups["py"].Value,
                    m.Groups["qh"].Value,
                    m.Groups["yb"].Value,
                    m.Groups["dj"].Value,
                    m.Groups["bw"].Value);
            }

            if (rows.Length > 0)
            {
                Log(rows.ToString());
            }
        }

        #endregion

        /// <summary>
        /// Simple progress indicator: each call adds one dot after the title
        /// prefix and starts over once the title reaches its maximum length.
        /// </summary>
        private void Loading()
        {
            if (this.InvokeRequired)
            {
                this.Invoke(new MethodInvoker(Loading));
                return;
            }

            const string prefix = "採集中";
            const int maxLength = 100;
            int maxDots = maxLength - prefix.Length;

            // Recover the current dot count from the title; any other title
            // (for example the designer default) restarts the animation.
            string title = this.Text;
            int dots = 0;
            if (title.StartsWith(prefix, StringComparison.Ordinal))
            {
                string tail = title.Substring(prefix.Length);
                if (tail.Trim('.').Length == 0)
                {
                    dots = tail.Length;
                }
            }

            int nextDots = dots >= maxDots ? 0 : dots + 1;
            this.Text = prefix + new string('.', nextDots);
        }

        #region Log

        /// <summary>
        /// Appends a message and a line break to the log text box, marshalling
        /// to the UI thread when called from a worker thread.
        /// </summary>
        /// <param name="msg">Text to append.</param>
        private void Log(string msg)
        {
            if (this.IsDisposed || this.textBox1.IsDisposed)
            {
                // The form was closed during a run; nothing left to write to.
                return;
            }

            if (this.textBox1.InvokeRequired)
            {
                this.Invoke(new MethodInvoker(() => Log(msg)));
                return;
            }

            this.textBox1.AppendText(msg);
            this.textBox1.AppendText(System.Environment.NewLine);
        }

        #endregion

        #region GetHttpSource

        /// <summary>
        /// Downloads the source of a page after a short random delay.
        /// </summary>
        /// <param name="url">Address of the page to download.</param>
        /// <returns>The response body decoded with <see cref="Encoding.Default"/>.</returns>
        private string GetHttpSource(string url)
        {
            // Be gentle with the remote site: wait 100-499 ms before each request.
            int delayMs;
            lock (delayRandom)
            {
                delayMs = delayRandom.Next(100, 500);
            }

            Thread.Sleep(delayMs);

            // NOTE(review): Encoding.Default depends on the machine configuration;
            // presumably it matches the site's charset on the author's machine - confirm.
            using (var wc = new WebClient { Encoding = Encoding.Default })
            {
                // Kept from the original request setup.
                wc.Headers.Add("Content-Type", "application/x-www-form-urlencoded");
                return wc.DownloadString(url);
            }
        }

        #endregion
    }
}