名字有點大,其實就是我女友如今所在的這家公司的工作,要求她必須擁有一個很大的南京公司名單,而現狀是,她知道的公司屈指可數,更別說是她同事們不知道而她知道的公司了。可是她被要求每天至少找到一個他們公司數據裏面沒有的公司,因此她只能借助搜索引擎。
上面是做這個東西的緣由,起初我是想用網絡爬蟲來寫的,後來操作了個開頭,發現網絡爬蟲是個大項目,而且涉及到大數據的操作和數據是否最新等特點,覺得沒必要。加上時間緊急,她生怕完成不了任務被辭,哎,就快速地用業餘時間寫了一個小程序。
程序的思路很簡單,就是通過搜索引擎(以百度爲例)構造get型的url,然後在C#裏面生成一個request請求,去取得Response,然後從Response裏面取得dom,再用正則表達式從dom裏面取得自己想要的數據:一、與公司名稱匹配度高的字符串;二、跳轉到搜索結果下一頁的地址。將取得的全部公司名稱放進list中,然後遍歷去重,再與自己預定義的列表匹配,將不合法的公司名去掉。最後顯示出來,加上另存爲txt、雙擊複製等功能。當然,爲了可擴展性,我將取公司名稱的正則表達式開放給用戶,讓用戶(我女友?肯定不是啊!是我自己)可以盡可能地自定義。
好了,二話不說,上代碼:
1: using System;
2: using System.Collections.Generic;
3: using System.ComponentModel;
4: using System.Data;
5: using System.Drawing;
6: using System.IO;
7: using System.Linq;
8: using System.Net;
9: using System.Text;
10: using System.Text.RegularExpressions;
11: using System.Windows.Forms;
12: using System.Threading;
13: using System.Collections;
14:
15:
namespace SearchCompany
{
    /// <summary>
    /// Main window of the company-name harvester. For every search-result URL
    /// listed in <c>lstSite</c> it downloads the page, follows the Baidu
    /// "next page" link until none is found, extracts company names with the
    /// user-supplied regular expression and lists the distinct names in
    /// <c>lstCompany</c>.
    /// </summary>
    public partial class Form1 : Form
    {
        // UI update codes accepted by setFromThread.
        private const int UiStart = 1;     // disable buttons and show a status text
        private const int UiProgress = 2;  // show a status text
        private const int UiDone = 3;      // re-enable buttons
        private const int UiFill = 4;      // publish collected names and re-enable buttons
        private const int UiError = 5;     // append an error message to lblBug

        // Per-request timeout in milliseconds (same value the original code used).
        private const int RequestTimeoutMs = 3000;

        // How long the "copied" notice stays visible, in milliseconds.
        private const int CopyNoticeMs = 3000;

        // Compiled form of the user's company-name pattern; set by Search_Click.
        Regex rx;

        // Background worker that runs MakeCompany.
        Thread getCompany;

        /// <summary>Signature used to marshal <see cref="setFromThread"/> onto the UI thread.</summary>
        public delegate void MyInvoke(string str, int type);

        // Raw (possibly duplicated) names collected by the current search.
        List<String> lstCom = new List<String>();

        /// <summary>Matches that are too generic to be a real company name and are discarded.</summary>
        public List<String> lstFackName = new List<string>() { "公司", "有限公司", "廣告公司", "保險公司", "獵頭公司", "新公司", "舊公司" };

        /// <summary>Pattern for the Google "next page" link (assigned an empty string, never read here).</summary>
        public string googleNextPageRx;

        /// <summary>Pattern that extracts the relative URL of Baidu's "next page" link.</summary>
        public string baiduNextPageRx;

        /// <summary>Number of pages downloaded by the current search.</summary>
        public long pagenum;

        /// <summary>
        /// Builds the form and installs the default next-page and company-name patterns.
        /// </summary>
        public Form1()
        {
            InitializeComponent();
            baiduNextPageRx = "(?<=</a><a href=\")/s[^\"]*(?=\"\\sclass=\"n\">下一頁></a><span class=\"nums\" style=\"margin-left:120px\">)";
            googleNextPageRx = "";
            textBox1.Text = @"\b((\w)|(\(\w+?\)))+(?<=公司)";
        }

        /// <summary>
        /// Starts a new search on a background thread using the pattern in <c>textBox1</c>.
        /// </summary>
        private void Search_Click(object sender, EventArgs e)
        {
            lblShow.Text = "";
            lblBug.Text = "";
            lstCompany.Items.Clear();

            // The text box content is handed to Regex verbatim, so the user can
            // type any valid pattern. An invalid one is reported instead of
            // crashing the click handler.
            try
            {
                rx = new Regex(textBox1.Text.Trim());
            }
            catch (ArgumentException ex)
            {
                lblBug.Text = ex.Message;
                return;
            }

            // Start from a clean slate so results of an earlier search do not
            // reappear in the list that was just cleared.
            pagenum = 0;
            lstCom.Clear();

            // Disable right away; the worker would do it too, but only once it
            // has started running.
            Search.Enabled = false;
            AddCompany.Enabled = false;

            getCompany = new Thread(new ThreadStart(MakeCompany));
            getCompany.Name = "getPageAndComputeData";
            getCompany.IsBackground = true;
            getCompany.Start();
        }

        /// <summary>
        /// Updates the UI on behalf of the worker thread, re-dispatching itself
        /// through <c>Invoke</c> when called from a non-UI thread.
        /// </summary>
        /// <param name="lblstr">Text to display (status or error message, depending on <paramref name="type"/>).</param>
        /// <param name="type">
        /// Operation code: 1 = work started (disable buttons, show status),
        /// 2 = show status, 3 = finished (re-enable buttons),
        /// 4 = fill the result list from the collected names and re-enable buttons,
        /// 5 = append an error message.
        /// </param>
        public void setFromThread(string lblstr, int type)
        {
            if (lblShow.InvokeRequired)
            {
                MyInvoke marshal = new MyInvoke(setFromThread);
                this.Invoke(marshal, new object[] { lblstr, type });
                return;
            }

            switch (type)
            {
                case UiStart:
                    this.Search.Enabled = false;
                    this.AddCompany.Enabled = false;
                    this.lblShow.Text = lblstr;
                    break;

                case UiProgress:
                    this.lblShow.Text = lblstr;
                    break;

                case UiDone:
                    this.Search.Enabled = true;
                    this.AddCompany.Enabled = true;
                    break;

                case UiFill:
                    // lstCom holds the raw matches; publish each distinct name once.
                    lblShow.Text = "正在處理數據,請稍後";
                    foreach (String name in lstCom.Distinct<String>())
                    {
                        lstCompany.Items.Add(name);
                    }
                    lblShow.Text = String.Format("數據已經處理完成\n共處理{0}個頁面\n找到名稱不重複的公司:{1}家", pagenum.ToString(), lstCompany.Items.Count.ToString());
                    this.Search.Enabled = true;
                    this.AddCompany.Enabled = true;
                    break;

                case UiError:
                    lblBug.Text += lblstr;
                    break;
            }
        }

        /// <summary>
        /// Worker-thread entry point: downloads every listed URL plus its
        /// follow-up pages, extracts the company names and hands them to the UI.
        /// Any failure is shown in <c>lblBug</c> and the buttons are re-enabled.
        /// </summary>
        public void MakeCompany()
        {
            try
            {
                setFromThread("開始獲取頁面", UiStart);

                // Snapshot taken on the UI thread: the list box may be edited
                // while the download is running.
                List<string> urls = GetSiteUrls();
                Regex rxNextUrl = new Regex(baiduNextPageRx);
                StringBuilder sbPageString = new StringBuilder();

                foreach (string url in urls)
                {
                    // Only the first request of each URL carries a cookie
                    // container and the raised redirect limit, as before.
                    HttpWebRequest request = CreateRequest(url);
                    request.MaximumAutomaticRedirections = 500;
                    request.CookieContainer = new CookieContainer();

                    string pages = DownloadPage(request);
                    pagenum++;
                    sbPageString.Append(StripEmphasis(pages));
                    setFromThread(String.Format("已獲取{0}個頁面,正在嘗試獲取下一頁", pagenum.ToString()), UiProgress);

                    string oldNextUrl = url;
                    Match mcNextUrl = rxNextUrl.Match(pages);
                    while (mcNextUrl.Success)
                    {
                        string newNextUrl = "http://www.baidu.com" + mcNextUrl.Value;
                        if (oldNextUrl.Equals(newNextUrl))
                        {
                            // The page links to itself: stop instead of looping forever.
                            break;
                        }
                        oldNextUrl = newNextUrl;

                        pages = DownloadPage(CreateRequest(newNextUrl));
                        pagenum++;
                        sbPageString.Append(StripEmphasis(pages));
                        setFromThread(String.Format("已獲取{0}個頁面,正在嘗試獲取下一個頁面", pagenum.ToString()), UiProgress);

                        mcNextUrl = rxNextUrl.Match(pages);
                    }
                }

                setFromThread(String.Format("共獲取{0}個頁面的數據,正在對數據進行處理", pagenum.ToString()), UiProgress);

                string strPage = sbPageString.ToString().ToLower();
                foreach (Match tmc in rx.Matches(strPage))
                {
                    string str = tmc.Value.Trim();
                    // Skip empty matches (possible with permissive patterns) and
                    // the generic names on the block list.
                    if (str.Length > 0 && !lstFackName.Contains(str))
                    {
                        lstCom.Add(str);
                    }
                }

                setFromThread("", UiFill);
            }
            catch (Exception ex)
            {
                setFromThread(ex.Message, UiError);
                // Without this the buttons disabled at the start stay disabled
                // forever after a failed search.
                setFromThread("", UiDone);
            }
        }

        /// <summary>Returns a copy of the URLs in <c>lstSite</c>, read on the UI thread.</summary>
        private List<string> GetSiteUrls()
        {
            if (lstSite.InvokeRequired)
            {
                return (List<string>)this.Invoke(new Func<List<string>>(GetSiteUrls));
            }

            List<string> urls = new List<string>();
            foreach (string url in lstSite.Items)
            {
                urls.Add(url);
            }
            return urls;
        }

        /// <summary>Creates a request with the timeout and no-cache header used for every page.</summary>
        private static HttpWebRequest CreateRequest(string url)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Timeout = RequestTimeoutMs;
            request.Headers.Set("Pragma", "no-cache");
            return request;
        }

        /// <summary>Executes the request, reads the body as UTF-8 text and releases the connection.</summary>
        private static string DownloadPage(HttpWebRequest request)
        {
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream body = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(body, Encoding.UTF8))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>Removes the &lt;em&gt; tags found in the result pages so names match as plain text.</summary>
        private static string StripEmphasis(string html)
        {
            return html.Replace("<em>", "").Replace("</em>", "");
        }

        /// <summary>Adds the URL typed in <c>newURL</c> to the site list.</summary>
        private void AddCompany_Click(object sender, EventArgs e)
        {
            String strURL = newURL.Text.Trim();
            if (strURL.Length == 0)
            {
                return;
            }

            lstSite.Items.Add(strURL);
            newURL.Text = "";
            AddCompany.Enabled = false;
        }

        /// <summary>Enables the add button only while the URL box contains non-blank text.</summary>
        private void newURL_TextChanged(object sender, EventArgs e)
        {
            AddCompany.Enabled = newURL.Text.Trim().Length > 0;
        }

        /// <summary>Removes every selected entry from the site list.</summary>
        private void button1_Click(object sender, EventArgs e)
        {
            // Copy the selection first: SelectedItems shrinks as items are
            // removed, so indexing into it while removing skips entries.
            object[] selected = new object[lstSite.SelectedItems.Count];
            lstSite.SelectedItems.CopyTo(selected, 0);

            foreach (object item in selected)
            {
                lstSite.Items.Remove(item);
            }
        }

        /// <summary>Saves the listed company names, one per line, to a file chosen by the user.</summary>
        private void button2_Click(object sender, EventArgs e)
        {
            if (saveFileDialog1.ShowDialog() != DialogResult.OK)
            {
                return;
            }

            try
            {
                // using blocks flush and close the file even when a write fails.
                using (Stream fs = saveFileDialog1.OpenFile())
                using (StreamWriter sw = new StreamWriter(fs))
                {
                    for (int i = 0; i < lstCompany.Items.Count; i++)
                    {
                        sw.WriteLine(lstCompany.Items[i].ToString());
                    }
                }
                MessageBox.Show("文件保存成功");
            }
            catch (Exception ex)
            {
                // Format explicitly: MessageBox.Show(string, string) would treat
                // the second argument as the window caption.
                MessageBox.Show(String.Format("異常:\n{0}", ex.Message));
            }
        }

        /// <summary>Copies the double-clicked company name to the clipboard and shows a short notice.</summary>
        private void lstCompany_DoubleClick(object sender, EventArgs e)
        {
            // A double-click on an empty area leaves nothing selected.
            if (lstCompany.SelectedItems.Count == 0)
            {
                return;
            }

            string name = lstCompany.SelectedItems[0].ToString();
            if (name.Length == 0)
            {
                // Clipboard.SetText rejects empty text.
                return;
            }

            Clipboard.SetText(name);
            lblCopy.Text = "複製成功...";

            // Restart the countdown and make sure the handler is attached only
            // once, even when the user double-clicks again before it fires.
            timer1.Stop();
            timer1.Tick -= lblCopyClear;
            timer1.Tick += lblCopyClear;
            timer1.Interval = CopyNoticeMs;
            timer1.Start();
        }

        /// <summary>Timer callback that hides the "copied" notice and stops the timer.</summary>
        public void lblCopyClear(object sender, EventArgs e)
        {
            lblCopy.Text = "";
            timer1.Tick -= lblCopyClear;
            timer1.Stop();
        }

    }
}