最近,女朋友的妹妹要去網上找房產中介人信息用於招聘,自己去網上一個一個找太慢。我女朋友知道我是搞IT的,就請教我有沒有辦法幫她快速找電話號碼,於是我就想到了爬蟲程序。然而普通的爬蟲代碼有限制,恰好自己在搞自動化測試,就想到用WebDriver自動化測試工具編寫一個爬蟲工具,抓取趕集網上的經紀人信息。
自己一直用的是Java代碼寫的自動化測試腳本,但是對於不會編程的人來說,沒有界面操作很不方便,於是我就想到了編寫一個桌面程序。可是Java的GUI又不美觀,最後就想到微軟的C#,於是自學VS2010,自學.NET,用C#結合WebDriver的C#版本編寫了一個小工具。
支持使用火狐瀏覽器和PhantomJS內存瀏覽器進行抓取。
將查詢出的數據顯示在界面上。
支持導出Excel文件。
基本的實現原理就是:先計算出頁碼,然後循環一頁一頁抓取數據;抓取出的數據先臨時存儲在List數據類型中,再存放到界面上的數據控件;點擊導出Excel的時候,把數據控件的數據轉換成List,再導出到Excel。
界面如下:
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using OpenQA.Selenium;
using OpenQA.Selenium.Firefox;
using OpenQA.Selenium.Interactions;
using OpenQA.Selenium.Interactions.Internal;
using OpenQA.Selenium.Support;
using System.Threading;
using Microsoft.Office;
using Excel;
using System.Drawing;
using OpenQA.Selenium.PhantomJS;
using OpenQA.Selenium.Chrome;
using OpenQA.Selenium.IE;
using OpenQA.Selenium.Remote;
using System.Timers;

namespace tel_search
{
    /// <summary>
    /// Main window of the scraper. It drives a browser through Selenium WebDriver to
    /// collect agent business cards from the ganji.com agent listing of the selected
    /// region, shows them in <c>listView1</c>, and can export the list view to an
    /// Excel workbook.
    /// </summary>
    public partial class Form1 : Form
    {
        /// <summary>Firefox installer URL opened when Firefox cannot be started.</summary>
        private const string FirefoxDownloadUrl = "http://download.firefox.com.cn/releases/stub/official/zh-CN/Firefox-latest.exe";

        /// <summary>Base URL of the agent listing; the region slug and a trailing slash are appended.</summary>
        private const string AgentListBaseUrl = "http://sz.ganji.com/zufang/agent/";

        /// <summary>Pagination links, excluding the "next" and "prev" links.</summary>
        private const string PageLinksXPath = "//div[@class='pageBox']/ul[@class='pageLink clearfix']/li/a[not(@class='next'or@class='prev')]";

        /// <summary>The "next page" button inside the pagination box.</summary>
        private const string NextPageXPath = "//div[@class='pageBox']/ul[@class='pageLink clearfix']/li/a[@class='next']/span";

        /// <summary>One element per business card on a listing page.</summary>
        private const string CardsXPath = "//div[@class='listBox']/ul/li/div[@class='list-mod2']";

        /// <summary>Caption shared by the informational message boxes.</summary>
        private const string DialogTitle = "舒適提示";

        /// <summary>
        /// Number of leading characters removed from each service line. The original
        /// author's note says this strips the field label at the start of the text.
        /// </summary>
        private const int LabelPrefixLength = 6;

        /// <summary>Highest pagination-link index that is clicked; later pages reuse this index.</summary>
        private const int MaxPageLinkIndex = 5;

        /// <summary>Maps the region names offered in <c>listBox1</c> to their URL path segment.</summary>
        private static readonly Dictionary<string, string> RegionSlugs = new Dictionary<string, string>
        {
            { "全深圳", "" },
            { "福田", "futian" },
            { "羅湖", "luohu" },
            { "南山", "nanshan" },
            { "寶安", "baoan" },
            { "龍崗", "longgang" },
            { "鹽田", "yantian" },
            { "龍華新區", "longhuaxinqu" },
            { "光明新區", "guangmingxinqu" },
            { "坪山新區", "pingshanxinqu" },
            { "大鵬新區", "dapengxinqu" },
            { "深圳周邊", "shenzhenzhoubian" }
        };

        // Number of 100 ms timer ticks counted during the current search.
        static int time_js = 0;

        // Timer that increments time_js while a search is running.
        static System.Timers.Timer timer;

        public Form1()
        {
            InitializeComponent();
        }

        private void label1_Click(object sender, EventArgs e)
        {
        }

        private void label2_Click(object sender, EventArgs e)
        {
        }

        private void label3_Click(object sender, EventArgs e)
        {
        }

        /// <summary>Selects the PhantomJS option by default when the form loads.</summary>
        private void Form1_Load(object sender, EventArgs e)
        {
            radioButton_phatom.Checked = true;
        }

        /// <summary>
        /// Runs a search for the selected region: starts the browser, walks every
        /// listing page, appends the scraped cards to <c>listView1</c> and reports the
        /// page count, card count and elapsed time.
        /// </summary>
        private void button1_Click(object sender, EventArgs e)
        {
            if (listBox1.SelectedItems.Count == 0)
            {
                DialogResult noRegionAnswer = MessageBox.Show("請選擇地區", DialogTitle, MessageBoxButtons.OK);
                if (noRegionAnswer == DialogResult.OK)
                {
                    listBox1.Focus();
                }
                return;
            }

            // An OK-only message box always yields DialogResult.OK, so there is nothing to branch on.
            MessageBox.Show("數據比較多,可能須要幾分鐘時間抓取,請耐心等待!運行過程當中,請不要關閉瀏覽器窗口,不然程序會報錯!", DialogTitle, MessageBoxButtons.OK);

            string regionName = listBox1.SelectedItem.ToString();
            List<string[]> cards = new List<string[]>();
            int pageCount = 0;
            IWebDriver driver = null;

            StartTimer();
            try
            {
                driver = CreateDriver();
                if (driver == null)
                {
                    return;
                }

                driver.Manage().Window.Maximize();

                string listUrl = BuildRegionUrl(regionName);
                try
                {
                    driver.Navigate().GoToUrl(listUrl);
                }
                catch (Exception)
                {
                    // Close the browser before telling the user, as the original flow did.
                    driver.Quit();
                    driver = null;
                    MessageBox.Show("打不開網頁,您的網絡可能有問題,請檢查網絡鏈接是否正常!", DialogTitle);
                    return;
                }

                pageCount = CountPages(driver);

                // Counting the pages leaves the browser on the last page; go back to the first one.
                driver.Navigate().GoToUrl(listUrl);

                ScrapePages(driver, regionName, pageCount, cards);
                ShowCards(cards);
            }
            finally
            {
                // Always release the browser and stop the clock, including on early
                // returns and on exceptions thrown while scraping.
                try
                {
                    if (driver != null)
                    {
                        driver.Quit();
                    }
                }
                finally
                {
                    StopTimer();
                }
            }

            ShowElapsedTime();
            MessageBox.Show("謝謝你的耐心等待" + "成功抓取" + pageCount + "頁" + cards.Count + "條名片信息,用時:" + time_lbl.Text, "恭喜");
        }

        /// <summary>Resets the tick counter and starts the 100 ms timer for a new search.</summary>
        private void StartTimer()
        {
            // Without this reset the reported time accumulates over successive searches.
            time_js = 0;
            timer = new System.Timers.Timer(100);
            timer.Elapsed += new System.Timers.ElapsedEventHandler(OnTimedEvent);
            timer.AutoReset = true;
            timer.Enabled = true;
        }

        /// <summary>Stops and disposes the search timer if one was started.</summary>
        private static void StopTimer()
        {
            if (timer != null)
            {
                timer.Enabled = false;
                timer.Dispose();
            }
        }

        /// <summary>
        /// Creates the browser chosen by the radio buttons: Firefox when its option is
        /// checked, otherwise PhantomJS (the option selected when the form loads).
        /// </summary>
        /// <returns>
        /// The started driver, or <c>null</c> when Firefox could not be started and
        /// the user has been offered the download instead.
        /// </returns>
        private IWebDriver CreateDriver()
        {
            if (!radioButton_firefox.Checked)
            {
                return new PhantomJSDriver();
            }

            try
            {
                FirefoxProfile profile = new FirefoxProfile();
                profile.SetPreference("browser.bookmarks.restore_default_bookmarks", false);
                return new FirefoxDriver(profile);
            }
            catch (Exception)
            {
                DialogResult downloadAnswer = MessageBox.Show("你的電腦沒有安裝火狐瀏覽器,是否當即下載", DialogTitle, MessageBoxButtons.YesNo);
                if (downloadAnswer == DialogResult.Yes)
                {
                    listBox1.Focus();
                    System.Diagnostics.Process.Start(FirefoxDownloadUrl);
                }

                // Either answer aborts the search.
                return null;
            }
        }

        /// <summary>Builds the listing URL for a region name shown in <c>listBox1</c>.</summary>
        /// <param name="regionName">Display name of the selected region.</param>
        /// <returns>The listing URL; an unknown name yields an empty path segment.</returns>
        private static string BuildRegionUrl(string regionName)
        {
            string slug;
            if (!RegionSlugs.TryGetValue(regionName, out slug))
            {
                slug = string.Empty;
            }
            return AgentListBaseUrl + slug + "/";
        }

        /// <summary>
        /// Determines how many listing pages exist by repeatedly clicking the last
        /// visible page link until no "next page" button is found, then reading the
        /// number shown on that last link.
        /// </summary>
        /// <param name="driver">Browser already showing the first listing page.</param>
        /// <returns>The page count, or 1 when the page shows no pagination links.</returns>
        private static int CountPages(IWebDriver driver)
        {
            IList<IWebElement> nextButtons;
            IList<IWebElement> pageLinks;
            do
            {
                // FindElements returns an empty collection (never null) when nothing matches,
                // so the loop condition checks Count.
                nextButtons = driver.FindElements(By.XPath(NextPageXPath));
                pageLinks = driver.FindElements(By.XPath(PageLinksXPath));
                if (pageLinks.Count == 0)
                {
                    return 1;
                }
                pageLinks[pageLinks.Count - 1].Click();
            }
            while (nextButtons.Count != 0);

            return Convert.ToInt32(pageLinks[pageLinks.Count - 1].Text);
        }

        /// <summary>Visits each listing page in turn and appends every card found to <paramref name="cards"/>.</summary>
        /// <param name="driver">Browser showing the first listing page.</param>
        /// <param name="regionName">Region name stored in the first column of every card.</param>
        /// <param name="pageCount">Number of pages to visit.</param>
        /// <param name="cards">Receives one seven-element row per card.</param>
        private static void ScrapePages(IWebDriver driver, string regionName, int pageCount, List<string[]> cards)
        {
            driver.Manage().Timeouts().ImplicitlyWait(new TimeSpan(0, 0, 1));

            for (int pageIndex = 0; pageIndex < pageCount; pageIndex++)
            {
                // Elements must be looked up again after every page change.
                IList<IWebElement> pageLinks = driver.FindElements(By.XPath(PageLinksXPath));

                string pageText = "1";
                if (pageLinks.Count > 0)
                {
                    IWebElement pageLink = pageLinks[Math.Min(pageIndex, MaxPageLinkIndex)];
                    pageText = pageLink.Text;
                    pageLink.Click();
                }

                foreach (IWebElement card in driver.FindElements(By.XPath(CardsXPath)))
                {
                    cards.Add(ReadCard(card, regionName, pageText));
                }
            }
        }

        /// <summary>Reads one business card element into a row.</summary>
        /// <returns>
        /// Seven strings: region, agent name, phone, then the three service lines
        /// (named company, service area and service estate in the original code),
        /// and the page number text.
        /// </returns>
        private static string[] ReadCard(IWebElement card, string regionName, string pageText)
        {
            IWebElement name = card.FindElement(By.XPath("div[@class='broker-info']/span[@class='broker-name']/a"));
            IWebElement tel = card.FindElement(By.XPath("div[@class='broker-info']/span[@class='broker-tel']"));
            IList<IWebElement> serviceLines = card.FindElement(By.XPath("div[@class='broker-service']")).FindElements(By.TagName("p"));

            return new string[]
            {
                regionName,
                name.Text,
                tel.Text,
                ServiceField(serviceLines, 0),
                ServiceField(serviceLines, 1),
                ServiceField(serviceLines, 2),
                pageText
            };
        }

        /// <summary>
        /// Returns the text of the service line at <paramref name="index"/> without its
        /// leading label characters, or an empty string when the line is missing or
        /// no longer than the label.
        /// </summary>
        private static string ServiceField(IList<IWebElement> serviceLines, int index)
        {
            if (index >= serviceLines.Count)
            {
                return string.Empty;
            }

            string text = serviceLines[index].Text;
            if (text == null || text.Length <= LabelPrefixLength)
            {
                return string.Empty;
            }
            return text.Substring(LabelPrefixLength);
        }

        /// <summary>Sets the column widths and appends the scraped cards to <c>listView1</c>.</summary>
        private void ShowCards(List<string[]> cards)
        {
            col_header1.Width = 30;
            col_header2.Width = 45;
            col_header3.Width = 100;
            col_header4.Width = 90;
            col_header5.Width = 350;
            col_header6.Width = 115;
            col_header7.Width = 465;
            col_header8.Width = 20;

            listView1.GridLines = true;
            listView1.MultiSelect = true;

            listView1.BeginUpdate();
            try
            {
                foreach (string[] card in cards)
                {
                    // Existing rows are kept between searches, so numbering continues
                    // after them instead of restarting at 1.
                    ListViewItem row = new ListViewItem((listView1.Items.Count + 1).ToString());
                    row.SubItems.AddRange(card);
                    listView1.Items.Add(row);
                }
            }
            finally
            {
                listView1.EndUpdate();
            }

            listView1.LabelEdit = true;
            listView1.FullRowSelect = true;
        }

        /// <summary>Shows the elapsed search time on <c>time_lbl</c> as minutes and seconds.</summary>
        private void ShowElapsedTime()
        {
            // The timer ticks every 100 ms, so ten ticks make one second.
            int totalSeconds = time_js / 10;
            time_lbl.Visible = true;
            time_lbl.Text = (totalSeconds / 60).ToString() + "分" + (totalSeconds % 60).ToString() + "秒";
        }

        private void listView1_SelectedIndexChanged(object sender, EventArgs e)
        {
        }

        /// <summary>Starts the Excel export.</summary>
        private void button4_Click(object sender, EventArgs e)
        {
            turntoexcel();
        }

        /// <summary>Asks for a target .xls file and exports <c>listView1</c> to it.</summary>
        private void turntoexcel()
        {
            using (SaveFileDialog sfd = new SaveFileDialog())
            {
                sfd.DefaultExt = "xls";
                sfd.Filter = "Excel文件(*.xls)|*.xls";
                if (sfd.ShowDialog() == DialogResult.OK)
                {
                    DoExport(listView1, sfd.FileName);
                }
            }
        }

        /// <summary>
        /// Writes the column headers and all rows of <paramref name="listView"/> into a
        /// new Excel workbook and saves it as <paramref name="strFileName"/>.
        /// </summary>
        /// <param name="listView">List view whose columns and items are exported.</param>
        /// <param name="strFileName">Target file path; nothing happens when it is null or empty.</param>
        private void DoExport(ListView listView, string strFileName)
        {
            if (listView.Items.Count == 0)
            {
                MessageBox.Show("沒有數據,沒法導出!");
                return;
            }
            if (string.IsNullOrEmpty(strFileName))
            {
                return;
            }

            Microsoft.Office.Interop.Excel.Application xlApp;
            try
            {
                xlApp = new Microsoft.Office.Interop.Excel.ApplicationClass();
            }
            catch (System.Runtime.InteropServices.COMException)
            {
                // Creating the COM object throws rather than returning null when Excel is unavailable.
                MessageBox.Show("沒法建立excel對象,可能您的系統沒有安裝excel");
                return;
            }

            Microsoft.Office.Interop.Excel.Workbook xlBook = null;
            try
            {
                xlApp.DefaultFilePath = "";
                xlApp.DisplayAlerts = true;
                xlApp.SheetsInNewWorkbook = 1;
                xlBook = xlApp.Workbooks.Add(true);

                // Row 1: the list view's column headers.
                int rowIndex = 1;
                int columnIndex = 0;
                foreach (ColumnHeader header in listView.Columns)
                {
                    columnIndex++;
                    xlApp.Cells[rowIndex, columnIndex] = header.Text;
                }

                // Following rows: one per list view item.
                foreach (ListViewItem item in listView.Items)
                {
                    rowIndex++;
                    columnIndex = 0;
                    foreach (ListViewItem.ListViewSubItem cell in item.SubItems)
                    {
                        columnIndex++;
                        // The trailing tab keeps long numbers from being shown in scientific notation.
                        xlApp.Cells[rowIndex, columnIndex] = cell.Text + "\t";
                    }
                }

                // xlWorkbookNormal is used because, per the original author's note, the
                // xlExcel9795 format fails with HRESULT 0x800A03EC on Excel 2003/2007.
                xlBook.SaveAs(strFileName, Microsoft.Office.Interop.Excel.XlFileFormat.xlWorkbookNormal,
                    Type.Missing, Type.Missing, Type.Missing, Type.Missing,
                    Microsoft.Office.Interop.Excel.XlSaveAsAccessMode.xlExclusive,
                    Type.Missing, Type.Missing, Type.Missing, Type.Missing, Type.Missing);
            }
            finally
            {
                // Setting the references to null does not end the Excel process;
                // close, quit and release the COM objects explicitly.
                if (xlBook != null)
                {
                    xlBook.Close(false, Type.Missing, Type.Missing);
                    System.Runtime.InteropServices.Marshal.ReleaseComObject(xlBook);
                }
                xlApp.Quit();
                System.Runtime.InteropServices.Marshal.ReleaseComObject(xlApp);
            }

            MessageBox.Show("恭喜導出成功!");
        }

        /// <summary>Background work handler: sleeps for the number of milliseconds passed as the argument.</summary>
        private void backgroundWorker1_DoWork(object sender, DoWorkEventArgs e)
        {
            int input = (int)e.Argument;
            Thread.Sleep(input);
        }

        private void textBox1_TextChanged(object sender, EventArgs e)
        {
        }

        /// <summary>Opens the Firefox download URL with the system's default handler.</summary>
        private void linkLabel1_LinkClicked(object sender, LinkLabelLinkClickedEventArgs e)
        {
            linkLabel1.Links[0].LinkData = FirefoxDownloadUrl;
            String URL = linkLabel1.Links[0].LinkData.ToString();
            System.Diagnostics.Process.Start(URL);
        }

        /// <summary>Timer callback: counts one tick while the search timer is enabled.</summary>
        private void OnTimedEvent(Object source, ElapsedEventArgs e)
        {
            if (timer.Enabled == true)
            {
                time_js++;
            }
        }
    }
}