用C#.NET 與Webdriver寫的抓取網頁信息的小工具

    最近,女朋友的妹妹要去網上找房產中介人信息用於招聘,自己去網上一個一個找太慢,我女朋友知道我是搞IT的,就請教我有沒有辦法幫她快速找電話號碼,於是我就想到了爬蟲程序。然而普通的爬蟲代碼有限制,恰好自己在搞自動化測試,就想到用WebDriver自動化測試工具編寫一個爬蟲工具,抓取趕集網上的經紀人信息。

    自己一直用的是Java代碼寫的自動化測試腳本,可是對於不會編程的人,沒有界面操作很不方便,於是我就想到了編寫一個桌面程序,可是Java的GUI又不美觀,最後就想到微軟的C#,於是自學VS2010,自學.NET,用C#結合WebDriver的C#版本編寫了一個小工具。

 

支持火狐瀏覽器、PhantomJS內存瀏覽器進行抓取。

將查詢出的數據放在界面上

支持導出Excel文件

基本的實現原理就是:先計算出頁碼,然後循環一頁一頁抓取數據,抓取出的數據先臨時存儲在List數據類型中,再存放到界面上的數據控件;點擊導出Excel的時候,把數據控件的數據轉換成List,再導出到Excel。

界面如下:

  

 

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using OpenQA.Selenium;
using OpenQA.Selenium.Firefox;
using OpenQA.Selenium.Interactions;
using OpenQA.Selenium.Interactions.Internal;
using OpenQA.Selenium.Support;
using System.Threading;
using Microsoft.Office;
using Excel;
using System.Drawing;
using OpenQA.Selenium.PhantomJS;
using OpenQA.Selenium.Chrome;
using OpenQA.Selenium.IE;
using OpenQA.Selenium.Remote;
using System.Timers;

namespace tel_search
{
    public partial class Form1 : Form
    {
        // Elapsed-time tracking for a search run.
        static int time_js = 0;// tick counter; OnTimedEvent increments it once per timer tick while the timer is enabled
        static System.Timers.Timer timer;
        
        // Creates the form and runs InitializeComponent(), which is defined in another part of this partial class.
        public Form1()
        {
            InitializeComponent();
        }

        // Intentionally empty click handler.
        // NOTE(review): presumably still wired to label1 in the designer file (not visible here) - confirm before removing.
        private void label1_Click(object sender, EventArgs e)
        {

        }

        // Intentionally empty click handler.
        // NOTE(review): presumably still wired to label2 in the designer file (not visible here) - confirm before removing.
        private void label2_Click(object sender, EventArgs e)
        {

        }

        // Intentionally empty click handler.
        // NOTE(review): presumably still wired to label3 in the designer file (not visible here) - confirm before removing.
        private void label3_Click(object sender, EventArgs e)
        {

        }

        // Load handler: checks the PhantomJS radio button so that a browser choice is
        // already selected when the search button is first clicked.
        private void Form1_Load(object sender, EventArgs e)
        {
            radioButton_phatom.Checked = true;
        }

        // Search button handler.
        //
        // Scrapes the ganji.com agent listing for the district selected in listBox1 and
        // appends one row per broker card to listView1. The whole scrape runs synchronously
        // inside this handler, so the form does not repaint until it finishes.
        //
        // Steps:
        //   1. Require a district selection and show the "this may take a while" notice.
        //   2. Start the elapsed-time timer and create the browser chosen by the radio buttons.
        //   3. Open the listing, discover the total page count, then visit every page and
        //      collect the broker cards.
        //   4. Fill listView1, release the browser and the timer, and report the totals.
        //
        // The browser and the timer are released in a finally block, so they are cleaned up
        // on every exit path (early return or exception). Exceptions raised while scraping
        // are not swallowed; they still propagate to the caller.
        private void button1_Click(object sender, EventArgs e)
        {
            if (listBox1.SelectedItems.Count == 0)
            {
                DialogResult noDistrictResult = MessageBox.Show("請選擇地區", "舒適提示", MessageBoxButtons.OK);
                if (noDistrictResult == DialogResult.OK)
                {
                    listBox1.Focus();
                }
                return;
            }

            DialogResult noticeResult = MessageBox.Show("數據比較多,可能須要幾分鐘時間抓取,請耐心等待!運行過程當中,請不要關閉瀏覽器窗口,不然程序會報錯!", "舒適提示", MessageBoxButtons.OK);
            if (noticeResult != DialogResult.OK)
            {
                // No browser is created unless the notice was acknowledged, so there is nothing to do.
                return;
            }

            string districtName = listBox1.SelectedItem.ToString();
            string listingUrl = "http://sz.ganji.com/zufang/agent/" + GetDistrictSlug(districtName) + "/";
            List<string[]> brokerRows = new List<string[]>();
            int pageTotal = 0;
            IWebDriver driver = null;

            // Restart the tick counter; it is static and would otherwise keep the ticks of earlier searches.
            time_js = 0;
            timer = new System.Timers.Timer(100);
            timer.Elapsed += new System.Timers.ElapsedEventHandler(OnTimedEvent);
            timer.AutoReset = true;
            timer.Enabled = true;

            try
            {
                driver = CreateSelectedDriver();
                if (driver == null)
                {
                    // The helper has already told the user why no browser could be started.
                    return;
                }

                driver.Manage().Window.Maximize();

                try
                {
                    driver.Navigate().GoToUrl(listingUrl);
                }
                catch (Exception)
                {
                    // Close the browser before showing the message, as the handler always did.
                    driver.Quit();
                    driver = null;
                    MessageBox.Show("打不開網頁,您的網絡可能有問題,請檢查網絡鏈接是否正常!", "舒適提示");
                    return;
                }

                pageTotal = CountResultPages(driver);

                // Counting the pages leaves the browser on the last page, so reload the first page.
                driver.Navigate().GoToUrl(listingUrl);

                for (int pageIndex = 0; pageIndex < pageTotal; pageIndex++)
                {
                    // The page links must be looked up again after every click.
                    driver.Manage().Timeouts().ImplicitlyWait(new TimeSpan(0, 0, 1));
                    IList<IWebElement> pageLinks = driver.FindElements(By.XPath(PageLinkXPath));

                    string pageLabel;
                    if (pageLinks.Count == 0 && pageTotal == 1)
                    {
                        // Listing without a pagination bar: the page that is already loaded is the only one.
                        pageLabel = "1";
                    }
                    else
                    {
                        // The first six pages are addressed by their own link position; after that the
                        // link at index 5 is clicked every time.
                        // NOTE(review): this assumes the site's pagination bar slides so that index 5 is
                        // always the next page - confirm against the live markup.
                        IWebElement pageLink = pageLinks[Math.Min(pageIndex, SlidingPageLinkIndex)];
                        pageLabel = pageLink.Text;
                        pageLink.Click();
                    }

                    driver.Manage().Timeouts().ImplicitlyWait(new TimeSpan(0, 0, 1));
                    AppendBrokerRows(driver, districtName, pageLabel, brokerRows);
                }

                ShowBrokerRows(brokerRows);
            }
            finally
            {
                try
                {
                    if (driver != null)
                    {
                        driver.Quit();
                    }
                }
                finally
                {
                    // Stop timing even if closing the browser failed.
                    timer.Enabled = false;
                    timer.Dispose();
                }
            }

            // Each tick is 100 ms, so ten ticks make one second.
            int totalSeconds = time_js / 10;
            int minutes = totalSeconds / 60;
            int seconds = totalSeconds % 60;
            time_lbl.Visible = true;
            time_lbl.Text = minutes.ToString() + "分" + seconds.ToString() + "秒";

            MessageBox.Show("謝謝你的耐心等待" + "成功抓取" + pageTotal + "頁" + brokerRows.Count + "條名片信息,用時:" + time_lbl.Text, "恭喜");
        }

        // Numbered page links of the pagination bar (everything except the prev/next arrows).
        private const string PageLinkXPath = "//div[@class='pageBox']/ul[@class='pageLink clearfix']/li/a[not(@class='next'or@class='prev')]";

        // The "next page" arrow of the pagination bar.
        private const string NextPageButtonXPath = "//div[@class='pageBox']/ul[@class='pageLink clearfix']/li/a[@class='next']/span";

        // One element per broker card on a listing page.
        private const string BrokerCardXPath = "//div[@class='listBox']/ul/li/div[@class='list-mod2']";

        // Download offered when Firefox cannot be started.
        private const string FirefoxDownloadUrl = "http://download.firefox.com.cn/releases/stub/official/zh-CN/Firefox-latest.exe";

        // Number of leading characters removed from each broker-service line.
        // NOTE(review): presumably the field label plus its separator - confirm against the live markup.
        private const int ServiceLabelLength = 6;

        // Highest page-link index clicked while walking the result pages.
        private const int SlidingPageLinkIndex = 5;

        // Maps a district name shown in listBox1 to the path segment used in the listing URL.
        // Returns an empty string for the whole-city entry and for any unknown name.
        private static string GetDistrictSlug(string districtName)
        {
            switch (districtName)
            {
                case "福田": return "futian";
                case "羅湖": return "luohu";
                case "南山": return "nanshan";
                case "寶安": return "baoan";
                case "龍崗": return "longgang";
                case "鹽田": return "yantian";
                case "龍華新區": return "longhuaxinqu";
                case "光明新區": return "guangmingxinqu";
                case "坪山新區": return "pingshanxinqu";
                case "大鵬新區": return "dapengxinqu";
                case "深圳周邊": return "shenzhenzhoubian";
                case "全深圳":
                default:
                    return "";
            }
        }

        // Creates the browser selected by the radio buttons (Firefox wins if both are checked).
        // Returns null, after informing the user, when Firefox cannot be started or when no
        // browser is selected. A failure to start PhantomJS is not caught here.
        private IWebDriver CreateSelectedDriver()
        {
            if (radioButton_firefox.Checked)
            {
                try
                {
                    FirefoxProfile profile = new FirefoxProfile();
                    profile.SetPreference("browser.bookmarks.restore_default_bookmarks", false);
                    return new FirefoxDriver(profile);
                }
                catch (Exception)
                {
                    // Any start-up failure is reported as "Firefox is not installed".
                    DialogResult downloadChoice = MessageBox.Show("你的電腦沒有安裝火狐瀏覽器,是否當即下載", "舒適提示", MessageBoxButtons.YesNo);
                    if (downloadChoice == DialogResult.Yes)
                    {
                        listBox1.Focus();
                        System.Diagnostics.Process.Start(FirefoxDownloadUrl);
                    }
                    return null;
                }
            }

            if (radioButton_phatom.Checked)
            {
                return new PhantomJSDriver();
            }

            MessageBox.Show("請選擇瀏覽器", "舒適提示", MessageBoxButtons.OK);
            return null;
        }

        // Finds the number of result pages by repeatedly clicking the last numbered page link
        // until the "next page" arrow is no longer present, then parsing that link's text.
        // Returns 1 when the listing shows no numbered page links at all.
        private static int CountResultPages(IWebDriver driver)
        {
            string lastPageText = null;
            bool nextButtonWasPresent;
            do
            {
                // Test Count, not null: FindElements returns an empty collection when nothing matches.
                nextButtonWasPresent = driver.FindElements(By.XPath(NextPageButtonXPath)).Count != 0;

                IList<IWebElement> pageLinks = driver.FindElements(By.XPath(PageLinkXPath));
                if (pageLinks.Count == 0 && lastPageText == null)
                {
                    return 1;
                }

                IWebElement lastLink = pageLinks[pageLinks.Count - 1];
                // Read the text before clicking; the element may no longer be usable once the page reloads.
                lastPageText = lastLink.Text;
                lastLink.Click();
            } while (nextButtonWasPresent);

            return int.Parse(lastPageText, System.Globalization.CultureInfo.InvariantCulture);
        }

        // Reads every broker card on the page currently loaded in the driver and appends one
        // seven-field row per card: district, name, phone, company, service area,
        // service neighbourhood, page label.
        private static void AppendBrokerRows(IWebDriver driver, string districtName, string pageLabel, List<string[]> rows)
        {
            foreach (IWebElement card in driver.FindElements(By.XPath(BrokerCardXPath)))
            {
                string name = card.FindElement(By.XPath("div[@class='broker-info']/span[@class='broker-name']/a")).Text;
                string phone = card.FindElement(By.XPath("div[@class='broker-info']/span[@class='broker-tel']")).Text;

                // Look the service block up once and reuse it for all three lines.
                IList<IWebElement> serviceLines = card.FindElement(By.XPath("div[@class='broker-service']")).FindElements(By.TagName("p"));

                rows.Add(new string[]
                {
                    districtName,
                    name,
                    phone,
                    ReadServiceLine(serviceLines, 0),
                    ReadServiceLine(serviceLines, 1),
                    ReadServiceLine(serviceLines, 2),
                    pageLabel
                });
            }
        }

        // Returns the text of the service line at the given index without its leading label.
        // A missing line, or a line no longer than the label, yields an empty string instead of throwing.
        private static string ReadServiceLine(IList<IWebElement> serviceLines, int index)
        {
            if (index >= serviceLines.Count)
            {
                return string.Empty;
            }

            string text = serviceLines[index].Text;
            return text.Length > ServiceLabelLength ? text.Substring(ServiceLabelLength) : string.Empty;
        }

        // Sets the column widths and appends the scraped rows to listView1.
        // Existing items are kept; the first column restarts at 1 for each batch.
        private void ShowBrokerRows(List<string[]> rows)
        {
            col_header1.Width = 30;
            col_header2.Width = 45;
            col_header3.Width = 100;
            col_header4.Width = 90;
            col_header5.Width = 350;
            col_header6.Width = 115;
            col_header7.Width = 465;
            col_header8.Width = 20;

            listView1.GridLines = true;
            listView1.MultiSelect = true;
            listView1.BeginUpdate();
            for (int rowIndex = 0; rowIndex < rows.Count; rowIndex++)
            {
                // First column: running number within this batch.
                ListViewItem item = new ListViewItem();
                item.Text = (rowIndex + 1).ToString();
                foreach (string field in rows[rowIndex])
                {
                    item.SubItems.Add(field);
                }
                listView1.Items.Add(item);
            }
            listView1.EndUpdate();
            listView1.LabelEdit = true;
            listView1.FullRowSelect = true;
        }

        // Intentionally empty selection-changed handler.
        // NOTE(review): presumably still wired to listView1 in the designer file (not visible here) - confirm before removing.
        private void listView1_SelectedIndexChanged(object sender, EventArgs e)
        {

        }

        // Click handler that starts the Excel export by calling turntoexcel().
        private void button4_Click(object sender, EventArgs e)
        {
            turntoexcel();
        }
        // Asks the user for a target .xls file and, if the dialog is confirmed,
        // exports the contents of listView1 to that file via DoExport.
        private void turntoexcel()
        {
            // SaveFileDialog is disposable; the using block releases it on every path.
            using (SaveFileDialog sfd = new SaveFileDialog())
            {
                sfd.DefaultExt = "xls";
                sfd.Filter = "Excel文件(*.xls)|*.xls";
                if (sfd.ShowDialog() == DialogResult.OK)
                {
                    DoExport(listView1, sfd.FileName);
                }
            }
        }
        // Writes the column headers and all rows of the given ListView to a new Excel
        // workbook and saves it as strFileName.
        //
        // listView    - the list to export; a null or empty list shows a "no data" message.
        // strFileName - target path; a null or empty path makes the method return silently.
        //
        // Excel is started through COM automation. The workbook is closed, Excel is quit and
        // both COM objects are released in a finally block, so no EXCEL.EXE process is left
        // behind, whether the export succeeds or fails. Exceptions raised while filling or
        // saving the workbook still propagate to the caller.
        private void DoExport(ListView listView, string strFileName)
        {
            // Check the list that was passed in, not the listView1 field.
            if (listView == null || listView.Items.Count == 0)
            {
                MessageBox.Show("沒有數據,沒法導出!");
                return;
            }
            if (string.IsNullOrEmpty(strFileName))
            {
                return;
            }

            Microsoft.Office.Interop.Excel.Application xlApp = null;
            Microsoft.Office.Interop.Excel.Workbook xlBook = null;
            try
            {
                try
                {
                    xlApp = new Microsoft.Office.Interop.Excel.ApplicationClass();
                }
                catch (System.Runtime.InteropServices.COMException)
                {
                    // Creating the COM object throws (it never returns null) when Excel is not available.
                    MessageBox.Show("沒法建立excel對象,可能您的系統沒有安裝excel");
                    return;
                }

                xlApp.DefaultFilePath = "";
                xlApp.DisplayAlerts = true;
                xlApp.SheetsInNewWorkbook = 1;
                xlBook = xlApp.Workbooks.Add(true);

                // Row 1: the ListView column captions.
                int rowIndex = 1;
                int columnIndex = 0;
                foreach (ColumnHeader header in listView.Columns)
                {
                    columnIndex++;
                    xlApp.Cells[rowIndex, columnIndex] = header.Text;
                }

                // Rows 2..n: one worksheet row per ListView item.
                foreach (ListViewItem item in listView.Items)
                {
                    rowIndex++;
                    columnIndex = 0;
                    foreach (ListViewItem.ListViewSubItem cell in item.SubItems)
                    {
                        columnIndex++;
                        // The trailing tab keeps Excel from showing long digit strings (phone numbers)
                        // in scientific notation.
                        xlApp.Cells[rowIndex, columnIndex] = cell.Text + "\t";
                    }
                }

                // xlWorkbookNormal is used because the Excel 95/97 format constant fails with
                // HRESULT 0x800A03EC on Excel 2003/2007.
                xlBook.SaveAs(strFileName, Microsoft.Office.Interop.Excel.XlFileFormat.xlWorkbookNormal, Type.Missing, Type.Missing, Type.Missing, Type.Missing, Microsoft.Office.Interop.Excel.XlSaveAsAccessMode.xlExclusive, Type.Missing, Type.Missing, Type.Missing, Type.Missing, Type.Missing);
            }
            finally
            {
                if (xlBook != null)
                {
                    // Close without saving again; this also avoids a save prompt when the export failed.
                    xlBook.Close(false, Type.Missing, Type.Missing);
                    System.Runtime.InteropServices.Marshal.ReleaseComObject(xlBook);
                }
                if (xlApp != null)
                {
                    xlApp.Quit();
                    System.Runtime.InteropServices.Marshal.ReleaseComObject(xlApp);
                }
            }

            // Reached only when the workbook was saved without an exception.
            MessageBox.Show("恭喜導出成功!");
        }

        // DoWork handler: sleeps on the calling thread for the number of milliseconds
        // supplied as the work argument.
        private void backgroundWorker1_DoWork(object sender, DoWorkEventArgs e)
        {
            // The argument must be a boxed int; any other value makes the cast throw.
            Thread.Sleep((int)e.Argument);
        }

        // Intentionally empty text-changed handler.
        // NOTE(review): presumably still wired to textBox1 in the designer file (not visible here) - confirm before removing.
        private void textBox1_TextChanged(object sender, EventArgs e)
        {

        }

        // Link handler: stores the Firefox installer URL as the data of the label's first
        // link and then opens that URL with the system's default handler.
        private void linkLabel1_LinkClicked(object sender, LinkLabelLinkClickedEventArgs e)
        {
            LinkLabel.Link firstLink = linkLabel1.Links[0];
            firstLink.LinkData = "http://download.firefox.com.cn/releases/stub/official/zh-CN/Firefox-latest.exe";

            // Read the value back from the link so the opened URL is exactly what was stored.
            System.Diagnostics.Process.Start(firstLink.LinkData.ToString());
        }
        // Elapsed handler of the search timer: counts one tick in time_js for every
        // interval that fires while the timer is still enabled.
        private void OnTimedEvent(Object source, ElapsedEventArgs e)
        {
            // Ignore ticks that arrive after the timer has been switched off.
            if (!timer.Enabled)
            {
                return;
            }

            time_js++;
        }

       

    }
}
相關文章
相關標籤/搜索