巧用Webbrowser實現網絡數據採集

 在上一篇文章《巧用C# webbrowser實現動態網頁爬蟲機器人》中,給出了一個綜合利用webbrowser控件、MSHTML DOM、正則表達式以及線程阻塞技術爬取動態網頁鏈接的框架和方法。(注:所謂動態網頁是指:目標網頁的URL不能從當前網頁中直接得到,而是要依賴JavaScript重新定向和生成。這種動態網頁,我們有時候可以通過查看頁面<script>元素的內容找到蛛絲馬跡,但更多時候,生成目標網頁URL的JS函數的源碼不在當前網頁,這時候就顯得無能爲力了)。《巧用C# webbrowser實現動態網頁爬蟲機器人》一文中的框架方法也同樣適用於靜態網頁,可以說這個框架方法在某種程度上具備一定的通用性。此篇博文是對上一篇博文的一個補充,實現的功能是:從一級索引頁獲取到二級索引頁首頁的鏈接,在二級索引頁首頁獲取當前頁面中指向內容頁面的URL鏈接,並且進行翻頁獲取內容頁源碼,同時完成二級索引頁首頁到次頁的翻頁,並以此類推。(注意:索引頁、內容頁的定義請見:《巧用C# webbrowser實現動態網頁爬蟲機器人》)

    網絡數據採集是網絡挖掘的一個重要步驟,屬於預處理工作範疇。一般分爲兩步:網絡信息採集(即下載網頁源碼)和網絡信息解析(即對網頁源碼進行解析,提取出BOI (block of interest))。本文是對《巧用C# webbrowser實現動態網頁爬蟲機器人》的補充,完成網絡信息採集功能。

下面給出關鍵模塊代碼:

 

/// <summary>
/// Simple data holder describing one content page: its title, its URL and
/// the raw text stored for it.
/// </summary>
public class ArticlePage
{
    // Every field starts out as the empty string, never as null.
    public string title = string.Empty;
    public string url = string.Empty;
    public string rawtext = string.Empty;

    /// <summary>Creates an article page whose three fields are all empty strings.</summary>
    public ArticlePage()
    {
    }
}
 // Phase flags: webBrowser1_DocumentCompleted reads them to decide which wait flag to clear.
 public bool mysignal1;// whether the btnworkflow button has been clicked
        public bool mysignal2;// set to true by btnworkflow_Click while it loops over AnchorNextPage()
        public bool mysignal3;// set to true by btnworkflow_Click while it loads the individual article pages
        // Wait flags: btnworkflow_Click sets one to true and pumps messages until the handler sets it back to false.
        public bool loading;// signalling flag between the workflow button and the webbrowser
        public bool subloading;// cleared by the handler when mysignal2 is true and mysignal3 is false
        public bool subloadingPer;// cleared by the handler when mysignal3 is true
/// <summary>
/// Builds the form and puts the crawl flags into their initial state:
/// the three phase flags are false and the three wait flags are true.
/// </summary>
public Form1()
{
    InitializeComponent();

    // No crawl phase is active until the workflow button is clicked.
    mysignal1 = mysignal2 = mysignal3 = false;

    // All wait flags start out raised.
    loading = subloading = subloadingPer = true;
}
 
 private void webBrowser1_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
      {
            if (webBrowser1.ReadyState ==WebBrowserReadyState.Complete)
            {
                if (mysignal1)
                {
                    if (!mysignal2&&!mysignal3)
                    {
                        loading = false;
                    }
                    else
                    {
                        if (mysignal2&&!mysignal3)
                        {
                            subloading = false;
                        }
                       
                        if (mysignal3)
                        {
                            subloadingPer = false;
                        }

                    }
                   
                   
                }
               
               
            }
                        

           
               
           


        }
/// <summary>
/// Runs the crawl. For every URL in issuesMap the index page is loaded, the
/// articles listed on it are downloaded and stored, and then the same is
/// repeated for every following page for as long as AnchorNextPage() returns true.
/// </summary>
/// <remarks>
/// Navigation is awaited by raising a wait flag and pumping messages with
/// Application.DoEvents() until webBrowser1_DocumentCompleted clears the flag.
/// Each flag is raised BEFORE Navigate is called, so a completion can never be
/// overwritten by a late "flag = true".
/// </remarks>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void btnworkflow_Click(object sender, EventArgs e)
{
    mysignal1 = true;

    foreach (string indexUrl in issuesMap)
    {
        // Load the first page of this index.
        loading = true;
        webBrowser1.Navigate(indexUrl);
        while (loading)
        {
            Application.DoEvents();
        }

        // Holds the URL of the current index page, used to restore its view
        // after the article pages have replaced it in the browser.
        string dirCurPageUrl = DownloadArticlesOfCurrentDirPage();
        if (dirCurPageUrl != null)
        {
            loading = true;
            webBrowser1.Navigate(dirCurPageUrl);
            while (loading)
            {
                Application.DoEvents();
            }
        }

        // Follow the "next page" link of the current page until there is none.
        // NOTE(review): AnchorNextPage() is defined elsewhere; it presumably
        // triggers the navigation itself, which is why the flag can only be
        // raised after it returns - confirm.
        mysignal2 = true;
        while (AnchorNextPage())
        {
            subloading = true;
            while (subloading)
            {
                Application.DoEvents();
            }

            dirCurPageUrl = DownloadArticlesOfCurrentDirPage();
            if (dirCurPageUrl != null)
            {
                subloading = true;
                webBrowser1.Navigate(dirCurPageUrl);
                while (subloading)
                {
                    Application.DoEvents();
                }
            }
        }
        mysignal2 = false;
    }
}

/// <summary>
/// Downloads the source of every article listed on the index page currently
/// shown in the browser and stores the articles in the database.
/// </summary>
/// <returns>
/// The URL of the index page, so that the caller can navigate back to it, or
/// null when GetArticlePageInfoFromCurrentDirpage() returned null and the
/// browser therefore never left the page.
/// </returns>
private string DownloadArticlesOfCurrentDirPage()
{
    List<ArticlePage> articles = GetArticlePageInfoFromCurrentDirpage();
    if (articles == null)
    {
        return null;
    }

    // Remember where we are before the article pages replace the view.
    string dirPageUrl = webBrowser1.Url.ToString();

    mysignal3 = true;
    foreach (ArticlePage ap in articles)
    {
        subloadingPer = true;
        webBrowser1.Navigate(ap.url);
        while (subloadingPer)
        {
            Application.DoEvents();
        }
        ap.rawtext = webBrowser1.DocumentText;
    }
    InsertTitleUrlToDataBase(articles);
    mysignal3 = false;

    return dirPageUrl;
}
/// <summary>
/// Writes every article of <paramref name="arlist"/> to the database, one
/// InsertToDataBase call per article, through a freshly constructed
/// DataBaseManipulation object.
/// </summary>
/// <param name="arlist">Articles to store.</param>
private void InsertTitleUrlToDataBase(List<ArticlePage> arlist)
{
    const string connectionString = "server=(local);database=xxxxx;uid=sa;pwd=xxx";
    const string targetTable = "xxx";

    DataBaseManipulation db = new DataBaseManipulation();
    db.ConstructConnection(connectionString);

    for (int i = 0; i < arlist.Count; i++)
    {
        db.InsertToDataBase(arlist[i], targetTable);
    }
}
 public void InsertToDataBase(ArticlePage article,string table )
        { //插入字符串
           string  sqlcommand=string.Format("insert  into {0}(ArticlePageUrl,ArticlePageTitle,ArticlePageSource)values(@ArticlePageUrl,@ArticlePageTitle,@ArticlePageSource)",table);
         
            //數據庫參數構造與賦初值
           SqlParameter ArticlePageTitle = new SqlParameter("@ArticlePageTitle", SqlDbType.VarChar, 400);
           ArticlePageTitle.Value = article.title;
           SqlParameter ArticlePageUrl = new SqlParameter("@ArticlePageUrl", SqlDbType.VarChar, 400);
           ArticlePageUrl.Value = article.url;
           SqlParameter ArticlePageSource = new SqlParameter("@ArticlePageSource", SqlDbType.Text);
           ArticlePageSource.Value = article.rawtext;
           SqlCommand cmd = new SqlCommand(sqlcommand, connection);
           cmd.Parameters.Add(ArticlePageTitle);
           cmd.Parameters.Add(ArticlePageUrl);
          cmd.Parameters.Add(ArticlePageSource);
         
            //打開數據庫鏈接
           OpenConnection();

        
           try
           {  
               //執行cmd操做
               cmd.ExecuteNonQuery();
           }
           catch (System.Exception e)
           {   //輸出錯誤到記事本中
               StreamWriter sw = new StreamWriter("D:\\myerror.txt", true, Encoding.Default);
               sw.Write(e.Message);
               sw.Close();
              //一旦發生錯誤程序就中止運行,等待用戶發現
               Console.Read();
              
           }

            //關閉數據庫鏈接
           CloseConnection();


        }





    }
}

 

本地數據庫(保存爬取的信息)視圖:

 

 

小結與發散:筆者於本科畢設做過新聞類網頁正文提取的課題,請見《新聞類網頁正文提取系列》,並且從各大新聞門戶網站的不同版面提取了四萬餘條新聞作爲語料庫,該語料庫已經提供給網友下載,語料庫說明以及下載地址見:《獻給熱衷於自然語言處理的業餘愛好者的語料庫》。但是畢設當時,我對於動態網頁提取方法並沒有一個清晰的認識,所以可以提取的新聞版面限制很大(必須是靜態新聞網頁才能提取)。有興趣並且對語料庫有需求的網友,可以參考我的此篇博客,以及上一篇博客《巧用C# webbrowser實現動態網頁爬蟲機器人》,還有正文提取系列的一些博文,自己動手配置一個自動獲取新聞語料庫的小型爬蟲。

相關文章
相關標籤/搜索