解析ASPX網頁__doPostBack分頁的網頁table數據

因爲急於上線的功能要去客服系統裏抓取數據進行驗證,客服方面又沒有時間開發EDI接口給我,因此用了本辦法:爬取對方web系統上的數據進行分析。

因爲客服的web系統用ASP.Net的__doPostBack控件進行數據分頁。__doPostBack是通過__EVENTTARGET、__EVENTARGUMENT兩個隱藏控件向服務端發送控制信息的。

這裏我要分析的頁面概況如下:

這裏有個導出按鈕,直接模擬導出按鈕獲取數據。模擬點擊頁面來獲取我們解析時需要的參數:__VIEWSTATE、__EVENTVALIDATION。

設計一個demo:

 /// <summary>
        /// Simulates a click on the page's export button by POSTing the ASP.NET hidden
        /// fields, then parses the first HTML table of the response into a DataTable
        /// and binds it to the grid.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void button1_Click(object sender, EventArgs e)
        {
            // Hidden-field values captured from the target page. Base64 contains '+', '/' and '=',
            // so they must be percent-encoded before going into a form body.
            string strViewState = System.Web.HttpUtility.UrlEncode("/wEPDwUJOTEyODUyNDU3D2QWAgIBDw8WAh4EVGV4dAUMUGFsbGV0IExpc3Q6ZGQYAQUPZ3ZzeXNzdGF0dXNsaXN0DzwrAAwBCAIOZOMeKZkMrWymOLPLpoGaeUA09JXcMmBeiapHUaN/Gi/F");
            string strEventValidation = System.Web.HttpUtility.UrlEncode("/wEdAA0xoT8jJS8wSLBSnnngxJ11LlhQ8m9UI3ZtBAB4T5ifli9v5Ab4i1r+ooL3Ki4G1V2mwGQwvscKG/slAoCqDXMdx+s83MyrFGOK+Bhp33qS53KOlIwqJuEKsQlYBBWX4thggho5YkoND4EeSEP5w92hCXHcK3jw5s4JUgUUp9F6PJLbP8X7bsfD4kS7SIahZoNk3kkOw9nHzhwCbKD2SjMJqipXHjkN/zBo3hFqKyPYi8LcXcQFP+NtnRoXzZkJPyDN+DvxnwFeFeJ9MIBWR693WMHe4rn0nQ4UPheoLGOgfsnsDvJMKKjfZWVoPlnQzPA=");

            StringBuilder body = new StringBuilder();
            body.Append("__VIEWSTATE=").Append(strViewState);
            body.Append("&__EVENTVALIDATION=").Append(strEventValidation);
            // The button caption is non-ASCII: it has to be percent-encoded, otherwise the
            // ASCII byte conversion below would replace each character with '?'.
            body.Append("&Button1=").Append(System.Web.HttpUtility.UrlEncode("導出"));
            // After encoding, the whole body is plain ASCII.
            byte[] data = System.Text.Encoding.ASCII.GetBytes(body.ToString());

            Uri uri = new Uri(textBox1.Text);
            System.Net.HttpWebRequest request = (System.Net.HttpWebRequest)System.Net.WebRequest.Create(uri);
            request.Method = "POST";
            request.ContentType = "application/x-www-form-urlencoded";
            request.ContentLength = data.Length;
            using (Stream requestStream = request.GetRequestStream())
            {
                requestStream.Write(data, 0, data.Length);
            }

            // Dispose the response and its streams even when reading fails.
            string html;
            using (System.Net.HttpWebResponse response = (System.Net.HttpWebResponse)request.GetResponse())
            using (Stream responseStream = response.GetResponseStream())
            // NOTE(review): Encoding.Default depends on the machine's code page — confirm it matches the page's charset.
            using (StreamReader readStream = new StreamReader(responseStream, System.Text.Encoding.Default))
            {
                html = readStream.ReadToEnd();
            }

            // Output columns, all stored as strings:
            // row number, system serial number, work order, next station, defective flag,
            // finished-goods part number, line, last edit time, carton number, pallet number,
            // completed flag, shipping notice number, container number, shipped flag.
            DataTable dt = new DataTable();
            string[] columnNames =
            {
                "ID", "SN", "WO", "NextStation", "RepairHeld", "SKUNo", "Line",
                "LastEditTime", "Carton", "PalletNo", "Closed", "ShipOrderNo",
                "ContainerNo", "TruckLoaded"
            };
            foreach (string columnName in columnNames)
            {
                dt.Columns.Add(columnName, typeof(string));
            }

            MatchCollection tables = Regex.Matches(html, @"<table[^>]*>[\s\S]*</table>");
            if (tables.Count == 0)
            {
                // No table in the response (error page, expired view state, ...): show an empty grid.
                gridControl1.DataSource = dt;
                return;
            }

            bool headerSkipped = false;
            foreach (Match mtr in Regex.Matches(tables[0].Value, "(?is)(?<=<tr>).+?(?=</tr>)"))
            {
                // The first row holds the column captions, not data.
                if (!headerSkipped)
                {
                    headerSkipped = true;
                    continue;
                }

                MatchCollection tds = Regex.Matches(mtr.Value, "(?is)(?<=<td>).+?(?=</td>)");
                MatchCollection tds2 = Regex.Matches(mtr.Value, "(?is)(?<=target=\"_blank\">).+?(?=</a></td>)");
                MatchCollection tds6 = Regex.Matches(mtr.Value, "(?is)(?<=@\">).+?(?=</td>)");
                // Rows without the expected cells cannot be mapped; skip them instead of throwing.
                if (tds.Count < 12 || tds2.Count < 1 || tds6.Count < 2)
                {
                    continue;
                }

                MatchCollection ids = Regex.Matches(tds[0].Value, "(?is)(?<=\">).+?(?=</span>)");
                if (ids.Count < 1)
                {
                    continue;
                }

                DataRow dr = dt.NewRow();
                dr["ID"] = FormatString(ids[0].Value);
                dr["SN"] = FormatString(tds2[0].Value);
                dr["WO"] = FormatString(tds[1].Value);
                dr["NextStation"] = FormatString(tds[2].Value);
                dr["RepairHeld"] = FormatString(tds[3].Value);
                dr["SKUNo"] = FormatString(tds6[1].Value);
                dr["Line"] = FormatString(tds[4].Value);
                dr["LastEditTime"] = FormatString(tds[5].Value);
                dr["Carton"] = FormatString(tds[6].Value);
                dr["PalletNo"] = FormatString(tds[7].Value);
                dr["Closed"] = FormatString(tds[8].Value);
                dr["ShipOrderNo"] = FormatString(tds[9].Value);
                dr["ContainerNo"] = FormatString(tds[10].Value);
                dr["TruckLoaded"] = FormatString(tds[11].Value);
                dt.Rows.Add(dr);
            }
            gridControl1.DataSource = dt;
        }
        /// <summary>
        /// Form load handler: pre-fills the URL text box with the default page address.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void Form1_Load(object sender, EventArgs e)
        {
            const string defaultPageUrl = "http://172.16.1.13/WIPDetail.aspx?location=PA-1112069";
            textBox1.Text = defaultPageUrl;
        }
        /// <summary>
        /// Cleans a scraped HTML cell value: removes line breaks and &amp;nbsp; entities,
        /// then trims surrounding whitespace.
        /// </summary>
        /// <param name="str">Raw cell text; may be null.</param>
        /// <returns>The cleaned text, or an empty string when <paramref name="str"/> is null or empty.</returns>
        private string FormatString(string str)
        {
            if (string.IsNullOrEmpty(str))
            {
                return string.Empty;
            }

            // Strip CR and LF separately so lone "\n" or "\r" line endings are removed too.
            return str.Replace("\r", "").Replace("\n", "").Replace("&nbsp;", "").Trim();
        }

用正則表達式分析解析出來的HTML文本即可。

其實針對上述既要分頁又是隱藏控件分頁的table不太好爬數據,如果解析不分頁的table的話沒那麼複雜,可以直接獲取整個頁面的html元素然後進行解析。

例如我需要知道如下頁面的table表裏面的數據:

首先根據URL,用WebRequest請求該頁面,用IO流讀取整個頁面的HTML元素,然後用正則定位table,再遍歷這些元素,按照<tr>、<th>、<td>解析。

// Fetches the FGIMA page for the given pallet and converts its HTML tables into a DataTable.
                WebRequest request = WebRequest.Create("http://172.16.1.13/FGIMA.aspx?t=ssn&sno=" + str_Pallet);
                DataTable dt = new DataTable();

                // 'using' releases the response and reader on every exit path,
                // including the early return inside the row loop.
                using (WebResponse response = request.GetResponse())
                using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding("utf-8")))
                {
                    string html = reader.ReadToEnd();

                    // Header captions of the table currently being read, in column order.
                    List<string> headers = new List<string>();

                    foreach (Match table in Regex.Matches(html, @"(?is)(?<=<table>).+?(?=</table>)"))
                    {
                        bool isHeaderRow = true;
                        foreach (Match row in Regex.Matches(table.Value, "(?is)(?<=<tr>).+?(?=</tr>)"))
                        {
                            if (isHeaderRow)
                            {
                                // First row of each table: its <th> cells define the columns.
                                isHeaderRow = false;
                                headers.Clear();
                                foreach (Match th in Regex.Matches(row.Value, "(?is)(?<=<th>).+?(?=</th>)"))
                                {
                                    string name = th.Value.Replace("\r\n", "").Trim();
                                    headers.Add(name);
                                    if (!dt.Columns.Contains(name))
                                    {
                                        dt.Columns.Add(name);
                                    }
                                }
                                continue;
                            }

                            // Rows containing this text are skipped rather than imported.
                            if (row.Value.Contains("累計數量"))
                            {
                                continue;
                            }
                            // The page's "no data found" message: stop and return what has been collected.
                            // NOTE(review): this path returns before the column renames below — confirm callers expect that.
                            if (row.Value.Contains("沒有找到相關數據"))
                            {
                                return dt;
                            }

                            MatchCollection cells = Regex.Matches(row.Value, "(?is)(?<=<td>).+?(?=</td>)");
                            DataRow dr = dt.NewRow();
                            for (int colindex = 0; colindex < headers.Count; colindex++)
                            {
                                string name = headers[colindex];
                                if (name.Contains("機器序列號"))
                                {
                                    // The serial number is wrapped in a link; take the link text (last match wins).
                                    foreach (Match link in Regex.Matches(row.Value, "(?is)(?<=target=\"_blank\">).+?(?=</a>)"))
                                    {
                                        dr[name] = link.Value.Replace("\r\n", "").Trim();
                                    }
                                }
                                else if (colindex < cells.Count)
                                {
                                    // Guarded: a row with fewer cells than headers leaves the remaining columns empty.
                                    dr[name] = cells[colindex].Value.Replace("\r\n", "").Trim();
                                }
                            }
                            dt.Rows.Add(dr);
                        }
                    }

                    // Rename the page's Chinese captions to the column names used by the caller.
                    string[,] renames =
                    {
                        { "機器序列號", "SN" },
                        { "工單號碼", "WorkorderNo" },
                        { "成品料號", "PartNo" },
                        { "棧板號碼", "Pallet" }
                    };
                    for (int i = 0; i < renames.GetLength(0); i++)
                    {
                        if (dt.Columns.Contains(renames[i, 0]))
                        {
                            dt.Columns[renames[i, 0]].ColumnName = renames[i, 1];
                        }
                    }
                    return dt;
                }

相關文章
相關標籤/搜索