本文介紹基於搜狗的微信公衆號定向爬蟲,使用C#實現,故取名WeGouSharp。本文中的項目託管在Github上,你可以戳WeGouSharp獲取源碼,歡迎點星。關於微信公衆號爬蟲的項目網上已經很多,然而絕大多數都是使用Python實現。鑑於鄙人是名.NET開發人員,於是又爲廣大微軟系同胞創建了這個輪子:使用C#實現的微信爬蟲,藍本爲Chyroc/WechatSogou,在此還請各位大佬指教。
目錄 1.項目結構 2.數據結構 3.xpath介紹 4.使用HtmlAgilityPack解析網頁內容 5.驗證碼處理以及文件緩存
/// <summary>
/// Describes one WeChat official account.
/// </summary>
public struct OfficialAccount
{
    /// <summary>URL of the official account's page.</summary>
    public string AccountPageurl;

    /// <summary>Account ID (unique).</summary>
    public string WeChatId;

    /// <summary>Account name.</summary>
    public string Name;

    /// <summary>Account introduction text.</summary>
    public string Introduction;

    /// <summary>Whether the account is officially verified.</summary>
    public bool IsAuth;

    /// <summary>Link to the account's QR code.</summary>
    public string QrCode;

    /// <summary>Link to the account's profile picture.</summary>
    public string ProfilePicture;

    // public string RecentArticleUrl;
}
| 字段 | 含義 |
| --- | --- |
| AccountPageurl | 微信公衆號頁 |
| WeChatId | 公號ID(唯一) |
| Name | 名稱 |
| Introduction | 介紹 |
| IsAuth | 是否官方認證 |
| QrCode | 二維碼鏈接 |
| ProfilePicture | 頭像鏈接 |
/// <summary>
/// One entry of a hot-word list.
/// </summary>
public struct HotWord
{
    /// <summary>Ranking position.</summary>
    public int Rank;

    /// <summary>The hot word itself.</summary>
    public string Word;

    /// <summary>Related link for the word.</summary>
    public string JumpLink;

    /// <summary>Popularity ("heat") value.</summary>
    public int HotDegree;
}
HttpTool類裏封裝了一個帶較多參數的HTTP GET操作,用於獲取搜狗的頁面:
由於搜狗本身是做搜索引擎的緣故,所以反爬蟲是非常嚴厲的,因此HTTP GET的方法要注意攜帶很多參數,且不同頁面要求不一樣。一般地,要帶上默認的
referer和host,然後請求頭的UserAgent要僞造,常用的useragent有
/// <summary>
/// Pool of browser User-Agent strings available for the request header.
/// </summary>
// NOTE(review): the K-Ninja/2.1.1 and Kazehakase/0.5.6 entries have an unbalanced
// "(" and an empty "rv:" value; they are reproduced unchanged here - confirm the
// intended strings.
public static List<string> _agent = new List<string>
{
    // Internet Explorer / AOL variants
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",

    // WebKit- and Gecko-based browsers
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv: Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv: Gecko Fedora/ Kazehakase/0.5.6",

    // Chrome and Opera
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
};
自定義的GET 方法
/// <summary>
/// Called when a captcha appears on the page and must be entered before crawling can continue.
/// The verification depends on a cookie: the request that fetches the captcha carries a cookie
/// that differs every time, and it has to be sent along when the captcha answer is posted.
/// </summary>
/// <param name="url">Sent as the <c>Referer</c> header of the POST request.</param>
/// <param name="isUseOCR">Not read by this method; the code is always typed on the console.</param>
/// <returns>
/// <c>true</c> when the <c>ret</c> field of the JSON response is 0; <c>false</c> when it is
/// non-zero or when the response cannot be processed (the error is logged, not rethrown).
/// </returns>
public bool VerifyCodeForContinute(string url, bool isUseOCR)
{
    bool isSuccess = false;
    logger.Debug("vcode appear, use VerifyCodeForContinute()");

    DateTime Epoch = new DateTime(1970, 1, 1, 0, 0, 0, 0);

    // "R" formatting of a double uses the current culture's decimal separator; pin the
    // invariant culture so the value placed in the URL / form body always uses '.'.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;
    var timeStamp17 = (DateTime.UtcNow - Epoch).TotalMilliseconds.ToString("R", invariant);

    // NOTE(review): the URL prefix below is an empty literal, so the request target is only
    // the timestamp - confirm the intended captcha endpoint.
    string codeurl = "" + timeStamp17;
    WebHeaderCollection headers = new WebHeaderCollection();
    var content = this.Get(headers, codeurl, "UTF-8", true);

    // Hand the fetched content to DisplayImageFromBase64 without blocking the prompt.
    // NOTE(review): delegate BeginInvoke is not supported on .NET Core / .NET 5+ - confirm
    // the target framework.
    ShowImageHandle showImageHandle = new ShowImageHandle(DisplayImageFromBase64);
    showImageHandle.BeginInvoke(content, null, null);

    Console.WriteLine("請輸入驗證碼:");
    // ReadLine returns null when the input stream is closed; treat that as an empty answer.
    string verifyCode = (Console.ReadLine() ?? string.Empty).Trim();

    // NOTE(review): the POST URL and the Host header value are empty literals - confirm the
    // intended values.
    string postURL = "";
    timeStamp17 = (DateTime.UtcNow - Epoch).TotalMilliseconds.ToString("R", invariant);

    // The typed code goes into a form-encoded body, so it must be escaped.
    string postData = string.Format("cert={0}&input={1}", timeStamp17, Uri.EscapeDataString(verifyCode));
    headers.Add("Host", "");
    headers.Add("Referer", url);
    string remsg = this.Post(postURL, headers, postData, true);

    try
    {
        // Parse the JSON response; "ret" == 0 means the captcha was accepted.
        JObject jo = JObject.Parse(remsg);
        int statusCode = (int)jo.GetValue("ret");
        if (statusCode == 0)
        {
            isSuccess = true;
        }
        else
        {
            logger.Error("cannot unblock because " + jo.GetValue("msg"));

            // Previously this exception was thrown straight into the catch below, which only
            // logged it. Log it directly instead of using a throw for control flow.
            var vcodeException = new WechatSogouVcodeException();
            vcodeException.MoreInfo = "cannot jiefeng because " + jo.GetValue("msg");
            logger.Error(vcodeException);
        }
    }
    catch (Exception e)
    {
        // Best effort: a missing or malformed response is logged and reported as failure.
        logger.Error(e);
    }

    return isSuccess;
}
// Milliseconds elapsed since Epoch as a round-trip ("R") string, i.e. up to 17 significant digits.
// The invariant culture keeps the decimal separator a '.' regardless of the machine's locale.
var timeStamp17 = (DateTime.UtcNow - Epoch).TotalMilliseconds.ToString("R", System.Globalization.CultureInfo.InvariantCulture);