點擊我前往 GitHub 查看源代碼,別忘記 star。
本項目 GitHub 地址:https://github.com/wangqifan/ZhiHu
UserManage 是獲取用戶信息的爬蟲模塊。
/// <summary>
/// Holds the state used while crawling one user: the user's URL token and
/// the HTML text fetched for that user.
/// </summary>
public class UserManage
{
    // Token that identifies the user inside request URLs.
    private string url_token;

    // Text of the most recently fetched page.
    private string html;
}
構造函數
用戶主頁的 URL 格式爲 "https://www.zhihu.com/people/" + url_token + "/following"。
/// <summary>
/// Creates a manager bound to a single user.
/// </summary>
/// <param name="urltoken">The user's URL token; stored and used to build request URLs.</param>
public UserManage(string urltoken)
{
    this.url_token = urltoken;
}
先封裝一個獲取 HTML 頁面的方法。
/// <summary>
/// Downloads the user's "following" page and stores the text in <c>html</c>.
/// </summary>
/// <returns><c>true</c> when the downloaded text is neither null nor empty.</returns>
private bool GetHtml()
{
    const string prefix = "https://www.zhihu.com/people/";
    const string suffix = "/following";

    html = HttpHelp.DownLoadString(prefix + url_token + suffix);

    if (string.IsNullOrEmpty(html))
    {
        return false;
    }

    return true;
}
拿到了 HTML 頁面,接下來藉助 HtmlAgilityPack 剝取頁面中的 JSON。
/// <summary>
/// Downloads the user's page and pulls the text of the <c>data-state</c> attribute
/// from the element whose id is <c>data</c>, turning the HTML entities
/// <c>&amp;quot;</c>, <c>&amp;lt;</c> and <c>&amp;gt;</c> back into characters.
/// </summary>
/// <remarks>
/// In this version the decoded text is built but not yet consumed.
/// Any exception is written to the console rather than propagated.
/// </remarks>
public void analyse()
{
    if (!GetHtml())
    {
        return;
    }

    try
    {
        Stopwatch watch = new Stopwatch();
        watch.Start();

        HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
        doc.LoadHtml(html);

        HtmlNode node = doc.GetElementbyId("data");
        if (node == null)
        {
            // Without this check the next line fails with a NullReferenceException
            // that says nothing about which page was malformed.
            Console.WriteLine("No element with id 'data' found for url_token '" + url_token + "'.");
            return;
        }

        // The replacements assume the attribute value still contains HTML entities.
        // The entity names must be spelled out: a literal quote or angle bracket here
        // either fails to compile or replaces a character with itself.
        StringBuilder stringbuilder = new StringBuilder(node.GetAttributeValue("data-state", ""));
        stringbuilder.Replace("&quot;", "'");
        stringbuilder.Replace("&lt;", "<");
        stringbuilder.Replace("&gt;", ">");

        watch.Stop();
        Console.WriteLine("分析Html用了{0}毫秒", watch.ElapsedMilliseconds.ToString());
    }
    catch (Exception ex)
    {
        Console.WriteLine(ex.ToString());
    }
}
添加用戶的關注列表的鏈接。
/// <summary>
/// Pushes the first-page followee URL and then the first-page follower URL of this
/// user onto the <c>nexturl</c> list.
/// </summary>
/// <param name="json">Not read by this method.</param>
private void GetUserFlowerandNext(string json)
{
    string memberRoot = "https://www.zhihu.com/api/v4/members/" + url_token;

    // The two query strings are encoded differently on purpose: they are kept exactly as given.
    string followeesUrl = memberRoot + "/followees?include=data%5B%2A%5D.answer_count%2Carticles_count%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics&limit=20&offset=0";
    string followersUrl = memberRoot + "/followers?include=data%5B*%5D.answer_count%2Carticles_count%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=0&limit=20";

    RedisCore.PushIntoList(1, "nexturl", followeesUrl);
    RedisCore.PushIntoList(1, "nexturl", followersUrl);
}
對 JSON 數據進一步剝取,只要用戶的信息,藉助 JSON 解析工具 Newtonsoft.Json。
/// <summary>
/// Extracts this user's entry from <c>entities.users</c> in the given JSON text and
/// pushes its string form onto the <c>User</c> list.
/// </summary>
/// <param name="json">JSON text expected to contain <c>entities.users.&lt;url_token&gt;</c>.</param>
/// <exception cref="InvalidOperationException">
/// The JSON has no <c>entities.users</c> entry for this user's token.
/// </exception>
private void GetUserInformation(string json)
{
    JObject root = JObject.Parse(json);

    // Walk the levels with indexers rather than a hand-built path expression:
    // a missing level yields null instead of a null dereference, and the token
    // is used as a plain property name so no path escaping is involved.
    JObject entities = root["entities"] as JObject;
    JObject users = entities == null ? null : entities["users"] as JObject;
    JToken user = users == null ? null : users[url_token];

    if (user == null)
    {
        throw new InvalidOperationException(
            "No entities.users entry found for url_token '" + url_token + "'.");
    }

    RedisCore.PushIntoList(2, "User", user.ToString());
}
現在來完成 analyse 函數。
/// <summary>
/// Downloads the user's page, extracts the JSON held in the <c>data-state</c> attribute
/// of the element whose id is <c>data</c>, then stores the user's information and
/// queues the user's follower/followee list URLs.
/// </summary>
/// <remarks>Any exception is written to the console rather than propagated.</remarks>
public void analyse()
{
    if (!GetHtml())
    {
        return;
    }

    try
    {
        Stopwatch watch = new Stopwatch();
        watch.Start();

        HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
        doc.LoadHtml(html);

        HtmlNode node = doc.GetElementbyId("data");
        if (node == null)
        {
            // Without this check the next line fails with a NullReferenceException
            // that says nothing about which page was malformed.
            Console.WriteLine("No element with id 'data' found for url_token '" + url_token + "'.");
            return;
        }

        // The replacements assume the attribute value still contains HTML entities.
        // The entity names must be spelled out: a literal quote or angle bracket here
        // either fails to compile or replaces a character with itself.
        StringBuilder stringbuilder = new StringBuilder(node.GetAttributeValue("data-state", ""));
        stringbuilder.Replace("&quot;", "'");
        stringbuilder.Replace("&lt;", "<");
        stringbuilder.Replace("&gt;", ">");

        // Materialise the text once and hand the same string to both consumers.
        string json = stringbuilder.ToString();
        GetUserInformation(json);
        GetUserFlowerandNext(json);

        watch.Stop();
        Console.WriteLine("分析Html用了{0}毫秒", watch.ElapsedMilliseconds.ToString());
    }
    catch (Exception ex)
    {
        Console.WriteLine(ex.ToString());
    }
}
} // closes the enclosing class
UrlTask 從 nexturl 隊列取出用戶關注列表的 URL,然後獲取關注列表。服務器返回的是 JSON 數據。
封裝一個對象的序列化和反序列化的類
/// <summary>
/// Convenience wrappers around <c>JsonConvert</c> for turning objects into JSON text and back.
/// </summary>
public class SerializeHelper
{
    /// <summary>
    /// Serializes an object to its JSON text.
    /// </summary>
    /// <param name="value">The object to serialize.</param>
    /// <returns>The JSON text produced by <c>JsonConvert.SerializeObject</c>.</returns>
    public static string SerializeToString(object value)
    {
        string serialized = JsonConvert.SerializeObject(value);
        return serialized;
    }

    /// <summary>
    /// Deserializes JSON text into an instance of <typeparamref name="T"/>.
    /// </summary>
    /// <typeparam name="T">The type to deserialize into.</typeparam>
    /// <param name="str">The JSON text.</param>
    /// <returns>The value produced by <c>JsonConvert.DeserializeObject</c>.</returns>
    public static T DeserializeToObject<T>(string str)
    {
        T materialized = JsonConvert.DeserializeObject<T>(str);
        return materialized;
    }
}
封裝UrlTask類
/// <summary>
/// Represents one URL to be fetched together with the text downloaded from it.
/// </summary>
public class UrlTask
{
    // Text downloaded from the URL.
    private string JSONstring { get; set; }

    // The URL this task was created for.
    private string url { get; set; }

    /// <summary>
    /// Creates a task for the given URL.
    /// </summary>
    /// <param name="_url">The URL to fetch.</param>
    public UrlTask(string _url)
    {
        this.url = _url;
    }
}
添加一個獲取資源的方法
/// <summary>
/// Downloads the task's URL into <c>JSONstring</c> and reports whether anything came back.
/// </summary>
/// <returns><c>true</c> when the downloaded text is neither null nor empty.</returns>
private bool GetHtml()
{
    string downloaded = HttpHelp.DownLoadString(url);
    JSONstring = downloaded;

    // Written after every download attempt that returns, whether or not text came back.
    Console.WriteLine("Json下載完成");

    bool hasContent = !string.IsNullOrEmpty(downloaded);
    return hasContent;
}
解析json方法
/// <summary>
/// Downloads one page of a follower/followee list. When the page is not the last one,
/// its <c>paging.next</c> URL is pushed onto the <c>nexturl</c> list. Every user token
/// on the page that is newly recorded in <c>urltokenhash</c> is pushed onto the
/// <c>urltoken</c> list.
/// </summary>
/// <remarks>Any exception is written to the console rather than propagated.</remarks>
public void Analyse()
{
    try
    {
        if (!GetHtml())
        {
            return;
        }

        Stopwatch watch = new Stopwatch();
        watch.Start();

        followerResult result = SerializeHelper.DeserializeToObject<followerResult>(JSONstring);

        if (!result.paging.is_end)
        {
            RedisCore.PushIntoList(1, "nexturl", result.paging.next);
        }

        foreach (var item in result.data)
        {
            if (string.IsNullOrEmpty(item.url_token))
            {
                // Nothing to deduplicate or visit without a token.
                continue;
            }

            // The bucket must be a pure function of the token. item.GetHashCode() is not
            // (unless the item type overrides it), so the same user could be checked
            // against a different bucket each time and be queued again.
            int type = GetTokenBucket(item.url_token);
            if (RedisCore.InsetIntoHash(type, "urltokenhash", item.url_token, "存在"))
            {
                RedisCore.PushIntoList(1, "urltoken", item.url_token);
            }
        }

        watch.Stop();
        Console.WriteLine("解析json用了{0}毫秒", watch.ElapsedMilliseconds.ToString());
    }
    catch (Exception ex)
    {
        Console.WriteLine(ex.ToString());
    }
}

/// <summary>
/// Maps a token to 3, 4 or 5 using a 32-bit FNV-1a hash of its UTF-16 code units,
/// so the result is identical on every run, process and platform.
/// </summary>
/// <param name="token">A non-null token.</param>
/// <returns>An integer in the range 3..5.</returns>
private static int GetTokenBucket(string token)
{
    unchecked
    {
        uint hash = 2166136261;
        foreach (char ch in token)
        {
            hash = (hash ^ ch) * 16777619;
        }

        // Unsigned arithmetic keeps the remainder non-negative; no Math.Abs needed.
        return (int)(hash % 3) + 3;
    }
}
解析:如果 result.paging.is_end 爲 true,說明這是用戶關注列表的最後一頁,不需要再把下一頁加入隊列;否則應該把 result.paging.next 加入 nexturl 隊列。對於返回的用戶數組,由於信息不全,這裏只取 url_token,之後憑它前往用戶主頁獲取詳細信息。
模塊組合
封裝一個方法,從隊列拿到 nexturl,前往用戶的關注列表,拿到更多用戶 ID。
/// <summary>
/// Pops one URL from the <c>nexturl</c> list and, when one was available, processes it
/// with a <c>UrlTask</c>.
/// </summary>
private static void GetNexturl()
{
    string pending = RedisCore.PopFromList(1, "nexturl");
    if (string.IsNullOrEmpty(pending))
    {
        return;
    }

    UrlTask listPage = new UrlTask(pending);
    listPage.Analyse();
}
封裝一個方法,循環從隊列獲取用戶的 urltoken(如果隊列空了,就執行 GetNexturl),前往用戶主頁,獲取信息。
/// <summary>
/// Worker loop that never returns: pops a user token from the <c>urltoken</c> list and
/// analyses that user, or calls <c>GetNexturl</c> when no token was available.
/// </summary>
/// <param name="data">Not read by this method.</param>
private static void GetUser(object data)
{
    for (;;)
    {
        string token = RedisCore.PopFromList(1, "urltoken");

        // Echoed before the emptiness check, so an empty pop is echoed as well.
        Console.WriteLine(token);

        if (string.IsNullOrEmpty(token))
        {
            GetNexturl();
            continue;
        }

        UserManage userPage = new UserManage(token);
        userPage.analyse();
    }
}
在main函數裏面執行這些方法,因爲任務量大,採用多線程,線程數視狀況而定
// Queue ten GetUser worker loops on the thread pool.
for (int remaining = 10; remaining > 0; remaining--)
{
    ThreadPool.QueueUserWorkItem(GetUser);
}
添加種子數據:剛開始時隊列都是空的,需要先添加種子數據。
// Seed the crawl with one known user. The placeholder string must be replaced with a
// real url token. UserManage is the class that takes a url token and exposes analyse().
UserManage task = new UserManage("某個用戶的uRLtoken");
task.analyse();
執行一次以後要註釋掉,避免重複