對於QQ空間的數據一直以來都垂涎不已,老早就想偷過來研究研究,這幾天閒下來便開始動手。
整個程序的流程爲:登陸-->獲取cookie-->獲取所有好友的qq_number-->根據好友的qq_number遍歷他們的說說-->獲取所有好友的說說數據
程序跑了20多分鐘就跑完了,共282個好友,抓取了60000+條說說。
有些個人隱私我抹掉了,別介意,嘿嘿。
1.登陸-->獲取cookie
打開 http://i.qq.com/,如下圖
但大多數時候是這樣的
我們這裏使用帳號密碼登陸,爲了方便,使用selenium這個自動化神器(關於selenium的用法可以參考 http://www.javashuo.com/article/p-gopedkfz-hu.html,這裏不做過多闡述)
QQ帳號、QQ密碼存儲在userinfo.ini文件中,然後用configparser將其讀取出來
讀取的代碼如下
configparser是一個讀取配置文件的庫,這裏讀取的格式爲get('[配置文件中括號裏的值]', '相對應的key值'),例如config.get('qq_info', 'qq_number')
import configparser

# Load the QQ account and password from the [qq_info] section of userinfo.ini.
config = configparser.ConfigParser(allow_no_value=False)
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means the file is missing or
# unreadable. Fail here with a clear error instead of a later NoSectionError.
if not config.read('userinfo.ini'):
    raise FileNotFoundError('userinfo.ini')
self.__username = config.get('qq_info', 'qq_number')
self.__password = config.get('qq_info', 'qq_password')
用戶信息讀取出來後就可以登陸了
有些盆友用selenium的時候,可能會發現有些元素定位不到,這是因爲有些網頁套了一個iframe
selenium根據id定位到該iframe
# Select the iframe with id 'login_frame' so that elements inside it can be
# located. `switch_to.frame` replaces the deprecated `switch_to_frame`, which
# was removed in Selenium 4.
self.web.switch_to.frame('login_frame')
自動登陸且獲取cookie的代碼
def login(self):
    """Log in with account/password and capture the session cookies.

    Drives the browser held in ``self.web``: switches into the
    ``login_frame`` iframe, selects the account/password login form, submits
    the credentials, then opens the user's Qzone page and collects the
    cookies.

    Side effects:
        * ``self.cookies`` is set to a ``name=value;`` concatenation of all
          browser cookies.
        * ``self.get_g_tk()`` is called to derive ``self.g_tk`` from them.
        * ``self.headers['Cookie']`` is set to the same cookie string.
        * The browser is always closed, even if a step fails.
    """
    try:
        self.web.switch_to.frame('login_frame')
        # find_element('id', ...) is the locator form that works on both old
        # and current Selenium releases; the find_element_by_* helpers were
        # removed in Selenium 4.
        self.web.find_element('id', 'switcher_plogin').click()
        time.sleep(1)  # give the password form time to appear
        self.web.find_element('id', 'u').send_keys(self.__username)
        self.web.find_element('id', 'p').send_keys(self.__password)
        login_button = self.web.find_element('id', 'login_button')
        time.sleep(1)
        login_button.click()
        time.sleep(2)  # wait for the login to complete
        self.web.get('https://user.qzone.qq.com/{}'.format(self.__username))
        self.cookies = ''.join(
            elem["name"] + "=" + elem["value"] + ";"
            for elem in self.web.get_cookies()
        )
        self.get_g_tk()
        self.headers['Cookie'] = self.cookies
    finally:
        # Do not leave a browser process behind when any step above raises.
        self.web.quit()
2.獲取全部好友的QQ_number
研究了好久後發現,在QQ空間主頁的權限設置頁面中,點擊「僅限QQ好友」,會有下面這樣的頁面出來
按F12後研究js文件發現有這樣一個文件
這個js文件裏有好友的qq_number
於是請求這個文件得到qq_number
def get_frends_url(self):
    """Return the friend-list CGI URL for the logged-in account.

    The query string carries the account number (``uin``) and the
    ``g_tk`` token; the paging ``offset`` is appended by the caller.
    """
    base = 'https://h5.qzone.qq.com/proxy/domain/base.qzone.qq.com/cgi-bin/right/get_entryuinlist.cgi?'
    params = {
        "uin": self.__username,
        "fupdate": 1,
        "action": 1,
        "g_tk": self.g_tk,
    }
    return base + parse.urlencode(params)


def get_frends_num(self):
    """Download the friend list page by page into ``./frends/<offset>.json``.

    Requests the friend-list URL with ``offset`` = 0, 50, 100, ... and saves
    each raw response. Paging stops at the first response whose ``uinlist``
    is empty, and also at any response that does not contain a ``uinlist``
    at all (for example an error page). Without the second check such a
    response would be saved and requested again forever.
    """
    offset = 0
    url = self.get_frends_url()
    while True:
        page = self.req.get(url=url + '&offset=' + str(offset), headers=self.headers)
        text = page.text
        if '"uinlist":[]' in text or '"uinlist"' not in text:
            break
        # Created lazily so that no empty directory appears when nothing is saved.
        os.makedirs('./frends/', exist_ok=True)
        with open('./frends/' + str(offset) + '.json', 'w', encoding='utf-8') as w:
            w.write(text)
        offset += 50
這裏用到了self.g_tk,它由get_g_tk()根據cookie中的p_skey計算得到(是p_skey的哈希值,而不是加密後的p_skey)。在qzfl_v8_2.1.61.js這個js文件中,有這樣一段代碼
// Computes the anti-CSRF token for a request to `url`: selects a session-key
// cookie based on the URL's host and returns its hash (see _DJB below).
QZFL.pluginsDefine.getACSRFToken = function(url) {
  url = QZFL.util.URI(url);
  var skey;
  if (url) {
    // Note: both host checks use `indexOf(...) > 0`, so they only match when
    // the domain appears after the first character of the host.
    if (url.host && url.host.indexOf("qzone.qq.com") > 0) {
      // Prefer the parent frame's cookie accessor; fall back to this frame's
      // accessor if reaching into `parent` throws.
      try {
        skey = parent.QZFL.cookie.get("p_skey");
      } catch (err) {
        skey = QZFL.cookie.get("p_skey");
      }
    } else {
      if (url.host && url.host.indexOf("qq.com") > 0) {
        skey = QZFL.cookie.get("skey");
      }
    }
  }
  // Nothing selected above: try p_skey, then skey, then rv2, then "".
  if (!skey) {
    skey = QZFL.cookie.get("p_skey") || (QZFL.cookie.get("skey") || (QZFL.cookie.get("rv2") || ""));
  }
  return arguments.callee._DJB(skey);
};
// String hash: starts at 5381 and, for every character, adds
// (hash << 5) + charCode to the running value (i.e. hash * 33 + charCode);
// the result is masked to its low 31 bits.
QZFL.pluginsDefine.getACSRFToken._DJB = function(str) {
  var hash = 5381;
  for (var i = 0, len = str.length;i < len;++i) {
    hash += (hash << 5) + str.charCodeAt(i);
  }
  return hash & 2147483647;
};
把它寫成python版的以下
def get_g_tk(self):
    """Derive the ``g_tk`` request token from ``self.cookies``.

    ``self.cookies`` is a ``name=value;name=value;...`` string. The token is
    the hash of the ``p_skey`` cookie: start at 5381, add
    ``(h << 5) + ord(ch)`` for every character, then keep the low 31 bits.
    Like the JavaScript original, an absent or empty ``p_skey`` falls back
    to ``skey``, then ``rv2``, then the empty string. The previous
    ``find()``-based slicing hashed unrelated text when ``p_skey=`` was
    missing or not terminated by ``;``.

    The result is printed and stored in ``self.g_tk``.
    """
    cookie_map = {}
    for pair in self.cookies.split(';'):
        name, sep, value = pair.strip().partition('=')
        # Keep the first occurrence of each cookie name.
        if sep and name not in cookie_map:
            cookie_map[name] = value
    skey = (cookie_map.get('p_skey')
            or cookie_map.get('skey')
            or cookie_map.get('rv2')
            or '')
    h = 5381
    for ch in skey:
        h += (h << 5) + ord(ch)
    g_tk = h & 2147483647
    print('g_tk', g_tk)
    self.g_tk = g_tk
因爲好友信息存儲爲json文件,因此需要解析文件信息
# coding: utf-8
import json
import os

# Key that precedes the friend array in each saved response.
_UINLIST_MARKER = '"uinlist":'


def _extract_uinlist(text):
    """Return the friend array embedded in one saved response text.

    Parses the JSON value that follows the ``"uinlist":`` key, so the result
    does not depend on the exact length of the wrapper around it. If the key
    is not found, falls back to the fixed ``[95:-5]`` slice used previously.
    """
    marker_at = text.find(_UINLIST_MARKER)
    if marker_at == -1:
        return json.loads(text[95:-5])
    start = marker_at + len(_UINLIST_MARKER)
    # raw_decode() requires the value to start exactly at the given index.
    while start < len(text) and text[start].isspace():
        start += 1
    entries, _end = json.JSONDecoder().raw_decode(text, start)
    return entries


def _offset_order(file_name):
    """Sort key: numeric file stems (paging offsets) first, in numeric order."""
    stem = file_name[:-len('.json')]
    if stem.isdigit():
        return (0, int(stem), file_name)
    return (1, 0, file_name)


def get_Frends_list(directory='./frends/'):
    """Collect the friend entries from every ``*.json`` file in *directory*.

    Args:
        directory: Folder holding the saved friend-list responses. Defaults
            to ``./frends/``.

    Returns:
        A list with the entries of all files, read in ascending offset order
        so that positions in the list are stable between runs
        (``os.listdir`` order is arbitrary). Returns an empty list when the
        directory does not exist.
    """
    if not os.path.isdir(directory):
        return []
    file_names = [n for n in os.listdir(directory) if n.endswith('json')]
    frends_list = []
    for file_name in sorted(file_names, key=_offset_order):
        with open(os.path.join(directory, file_name), 'r', encoding='utf-8') as w:
            frends_list.extend(_extract_uinlist(w.read()))
    return frends_list


# Evaluated at import time: other code imports `frends_list` from this module.
frends_list = get_Frends_list()
print(frends_list)
3.獲取全部好友說說
與之前類似,進入好友的說說主頁後發現也有這樣一個js文件,將所有說說以json形式顯示出來
類似地,寫了獲取說說的代碼(經過測試,參數中的num最好寫20,否則會出現未知的結果)
def get_mood_url(self):
    """Return the mood-list CGI URL with all fixed query parameters.

    The friend's ``uin`` and the paging ``pos`` are appended by the caller.
    ``num`` is the page size (20) and ``callback`` is the JSONP wrapper name
    that the saved responses start with.
    """
    base = 'https://h5.qzone.qq.com/proxy/domain/taotao.qq.com/cgi-bin/emotion_cgi_msglist_v6?'
    params = {
        "sort": 0,
        "start": 0,
        "num": 20,
        "cgi_host": "http://taotao.qq.com/cgi-bin/emotion_cgi_msglist_v6",
        "replynum": 100,
        "callback": "_preloadCallback",
        "code_version": 1,
        "inCharset": "utf-8",
        "outCharset": "utf-8",
        "notice": 0,
        "format": "jsonp",
        "need_private_comment": 1,
        "g_tk": self.g_tk,
    }
    return base + parse.urlencode(params)


def get_mood_detail(self, start_index=245):
    """Download every mood page of each friend into ``./mood_detail/``.

    For each friend, pages are requested with ``pos`` = 0, 20, 40, ... and
    each raw response is saved as
    ``./mood_detail/<label>/<qq_number>_<pos>.json``, pausing 2 seconds after
    every saved page.

    Paging for a friend stops when the response reports ``"msglist":null``,
    contains the "no permission" message, or contains no ``msglist`` at all
    (for example an error page). Without the last check such a response
    would be saved and requested again forever.

    Args:
        start_index: Index of the first friend in ``frends_list`` to process.
            Defaults to 245, the value that was previously hard-coded
            (presumably left over from resuming an interrupted run - TODO
            confirm); pass 0 to process every friend.
    """
    from getFrends import frends_list

    url = self.get_mood_url()
    for u in frends_list[start_index:]:
        QQ_number = u['data']
        friend_url = url + '&uin=' + str(QQ_number)
        # The label becomes a directory name. Path separators inside it would
        # make directory creation fail, so replace them.
        folder = './mood_detail/' + u['label'].replace('/', '_').replace('\\', '_')
        pos = 0
        while True:
            mood_detail = self.req.get(url=friend_url + '&pos=' + str(pos), headers=self.headers)
            print(QQ_number, u['label'], pos)
            text = mood_detail.text
            if ('"msglist":null' in text
                    or '"message":"對不起,主人設置了保密,您沒有權限查看"' in text
                    or '"msglist"' not in text):
                break
            os.makedirs(folder, exist_ok=True)
            with open(folder + "/" + str(QQ_number) + "_" + str(pos) + '.json',
                      'w', encoding='utf-8') as w:
                w.write(text)
            pos += 20
            time.sleep(2)
將需要的說說數據存入數據庫
# Store the data in the database
def dataToMysql():
    """Insert every saved mood under ``mood_detail/`` into the ``info`` table.

    Walks each friend directory below ``mood_detail`` (names ending in
    ``.xls`` and non-directories are skipped), loads every ``*.json`` file
    and inserts one row per entry of its ``msglist``. All rows are committed
    together at the end; the connection is closed even if an error occurs.

    Values are passed to ``cursor.execute`` as query parameters. The previous
    version pasted them into the SQL text with ``str.format``, which produced
    unquoted (invalid) SQL and allowed scraped text to alter the statement.
    """
    con = pymysql.connect(
        host='127.0.0.1',
        user='root',
        password="×××",
        database='qq_z',
        port=3306,
    )
    sql = ("insert into info (qq_number,created_time,content,commentlist,source_name,cmtnum,name) "
           "values (%s,%s,%s,%s,%s,%s,%s);")
    try:
        cur = con.cursor()
        for friend in os.listdir('mood_detail'):
            friend_dir = 'mood_detail/' + friend
            if friend.endswith('.xls') or not os.path.isdir(friend_dir):
                continue
            print(friend_dir)
            for file_name in os.listdir(friend_dir):
                if not file_name.endswith('.json'):
                    continue
                # NOTE(review): the files are written as UTF-8 by
                # get_mood_detail, so decoding them as latin-1 turns non-ASCII
                # text into mojibake. Left unchanged because the stored bytes
                # may rely on it together with the connection/table charset -
                # confirm before switching to utf-8.
                with open(friend_dir + "/" + file_name, 'r', encoding='latin-1') as w:
                    # Strip the 17-character "_preloadCallback(" prefix and
                    # the trailing ");" of the JSONP wrapper.
                    payload = w.read()[17:-2]
                js = json.loads(payload)
                print(file_name)
                # File names look like "<qq_number>_<pos>.json".
                qq_number = int(file_name[:file_name.find('_')])
                for mood in js['msglist']:
                    comments = mood.get('commentlist') or []
                    comment_rows = [
                        (x['content'], x['createTime2'], x['name'], x['uin'])
                        for x in comments
                    ]
                    cur.execute(sql, (
                        qq_number,
                        mood['created_time'],
                        str(mood['content']),
                        str(comment_rows),
                        str(mood['source_name']),
                        int(mood['cmtnum']),
                        str(mood['name']),
                    ))
        con.commit()
    finally:
        con.close()
將需要的說說數據存入Excel
def dataToExcel():
    """Write one ``.xls`` workbook per friend into ``mood_detail/Excel/``.

    For every friend directory below ``mood_detail``, each ``*.json`` file is
    loaded and one row per ``msglist`` entry is written with the columns
    content, createTime, commentlist, source_name and cmtnum. A failure to
    save one workbook is reported and the remaining friends are still
    processed.

    Changes from the previous version:
        * Files are decoded as UTF-8, the encoding get_mood_detail writes
          them in; decoding as latin-1 put mojibake into the cells.
        * The ``Excel`` output directory and non-directories are skipped, so
          a second run no longer produces a bogus ``Excel.xls``.
        * A save failure prints which workbook failed and why.
    """
    out_dir = 'mood_detail/Excel/'
    headers = ('content', 'createTime', 'commentlist', 'source_name', 'cmtnum')
    for friend in os.listdir('mood_detail'):
        friend_dir = 'mood_detail/' + friend
        if (friend.endswith('.xls') or friend == 'Excel'
                or not os.path.isdir(friend_dir)):
            continue
        wb = xlwt.Workbook()
        sheet = wb.add_sheet('sheet1', cell_overwrite_ok=True)
        for col, title in enumerate(headers):
            sheet.write(0, col, title)
        print(friend_dir)
        row = 1  # row 0 holds the headers
        for file_name in os.listdir(friend_dir):
            if not file_name.endswith('.json'):
                continue
            with open(friend_dir + "/" + file_name, 'r', encoding='utf-8') as w:
                # Strip the 17-character "_preloadCallback(" prefix and the
                # trailing ");" of the JSONP wrapper.
                payload = w.read()[17:-2]
            js = json.loads(payload)
            print(file_name)
            for mood in js['msglist']:
                comments = mood.get('commentlist') or []
                comment_rows = [
                    (x['content'], x['createTime2'], x['name'], x['uin'])
                    for x in comments
                ]
                sheet.write(row, 0, str(mood['content']))
                sheet.write(row, 1, str(mood['createTime']))
                sheet.write(row, 2, str(comment_rows))
                sheet.write(row, 3, str(mood['source_name']))
                sheet.write(row, 4, str(mood['cmtnum']))
                row += 1
        os.makedirs(out_dir, exist_ok=True)
        try:
            wb.save(out_dir + friend + '.xls')
        except Exception as exc:
            # Best effort: keep exporting the other friends, but say what failed.
            print("error", friend, exc)
4.分析數據
24小時發佈的說說數
大家在中午和晚上發佈的說說比較多,凌晨比較少
說說最多排行top20
說說最少排行top20
果然,悶騷的人發的說說比較多。。。哈哈哈
從2000年到2018年,說說分佈以下
看來我的朋友們年輕的時候蠻悶騷,隨着年紀增大,說說越來越少。
感謝 https://zhuanlan.zhihu.com/p/24656161 給我的提示,讓我少走了許多彎路
數據抓取速度賊快,20分鐘就抓取了我所有好友(282+)的60000+條說說。
項目已傳到
朋友們,覺得有用就來個star噢。。蟹蟹。。。