1. 抓取某個公衆號全部的文章信息
工具:Charles + 電腦版微信 + PyCharm + Python
2. 分析微信
通過分析:每一個公衆號文章列表頁的鏈接都是以 https://mp.weixin.qq.com/mp/profile_ext 開頭;抓取時,不同公衆號之間只有幾個參數不同(__biz、uin、key、pass_ticket)。
抓取:工具
3. 代碼
import csv
import json
import time

import requests

# Endpoint that serves the article-list pages of a WeChat official account.
_PROFILE_URL = 'https://mp.weixin.qq.com/mp/profile_ext'

# User-Agent copied from the Windows WeChat built-in browser.
_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 MicroMessenger/6.5.2.501 NetType/WIFI WindowsWechat QBCore/3.43.901.400 QQBrowser/9.0.2524.400",
}

# Output file; rows are appended so repeated runs accumulate.
_OUTPUT_PATH = 'article.csv'


def _to_https(link):
    """Drop escape backslashes from *link* and upgrade an ``http://`` scheme to ``https://``."""
    link = link.replace("\\", "")
    # Only rewrite the scheme: a blanket replace("http", "https") would turn
    # links that are already https into "httpss://...".
    if link.startswith("http://"):
        link = "https://" + link[len("http://"):]
    return link


def _extract_rows(msg_list):
    """Return ``(title, digest, url, date)`` tuples for the article entries in *msg_list*."""
    rows = []
    for msg in msg_list:
        try:
            info = msg["app_msg_ext_info"]
            title = info["title"]
            digest = info["digest"]
            url = _to_https(info["content_url"])
            date = msg["comm_msg_info"]["datetime"]
        except (KeyError, TypeError, AttributeError):
            # Entries without the expected article fields (e.g. plain-text
            # pushes) are skipped on purpose, as in the original script.
            continue
        rows.append((title, digest, url, date))
    return rows


def parse(__biz, uin, key, pass_ticket, appmsg_token="", offset="0"):
    """Fetch every article-list page of one official account and append it to article.csv.

    Pages are requested ten messages at a time, starting at *offset*, until the
    server reports ``can_msg_continue != 1``. Each article is printed and
    appended to ``article.csv`` as ``title,digest,url,datetime``.

    Args:
        __biz: Account identifier captured from the WeChat client request.
        uin: ``uin`` query parameter captured from the WeChat client request.
        key: ``key`` query parameter captured from the WeChat client request.
        pass_ticket: ``pass_ticket`` query parameter captured from the client.
        appmsg_token: Optional ``appmsg_token`` query parameter.
        offset: Offset of the first message to fetch.

    Raises:
        requests.RequestException: On network failure, timeout or HTTP error status.
        ValueError: If a response is not valid JSON or lacks ``general_msg_list``.
        KeyError: If the server says more pages exist but omits ``next_offset``.
    """
    # Iterate rather than recurse once per page, so long histories cannot
    # exhaust the interpreter's recursion limit.
    while True:
        params = {
            "action": "getmsg",
            "__biz": __biz,
            "f": "json",
            "offset": str(offset),
            "count": "10",
            "is_ok": "1",
            "scene": "124",
            "uin": uin,
            "key": key,
            "pass_ticket": pass_ticket,
            "wxtoken": "",
            "appmsg_token": appmsg_token,
            "x5": "0",
        }
        res = requests.get(_PROFILE_URL, headers=_HEADERS, params=params, timeout=3)
        res.raise_for_status()
        data = json.loads(res.text)
        print(data)

        raw_list = data.get("general_msg_list")
        if raw_list is None:
            # The payload was printed above; fail with a clear message instead
            # of an opaque TypeError further down.
            raise ValueError("response has no 'general_msg_list' field")
        # The field is itself a JSON document encoded as a string. Parse it as
        # JSON -- never eval() text received from the network.
        msg_list = json.loads(raw_list).get("list", [])

        rows = _extract_rows(msg_list)
        for row in rows:
            print(*row)
        if rows:
            # csv.writer quotes fields containing commas/quotes/newlines, and
            # explicit UTF-8 avoids encode errors under a non-Unicode locale.
            with open(_OUTPUT_PATH, 'a', newline='', encoding='utf-8') as f:
                csv.writer(f, lineterminator='\n').writerows(rows)

        # can_msg_continue: 1 - more pages available, 0 - reached the end.
        if data.get("can_msg_continue", 0) != 1:
            print("爬取完畢")
            return
        time.sleep(3)  # pause between pages, as the original script did
        offset = data["next_offset"]


if __name__ == '__main__':
    # Request parameters captured with a packet-capture tool.
    __biz = input('biz: ')
    uin = input('uin: ')
    key = input('key: ')
    pass_ticket = input('passtick: ')
    # Crawl the account from the first page.
    parse(__biz, uin, key, pass_ticket, appmsg_token="", offset="0")
通過抓包獲取到不同公衆號的 __biz、uin、key、pass_ticket 參數,就能夠完成對該公衆號的抓取。
注意:程序運行前,用抓包工具獲取需要的參數後,需要先將 Charles 或 Fiddler 抓包工具關閉,然後再運行該程序,否則會報錯。(這個程序需要在電腦端微信登錄後進行抓取;如果使用的是模擬器,模擬器上的 uin 和 key 參數和電腦端不一樣,請求一般不能成功!)