源碼:html
import re

import requests

from my_mysql import MysqlConnect

# Endpoint listing one member's answers; ``{}`` receives the result offset.
_ANSWERS_URL = (
    'https://www.zhihu.com/api/v4/members/chen-lu-ya-26/answers'
    '?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info'
    '%2Cis_collapsed%2Cannotation_action%2Cannotation_detail'
    '%2Ccollapse_reason%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count'
    '%2Ccan_comment%2Ccontent%2Cvoteup_count%2Creshipment_settings'
    '%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time'
    '%2Creview_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized'
    '%2Cvoting%2Cis_author%2Cis_thanked%2Cis_nothelp'
    '%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics'
    '&offset={}&limit=20&sort_by=created'
)

# Matches one opening or closing HTML tag (e.g. ``<b>``, ``</em>``).  Requiring
# a letter after ``<`` keeps a bare "<" in ordinary text from being treated as
# the start of a tag.
_TAG_RE = re.compile(r'</?[A-Za-z][^>]*>')

# Seconds to wait for the server before giving up on a request.
_REQUEST_TIMEOUT = 10


def _strip_tags(text):
    """Return *text* with HTML tags removed and all other characters kept."""
    return _TAG_RE.sub('', text)


# Fetch question/answer information.
def get_contents(page, headers):
    """Fetch one page of answers and return ``(question, excerpt)`` tuples.

    Args:
        page: Offset of the first answer to request; substituted into the
            ``offset`` query parameter of the answers URL.
        headers: HTTP headers passed to ``requests.get``.

    Returns:
        A list of ``(question_title, excerpt)`` tuples, one per entry in the
        response's ``data`` list, with HTML tags stripped from the excerpt.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        KeyError: If the JSON response lacks the expected keys.
    """
    url = _ANSWERS_URL.format(page)
    # A timeout prevents a stalled connection from hanging the script forever.
    req = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
    # Fail here with a clear HTTP error instead of a KeyError further down.
    req.raise_for_status()
    data_list = req.json()['data']

    contents = []
    for item in data_list:
        question = item['question']['title']
        # Removing tags in a single pass keeps the text between tag pairs and
        # cannot fail on an excerpt that has "<" without a closing ">".
        excerpt = _strip_tags(item['excerpt'])
        contents.append((question, excerpt))
    return contents


if __name__ == '__main__':
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    # NOTE(review): connection settings are hard-coded; consider loading them
    # from configuration or the environment.
    mc = MysqlConnect('127.0.0.1', 'root', '123456', 'homework')
    # Parameterized statement, built once and reused for every row.
    sql = 'insert into zhihu values(null,%s,%s)'
    # Offsets 0, 20, ..., 140: eight pages of 20 answers each.
    for page in range(0, 20 * 8, 20):
        for content in get_contents(page, headers):
            mc.exec_data(sql, content)
            print(content)