參考於崔慶才的Python爬蟲教程,可是崔的視頻時間過長,今日頭條網站反爬蟲也有了變化,所以寫下此文章來記錄本身的爬取過程遇到的問題,也給你們一些借鑑。歡迎你們討論。
1、獲取街索引頁。
咱們會發現doc下服務器給出的response裏面全是些js代碼,沒有咱們想要的二級頁面連接。
而後查看XHR下,preview會發現咱們要的數據全在這裏面,它是以一個json對象的形式存放的,由此咱們知道他是由Ajax渲染的。(通常下滑加載出數據的基本都是ajax動態渲染的)。再看他的請求字符參數、請求頭參數有不少,不過沒什麼問題,直接複製過來便可。
def get_index(offset, keyword):
    """Fetch one page of Toutiao search results as raw JSON text.

    Args:
        offset: paging offset (the API returns 20 results per page).
        keyword: search keyword.

    Returns:
        The response body (JSON text) on HTTP 200, otherwise None.
    """
    para = {
        'aid': '24',
        'app_name': 'web_search',
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'en_qc': '1',
        'cur_tab': '1',
        'from': 'search_tab',
        'pd': 'synthesis',
        # NOTE(review): timestamp and _signature were captured from one
        # browser session and will expire — requests will start failing
        # once they do; they need to be refreshed from a live session.
        'timestamp': '1593665684775',
        '_signature': '7' + 'yD7.AAgEBApd0Jxcjcfwe8huuAALHovmaBcff719uWpg6PhnCCgTgbuUckg1kLI3dQFdQZ1b3VwbtvV9P3ZaGHpjzTDdgJm.hxt6TELcPiJAsAOBYizC - 15.PpPHKolFtN'
    }
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36',
        'cookie': 'tt_webid=6844895200352765453; ttcid=147baf1d0de44bcfaa3aaccb43de319231; csrftoken=e72569f7c754e03b930bbb54f34b7759; WEATHER_CITY=%E5%8C%97%E4%BA%AC; tt_webid=6845096485504583176; s_v_web_id=verify_kc5ojvq4_iZKfsEzB_AubR_44PZ_AgkE_hvFA5OJnJ4nc; SLARDAR_WEB_ID=ba4439ab-f467-42a9-9dcd-8be836757cc3; __tasessionId=mbg1eigaz1593748245904; tt_webid=6845096485504583176; __ac_nonce=05efeab2d00f5555e228f; __ac_signature=_02B4Z6wo00f01JctsQwAAIBDjnNXOcxFq8yXKbWAAHs9kTU3N2TXnkkV75OXvSXoT5pfhWopyS8N2zbrE1VOQlMWB8pFBr0ZvUmL9qtI0GvgW6d9cv9TeAEXgf1HZ3bDCooMMUBc9Rq-r53241; tt_scid=8TNWiPBmsnVRkGGZJ5frIVGKsH4gMngyY0s4KQWdyckfgNuI45yMdiqvIa1sudDF15e8'
    }
    # urlencode turns the parameter dict into a URL query string.
    url = 'https://www.toutiao.com/api/search/content/?' + urlencode(para)
    try:
        # FIX: added a timeout — without one a stalled connection hangs
        # the whole crawler indefinitely.
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            print('索引頁請求成功')
            response.encoding = 'UTF-8'
            return response.text
        return None
    except RequestException:
        print('索引頁請求失敗')
        return None
2、解析索引頁數據。
因爲這個數據是在js中存儲的,通常的爬蟲庫並不適用,因此咱們使用正則表達式的方法將他匹配出來,而後取出各個索引的Url與Title。
def parse_index_json(html):
    """Yield (article_url, title) pairs from an index-page JSON string.

    Items without an 'abstract' key (ads, videos, side cards) are skipped,
    matching the original filtering behaviour.

    Args:
        html: the raw JSON text returned by the search API.

    Yields:
        (article_url, title) tuples; either element may be None when the
        corresponding field is absent from the item.
    """
    data = json.loads(html)  # parse the response body into a dict
    if not data or 'data' not in data:
        return
    for item in data['data']:
        if not item or 'abstract' not in item:
            continue
        # FIX: the original chained item.get('display').get('title').get('text'),
        # which raises AttributeError (NoneType has no .get) whenever an
        # intermediate key is missing; guard each level instead.
        display = item.get('display') or {}
        title = (display.get('title') or {}).get('text')
        yield (item.get('article_url'), title)
3、獲取詳情頁。
這個頁面我花了整整一天時間才獲取到,在網上看別人爬的也都是使用requests,可是我使用requests請求他,無論如何配置headers,就是請求不到數據,請求個幾十次可能纔有一個url能請求到,我感受是他反爬可能檢測到了,因而我使用了selenium(不能使用headless Chrome模式,否則返回的仍是None),去請求數據,果真成功了。(不過這種方法比較慢,不知道有什麼好的方法,你們若是有請分享一下哈)。
def get_detail(url):
    """Render a detail page in a real Chrome browser and return its HTML.

    Plain requests gets blocked by the site's anti-crawler checks on this
    page, so a full (non-headless) browser is driven instead.

    Args:
        url: the article detail-page URL.

    Returns:
        The rendered page source on success, None on any WebDriver failure.
    """
    # Local import keeps this fix self-contained in the function.
    from selenium.common.exceptions import WebDriverException

    driver = webdriver.Chrome()  # launch a browser-driver instance
    try:
        driver.get(url)
        return driver.page_source
    except WebDriverException:
        # FIX: the original caught requests.RequestException, which selenium
        # never raises — driver failures crashed the crawler uncaught.
        print('詳情頁請求失敗')
        return None
    finally:
        # FIX: always shut the browser down; on failure the original
        # leaked a Chrome process per bad URL.
        driver.quit()
4、解析詳情頁數據。
仍是打開doc下的response查看網頁代碼,咱們發現咱們要的數據就在這裏面的js代碼中,雖然進行轉碼了,可是他的後綴跟圖片url的後綴是同樣的,因而一樣對他使用正則表達式取出來轉碼,便可獲得圖片的url。
def parse_detail(html):
    """Extract gallery image URLs from a detail page's rendered HTML.

    The gallery data is embedded as an escaped JSON string inside a
    JSON.parse("...") call in a script tag; pull it out with a regex,
    unescape it, and read the sub_images list.

    Args:
        html: the detail page source (from get_detail).

    Returns:
        A list of image URL strings, or None when the page carries no
        parseable gallery data.
    """
    # Raw string + escaped dot so '.' in JSON.parse is matched literally.
    result = re.findall(r'JSON\.parse\("(.*?)"\),', html, re.S)
    if not result:
        return None
    # The embedded JSON is escaped (\" and \u002F); round-trip through
    # unicode-escape to recover plain JSON text before loading it.
    data = json.loads(result[0].encode('utf-8').decode('unicode-escape'))
    if data and 'sub_images' in data:
        # FIX: this line was corrupted in the write-up by a pasted image
        # link; restored from the complete-code listing below.
        return [item.get('url') for item in data['sub_images']]
    return None
5、獲取圖片頁。
直接使用requests請求便可。
def download_image(img_url):
    """Download one image and hand its raw bytes to save_image.

    Always returns None; failures are logged and swallowed so that one
    bad image does not abort the whole crawl.

    Args:
        img_url: direct URL of the image to fetch.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36',
        # FIX: was 'keep - alive' (with spaces) — an invalid header value.
        'Connection': 'keep-alive',
    }
    try:
        # FIX: added a timeout so a stalled image download cannot hang
        # the crawler.
        response = requests.get(img_url, headers=headers, timeout=10)
        if response.status_code == 200:
            # Images are binary, so use response.content (bytes), not .text.
            save_image(response.content)
        return None
    except RequestException:
        print('圖片請求失敗')
        return None
6、下載圖片,將圖片信息保存至MongoDB,再也不敘述。
如下是完整代碼:
"""Toutiao '街拍' gallery crawler: search API -> detail pages -> images -> MongoDB."""
import os
from hashlib import md5
from urllib.parse import urlencode
import json
import re

import requests
from requests.exceptions import RequestException
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
import pymongo

from config import *

client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]


def get_index(offset, keyword):
    """Fetch one page of Toutiao search results; return JSON text or None."""
    para = {
        'aid': '24',
        'app_name': 'web_search',
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'en_qc': '1',
        'cur_tab': '1',
        'from': 'search_tab',
        'pd': 'synthesis',
        # NOTE(review): captured from one browser session; these expire.
        'timestamp': '1593665684775',
        '_signature': '7' + 'yD7.AAgEBApd0Jxcjcfwe8huuAALHovmaBcff719uWpg6PhnCCgTgbuUckg1kLI3dQFdQZ1b3VwbtvV9P3ZaGHpjzTDdgJm.hxt6TELcPiJAsAOBYizC - 15.PpPHKolFtN'
    }
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36',
        'cookie': 'tt_webid=6844895200352765453; ttcid=147baf1d0de44bcfaa3aaccb43de319231; csrftoken=e72569f7c754e03b930bbb54f34b7759; WEATHER_CITY=%E5%8C%97%E4%BA%AC; tt_webid=6845096485504583176; s_v_web_id=verify_kc5ojvq4_iZKfsEzB_AubR_44PZ_AgkE_hvFA5OJnJ4nc; SLARDAR_WEB_ID=ba4439ab-f467-42a9-9dcd-8be836757cc3; __tasessionId=mbg1eigaz1593748245904; tt_webid=6845096485504583176; __ac_nonce=05efeab2d00f5555e228f; __ac_signature=_02B4Z6wo00f01JctsQwAAIBDjnNXOcxFq8yXKbWAAHs9kTU3N2TXnkkV75OXvSXoT5pfhWopyS8N2zbrE1VOQlMWB8pFBr0ZvUmL9qtI0GvgW6d9cv9TeAEXgf1HZ3bDCooMMUBc9Rq-r53241; tt_scid=8TNWiPBmsnVRkGGZJ5frIVGKsH4gMngyY0s4KQWdyckfgNuI45yMdiqvIa1sudDF15e8'
    }
    # FIX: the listing broke this expression across two lines, leaving
    # '+ urlencode(para)' as a dangling statement; rejoined here.
    url = 'https://www.toutiao.com/api/search/content/?' + urlencode(para)
    try:
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            print('索引頁請求成功')
            response.encoding = 'UTF-8'
            return response.text
        return None
    except RequestException:
        print('索引頁請求失敗')
        return None


def parse_index_json(html):
    """Yield (article_url, title) pairs from an index-page JSON string."""
    data = json.loads(html)
    if not data or 'data' not in data:
        return
    for item in data['data']:
        # Items without an 'abstract' key (ads, videos) are skipped.
        if not item or 'abstract' not in item:
            continue
        # Guard nested lookups: a missing 'display'/'title' key would
        # otherwise raise AttributeError on the chained .get() calls.
        display = item.get('display') or {}
        title = (display.get('title') or {}).get('text')
        yield (item.get('article_url'), title)


def get_detail(url):
    """Render a detail page in a real Chrome browser; return HTML or None."""
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        return driver.page_source
    except WebDriverException:
        # FIX: was except RequestException, which selenium never raises.
        print('詳情頁請求失敗')
        return None
    finally:
        driver.quit()  # always release the browser, even on failure


def parse_detail(html):
    """Extract gallery image URLs embedded in a JSON.parse("...") call."""
    result = re.findall(r'JSON\.parse\("(.*?)"\),', html, re.S)
    if not result:
        return None
    # Unescape \" and \u002F sequences, then load the plain JSON.
    data = json.loads(result[0].encode('utf-8').decode('unicode-escape'))
    if data and 'sub_images' in data:
        return [item.get('url') for item in data['sub_images']]
    return None


def save_to_mongo(result):
    """Insert one result document into MongoDB; return True on success."""
    if db[MONGO_TABLE].insert_one(result):
        print('存儲到MongoDB成功', result)
        return True
    return False


def download_image(img_url):
    """Download one image and pass its bytes to save_image; errors are logged."""
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36',
        # FIX: was 'keep - alive' (with spaces) — an invalid header value.
        'Connection': 'keep-alive',
    }
    try:
        response = requests.get(img_url, headers=headers, timeout=10)
        if response.status_code == 200:
            save_image(response.content)  # images are binary content
        return None
    except RequestException:
        print('圖片請求失敗')
        return None


def save_image(content):
    """Write image bytes to ./toutiaojiepai/<md5>.jpg, deduped by content hash."""
    save_dir = os.path.join(os.getcwd(), 'toutiaojiepai')
    # FIX: the original assumed the (Windows-only, backslash-joined)
    # directory already existed and crashed on open() otherwise.
    os.makedirs(save_dir, exist_ok=True)
    file_path = os.path.join(save_dir, md5(content).hexdigest() + '.jpg')
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)


def main():
    """Crawl one index page: resolve details, store metadata, download images."""
    index_html = get_index(0, '街拍')
    if not index_html:
        return
    for url, title in parse_index_json(index_html):
        if not (url and title):
            continue
        detail_html = get_detail(url)
        # FIX: the original fed a possibly-None detail_html straight into
        # parse_detail, crashing re.findall on a failed fetch.
        if detail_html is None:
            continue
        img_url = parse_detail(detail_html)
        save_to_mongo({'url': url, 'title': title, 'image_url': img_url})
        if img_url:
            for one_url in img_url:
                download_image(one_url)


if __name__ == '__main__':
    main()
config.py爲MongoDB配置文件:
# MongoDB connection settings used by the crawler (imported via `from config import *`).
MONGO_URL = 'localhost'   # MongoDB server host
MONGO_DB = 'toutiao'      # database name
MONGO_TABLE = 'jiepai'    # collection name