(1):分析網頁html
分析ajax的請求網址,和須要的參數。經過不斷向下拉動滾動條,發現請求的參數中offset一直在變化,因此每次請求經過offset來控制新的ajax請求。
(2)上代碼
a、經過ajax請求獲取頁面數據
def get_page_index(offset, keyword):
    """Fetch one page of the search-feed Ajax endpoint.

    The query parameters were captured from the page's own Ajax request;
    ``offset`` advances as the page is scrolled. Returns the response body
    text, or None on a non-200 status or a request error.
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': '1',
        'from': 'search_tab',
    }
    # Serialize the parameter dict into a URL query string.
    url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
    try:
        resp = requests.get(url, headers=headers)
    except RequestException:
        print('請求索引頁錯誤')
        return None
    return resp.text if resp.status_code == 200 else None
b、分析ajax請求的返回結果,獲取圖片集的url
def parse_page_index(html):
    """Yield the article URL of every gallery in an index-page JSON body.

    Robustness fixes over the original:
    - tolerates a None/empty body (get_page_index returns None on failure;
      the original crashed in json.loads(None));
    - tolerates a present-but-null 'data' value;
    - skips feed entries without an 'article_url' (e.g. ads) — the original
      yielded None for them, which broke the downstream detail fetch.
    """
    if not html:
        return
    data = json.loads(html)  # parse the returned JSON payload
    if data and 'data' in data:
        for item in data.get('data') or []:
            article_url = item.get('article_url') if item else None
            if article_url:
                yield article_url
c、獲得圖集url後獲取圖集的內容
def get_page_detail(url):
    """Fetch a gallery detail page; return its HTML text, or None on failure."""
    try:
        resp = requests.get(url, headers=headers)
    except RequestException:
        print('詳情頁頁錯誤', url)
        return None
    if resp.status_code != 200:
        return None
    return resp.text
d、其餘看完整代碼
完整代碼:
# -*- coding: utf-8 -*-
# @Author : FELIX
# @Date   : 2018/4/4 12:49
#
# Crawler for Toutiao "street snap" image galleries: pages through the
# search-feed Ajax endpoint, downloads every image, and stores gallery
# metadata in MongoDB.
import json
import os
import re
from hashlib import md5
from multiprocessing import Pool
from urllib.parse import urlencode

import pymongo
import requests
from bs4 import BeautifulSoup  # noqa: F401 -- kept from the original file
from requests.exceptions import RequestException

MONGO_URL = 'localhost'
MONGO_DB = 'toutiao'
MONGO_TABLE = 'toutiao'
GROUP_START = 1
GROUP_END = 20
KEYWORD = '街拍'

client = pymongo.MongoClient(MONGO_URL)  # connect to MongoDB
db = client[MONGO_DB]  # database is created lazily on first write

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
}


def get_page_index(offset, keyword):
    """Fetch one page of the search-feed Ajax endpoint.

    The query parameters were captured from the page's own Ajax request;
    ``offset`` advances by 20 per page. Returns the body text, or None on
    a non-200 response or request error.
    """
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': '1',
        'from': 'search_tab',
    }
    # Serialize the dict into a URL query string.
    url = 'https://www.toutiao.com/search_content/?' + urlencode(data)
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('請求索引頁錯誤')
        return None


def parse_page_index(html):
    """Yield the article URL of every gallery in an index-page JSON body.

    Tolerates a None/empty body (get_page_index returns None on failure;
    the original crashed in json.loads) and skips feed entries without an
    'article_url' (e.g. ads) -- the original yielded None for those, which
    broke the downstream detail fetch.
    """
    if not html:
        return
    data = json.loads(html)
    if data and 'data' in data:
        for item in data.get('data') or []:
            article_url = item.get('article_url') if item else None
            if article_url:
                yield article_url


def get_page_detail(url):
    """Fetch a gallery detail page; return its HTML text, or None on failure."""
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('詳情頁頁錯誤', url)
        return None


def parse_page_detail(html, url):
    """Extract the title and image URLs from a detail page's inline JS blob.

    Downloads every matched image as a side effect. Returns a dict with
    'title', 'url' and 'images', or None when the page doesn't match.
    """
    images_pattern = re.compile('articleInfo:.*?title: \'(.*?)\'.*?content.*?\'(.*?)\'', re.S)
    result = re.search(images_pattern, html)
    if not result:
        return None
    title = result.group(1)
    url_pattern = re.compile('"(http:.*?)"')
    img_url = re.findall(url_pattern, str(result.group(2)))
    if not img_url:
        return None
    for img in img_url:
        download_img(img)  # download as we go
    return {
        'title': title,
        'url': url,
        'images': img_url,
    }


def save_to_mongo(result):
    """Insert one result dict into MongoDB; return True on success."""
    if result:
        # insert() was removed in PyMongo 4.x; insert_one is the supported call.
        if db[MONGO_TABLE].insert_one(result).acknowledged:
            print('存儲成功', result)
            return True
    return False


def download_img(url):
    """Download a single image and hand its bytes to save_img."""
    print('正在下載', url)
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            save_img(response.content)
        else:
            return None
    except RequestException:
        print('下載圖片錯誤', url)
        return None


def save_img(content):
    """Write image bytes under ./imgs, named by MD5 so duplicates collapse."""
    # The original crashed with FileNotFoundError when ./imgs was absent.
    os.makedirs(os.path.join(os.getcwd(), 'imgs'), exist_ok=True)
    file_path = '{}/imgs/{}.{}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg')
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)


def main(offset):
    """Process one index page: fetch, parse, download and persist every gallery."""
    html = get_page_index(offset, KEYWORD)
    if not html:  # index request failed -- original crashed in json.loads(None)
        return
    for url in parse_page_index(html):
        detail = get_page_detail(url)
        print(url, '++++++++++++++++++++++++++++++++++++++++++++++++')
        print(detail)
        if detail:
            save_to_mongo(parse_page_detail(detail, url))


if __name__ == '__main__':
    groups = [i * 20 for i in range(GROUP_START, GROUP_END + 1)]
    pool = Pool()
    pool.map(main, groups)
    pool.close()  # the original leaked the worker pool
    pool.join()