1.抓取索引頁內容
利用requests請求目標站點,獲得索引網頁HTML代碼,返回結果。
from urllib.parse import urlencode
from requests.exceptions import RequestException
import requests


def get_page_index(offset, keyword):
    """Fetch one page of Toutiao search-index JSON.

    :param offset: pagination offset (multiples of 20).
    :param keyword: search keyword.
    :return: response body text on HTTP 200, otherwise None.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
    }
    data = {
        'format': 'json',
        'offset': offset,
        'keyword': keyword,
        'autoload': 'true',
        'count': 20,
        'cur_tab': 1,
        'from': 'search_tab',
        'pd': 'synthesis',
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(data)
    # BUG FIX: the original issued requests.get() *before* entering the try
    # block, so a RequestException (timeout, DNS failure, connection reset)
    # was never caught and the handler below was dead code.
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('請求索引頁失敗')
        return None


def main():
    html = get_page_index(0, '街拍')
    print(html)


if __name__ == '__main__':
    main()
2.抓取詳情頁內容
解析返回結果,獲得詳情頁的連接,並進一步抓取詳情頁的信息。
def parse_page_index(html):
    """Yield the article URL of every entry in the index-page JSON.

    Robustness fix: the original crashed with TypeError / JSONDecodeError
    when the upstream fetch failed and handed in None or non-JSON text;
    now such input simply yields nothing.

    :param html: JSON text of the search-index response (or None).
    """
    if not html:
        return
    try:
        data = json.loads(html)
    except json.JSONDecodeError:
        return
    if data and 'data' in data:
        for item in data['data']:
            # entries without 'article_url' yield None, as before
            yield item.get('article_url')
def get_page_detail(url):
    """Download one detail page and return its HTML text, or None on
    a non-200 status or a failed request."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
    }
    try:
        resp = requests.get(url, headers=headers)
    except RequestException:
        print('請求詳情頁頁失敗')
        return None
    # status check kept outside the try: it cannot raise RequestException
    return resp.text if resp.status_code == 200 else None
def parse_page_detail(html, url):
    """Extract the page title and gallery image URLs from a detail page.

    Side effect: every found image is downloaded via download_image().

    :param html: detail-page HTML text.
    :param url: the page's own URL, stored in the result record.
    :return: dict with 'title', 'images', 'url' when a gallery is present,
             otherwise None (implicit).
    """
    soup = BeautifulSoup(html, 'lxml')
    title = soup.select('title')[0].get_text()
    # FIX: use a raw string and escape the '.' — the original pattern was a
    # plain string containing '\(' (an invalid escape sequence, a warning on
    # modern Python) and 'JSON.parse' with an unescaped wildcard dot.
    images_pattern = re.compile(r'gallery: JSON\.parse\((.*?)\)', re.S)
    result = re.search(images_pattern, html)
    if result:
        # The captured group is itself a JSON-encoded *string*, so it must be
        # decoded twice: once to the inner string, then to the gallery dict.
        data = json.loads(result.group(1))
        data = json.loads(data)
        if data and 'sub_images' in data.keys():
            sub_images = data.get('sub_images')
            images = [item.get('url') for item in sub_images]
            for image in images:
                download_image(image)
            return {
                'title': title,
                'images': images,
                'url': url,
            }
3.下載圖片與保存數據庫
將圖片下載到本地,並把頁面信息及圖片URL保存到MongoDB。
# Persist one result record to the configured MongoDB collection.
def save_to_mongo(result):
    """Insert the result dict; return True on success, False otherwise.

    FIX: Collection.insert() was deprecated and removed in pymongo 4 —
    insert_one() is the supported replacement.
    """
    if db[MONGO_TABLE].insert_one(result):
        print('存儲到MongoDb成功', result)
        return True
    return False


# Download a single image and hand its bytes to save_image().
def download_image(url):
    print('正在下載', url)
    headers = {
        # FIX: the UA string contained a stray space ("537. 36") that the
        # other requests in this script do not have.
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            save_image(response.content)
        return None
    except RequestException:
        print('請求圖片失敗', url)
        return None


def save_image(content):
    """Write image bytes to <cwd>/<md5-of-content>.jpg.

    The md5 of the content doubles as a de-duplicating filename: an
    existing file is left untouched.
    """
    file_path = '{0}/{1}.{2}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg')
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)
4.開啓循環及多線程
對多頁內容遍歷,開啓多線程提升抓取速度。
# Pagination offsets for pages GROUP_START..GROUP_END (20 results per page),
# fanned out over a process pool so pages are crawled in parallel.
groups = list(range(GROUP_START * 20, (GROUP_END + 1) * 20, 20))
pool = Pool()
pool.map(main, groups)
from urllib.parse import urlencode
from requests.exceptions import RequestException
from bs4 import BeautifulSoup
from hashlib import md5
from multiprocessing import Pool
from config import *
import pymongo
import requests
import json
import re
import os

client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

# One shared header set: the original repeated this literal in three places,
# and one copy carried a typo ("537. 36" with a stray space) — now fixed.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
}


def get_page_index(offset, keyword):
    """Fetch one page of search-index JSON; return body text or None."""
    data = {
        'format': 'json',
        'offset': offset,
        'keyword': keyword,
        'autoload': 'true',
        'count': 20,
        'cur_tab': 1,
        'from': 'search_tab',
        'pd': 'synthesis',
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(data)
    try:
        response = requests.get(url, headers=HEADERS)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('請求索引頁失敗')
        return None


def parse_page_index(html):
    """Yield article URLs from the index JSON; tolerate None/bad input.

    FIX: the original crashed when get_page_index() returned None or
    non-JSON text.
    """
    if not html:
        return
    try:
        data = json.loads(html)
    except json.JSONDecodeError:
        return
    if data and 'data' in data:
        for item in data['data']:
            yield item.get('article_url')


def get_page_detail(url):
    """Download one detail page; return its HTML text or None."""
    try:
        response = requests.get(url, headers=HEADERS)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('請求詳情頁頁失敗')
        return None


def parse_page_detail(html, url):
    """Extract title + gallery image URLs from a detail page.

    Downloads every image as a side effect; returns a record dict when a
    gallery is present, otherwise None (implicit).
    """
    soup = BeautifulSoup(html, 'lxml')
    title = soup.select('title')[0].get_text()
    # FIX: raw string with escaped dot — the original was a plain string
    # containing the invalid escape '\(' and an unescaped 'JSON.parse' dot.
    images_pattern = re.compile(r'gallery: JSON\.parse\((.*?)\)', re.S)
    result = re.search(images_pattern, html)
    if result:
        # The captured group is a JSON-encoded string: decode twice to
        # reach the gallery dict.
        data = json.loads(json.loads(result.group(1)))
        if data and 'sub_images' in data:
            images = [item.get('url') for item in data['sub_images']]
            for image in images:
                download_image(image)
            return {'title': title, 'images': images, 'url': url}


def save_to_mongo(result):
    """Insert one record; True on success.

    FIX: insert_one() replaces Collection.insert(), which was deprecated
    and removed in pymongo 4.
    """
    if db[MONGO_TABLE].insert_one(result):
        print('存儲到MongoDb成功', result)
        return True
    return False


def download_image(url):
    """Download one image and hand its bytes to save_image()."""
    print('正在下載', url)
    try:
        response = requests.get(url, headers=HEADERS)
        if response.status_code == 200:
            save_image(response.content)
        return None
    except RequestException:
        print('請求圖片失敗', url)
        return None


def save_image(content):
    """Write image bytes to <cwd>/<md5>.jpg; the hash de-duplicates files."""
    file_path = '{0}/{1}.{2}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg')
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)


def main(offset):
    """Crawl one index page: fetch, parse, store every article found."""
    html = get_page_index(offset, KEYWORD)
    for url in parse_page_index(html):
        detail_html = get_page_detail(url)
        if detail_html:
            result = parse_page_detail(detail_html, url)
            if isinstance(result, dict):
                save_to_mongo(result)


if __name__ == '__main__':
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    pool = Pool()
    pool.map(main, groups)
config.py
# Spider configuration (config.py).
# FIX: removed a stray trailing '~' (a vi end-of-buffer marker pasted into
# the file) that made this module a SyntaxError on import.

# MongoDB connection settings.
MONGO_URL = 'localhost'
MONGO_DB = 'toutiao'
MONGO_TABLE = 'jiepai'

# Pagination range: pages GROUP_START..GROUP_END, request offset = page * 20.
GROUP_START = 1
GROUP_END = 20

# Search keyword ("street snap").
KEYWORD = '街拍'