最近開始學習爬蟲,因而找了幾個簡單的小項目練習。
本篇文章主要是對今日頭條圖集的爬取。
參考資料:http://blog.csdn.net/gx864102252/article/details/73479803
因爲頭條源碼進行了改動,致使原教程的沒法正常爬取,在此進行了改進,使爬蟲能夠正常進行。
下面是源代碼:
import json
import os
import re
from hashlib import md5
from multiprocessing import Pool
from urllib.parse import urlencode

import requests
from bs4 import BeautifulSoup


def get_one_html(kd, offset):
    """Fetch one page of Toutiao gallery search results (JSON) as text.

    kd:     search keyword.
    offset: pagination offset (multiples of 20).
    Returns the response body, or None on any request error.
    """
    # Query-string parameters (the original code misleadingly named this
    # dict `headers` and even urlencoded the User-Agent into the URL).
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': kd,
        'cur_tab': 3,
        'autoload': 'true',
    }
    # urlencode turns the dict into a URL query string.
    url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
    try:
        # Send the User-Agent as a real HTTP header.
        r = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'},
                         timeout=30)
        r.encoding = 'utf-8'
        r.raise_for_status()
        return r.text
    except requests.RequestException:
        print('HTML ERROR')
        return None


def get_info(html):
    """Yield the article URL of every gallery in a search-result JSON page."""
    if not html:
        # Upstream fetch failed; nothing to yield.
        return
    data = json.loads(html)
    if data and 'data' in data:
        for item in data['data']:
            # Generator: one gallery URL per item.
            yield item.get('article_url')


def get_detail(url):
    """Download a gallery page; return its HTML text or None on error."""
    try:
        r = requests.get(url, timeout=30)
        r.encoding = 'utf-8'
        r.raise_for_status()
        return r.text
    except requests.RequestException:
        print('HTML_Detail ERROR')
        return None


def parse_detail(html, url):
    """Extract the title and image URLs from a gallery page and download them.

    Returns {'title', 'groups_url', 'images'} or None when the page does not
    embed a ``gallery: JSON.parse(...)`` payload.
    """
    soup = BeautifulSoup(html, 'lxml')
    title = soup.select('title')[0].string
    pat = re.compile(r'gallery: JSON\.parse\((.*?)\)', re.S)
    result = re.search(pat, html)
    if result:
        # The captured group is a JS string literal holding JSON: the first
        # json.loads unquotes the string, the second parses the JSON.
        # (The original used eval() here, which executes arbitrary page
        # content — a code-injection hazard.)
        data = json.loads(json.loads(result.group(1)))
        if data and 'sub_images' in data:
            sub_images = data['sub_images']
            # Strip the backslash escaping from each image URL.
            images = [item.get('url').replace('\\', '') for item in sub_images]
            for image in images:
                download_images(image)
            return {
                'title': title,
                'groups_url': url,
                'images': images,
            }


def download_images(url):
    """Download a single image and hand the raw bytes to save_images()."""
    print("Downloading: ", url)
    try:
        r = requests.get(url, timeout=30)
        if r.status_code == 200:
            save_images(r.content)
        else:
            print("Dont Get images")
            print(r.status_code)
    except requests.RequestException:
        # The request itself failed, so no response object exists here
        # (the original printed r.status_code, which raised NameError).
        print("Dont Get images")


def save_images(content):
    """Write image bytes to DIR/<md5>.jpg, skipping duplicates.

    The md5 of the content doubles as a de-duplicating file name.
    Reads the module-level DIR set in the __main__ section.
    """
    file_path = '{0}/{1}.{2}'.format(DIR, md5(content).hexdigest(), 'jpg')
    if not os.path.exists(file_path):
        # `with` closes the file automatically; no explicit close needed.
        with open(file_path, 'wb') as f:
            f.write(content)


def main(offset):
    """Crawl one result page at *offset*: fetch it and parse every gallery.

    Reads the module-level KEYWORD set in the __main__ section.
    """
    html = get_one_html(KEYWORD, offset)
    for url in get_info(html):
        detail = get_detail(url)
        if detail:
            parse_detail(detail, url)


if __name__ == "__main__":
    # Two module-level globals read by the worker functions.
    # NOTE(review): on Windows, Pool workers re-import the module and will
    # not see these — this layout assumes fork-based multiprocessing.
    KEYWORD = input("請輸入搜索關鍵詞:")
    DIR = os.getcwd() + '/' + KEYWORD
    # exist_ok avoids crashing when the directory already exists
    # (the original os.mkdir raised FileExistsError on re-runs).
    os.makedirs(DIR, exist_ok=True)
    # Pagination bounds: offsets 0, 20, ..., 100.
    START = 0
    END = 5
    offsets = [i * 20 for i in range(START, END + 1)]
    # Fan the page offsets out across a process pool.
    pool = Pool()
    pool.map(main, offsets)
說明:加入全局變量DIR是爲了在爬取以前生成以關鍵詞命名的目錄,便於分類。