For learning and reference only.
```python
# coding=utf-8
import json
import os
import re
from hashlib import md5
from multiprocessing import Pool
from urllib.parse import urlencode

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

ua = UserAgent()
keyword = "街拍"  # search keyword ("street snap")


# Fetch one page of search results and return the article URLs
def get_page(offset):
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': 20,
    }
    base = "https://www.toutiao.com/api/search/content/?"
    url = base + urlencode(params)
    content = get_content(url)
    if content is None:
        return None
    data = json.loads(content)
    if data and "data" in data:
        return [item.get('article_url') for item in data['data']]
    return None


# Append one result record to a file
def write_to_file(content):
    with open("res.txt", "a", encoding="utf-8") as f:
        f.write(content)


# Parse an article page and extract its gallery images
def parse_page_image(url):
    content = get_content(url)
    if content is None:
        return
    # Grab the page title
    soup = BeautifulSoup(content, 'lxml')
    title = soup.select('title')[0].get_text()
    # The gallery data sits inside a JS call: gallery: JSON.parse("...")
    pattern = re.compile(r'gallery: JSON\.parse\("(.*?)"\),', re.S)
    for item in pattern.findall(content):
        # The captured text is an escaped JSON string; one json.loads pass
        # undoes the escaping, a second parses the actual object
        data = json.loads(json.loads('"{}"'.format(item)))
        if data and "sub_images" in data:
            img_urls = [img.get("url") for img in data['sub_images']]
            record = {
                'title': title,
                'imgList': img_urls,
                'url': url,
            }
            write_to_file(json.dumps(record, ensure_ascii=False) + "\n")
            for img_url in img_urls:
                get_img(img_url)


# Save image bytes under img/, named by the MD5 of the content
def save_img(content):
    img_dir = os.path.join(os.getcwd(), "img")
    os.makedirs(img_dir, exist_ok=True)
    path_file = os.path.join(img_dir, "{}.jpg".format(md5(content).hexdigest()))
    print(path_file)
    with open(path_file, "wb") as f:
        f.write(content)


# Download a remote image
def get_img(url):
    try:
        headers = {'User-Agent': ua.chrome}
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            save_img(response.content)
    except requests.RequestException:
        pass


# Fetch a URL and return its text body
def get_content(url):
    try:
        headers = {'User-Agent': ua.chrome}
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except requests.RequestException:
        return None


def main(offset):
    items = get_page(offset)
    if items is not None:
        for item in items:
            parse_page_image(item)


if __name__ == '__main__':
    pool = Pool()
    # Step offsets by the page size (count=20) so pages don't overlap
    pool.map(main, [i * 20 for i in range(10)])
    pool.close()
    pool.join()
```
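The double `json.loads` in `parse_page_image` mirrors what the browser does: the regex captures the escaped string from inside a JS `JSON.parse("...")` call, so one pass undoes the string escaping and a second parses the object. A small illustration with a made-up payload (the `sub_images` shape matches the code above; the URL is invented):

```python
import json

# Made-up payload, shaped like what the regex captures from JSON.parse("...")
raw = '{\\"sub_images\\": [{\\"url\\": \\"http://example.com/a.jpg\\"}]}'

unescaped = json.loads('"{}"'.format(raw))  # first pass: undo the string escaping
data = json.loads(unescaped)                # second pass: parse the actual object
print(data["sub_images"][0]["url"])         # http://example.com/a.jpg
```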
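Before launching the full process pool, it can help to smoke-test a single offset. A minimal sketch, assuming the Toutiao search endpoint and the `gallery: JSON.parse(...)` page layout are still live (both have changed over time, so an empty result does not necessarily mean the code is wrong):

```python
# Hypothetical smoke test: fetch one result page, then parse one article.
if __name__ == '__main__':
    urls = get_page(0)
    print(urls)  # expect a list of article URLs, or None
    if urls:
        parse_page_image(urls[0])  # writes a record to res.txt and images to img/
```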
Source: https://rumenz.com/rumenbiji/python-requests-multiprocessing.html