import json
import re
from multiprocessing import Pool

import requests
from requests.exceptions import RequestException


def get_one_page(url):
    """Fetch one listing page and return its HTML, or None on failure."""
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/54.0.2840.99 Safari/537.36"
    }
    try:
        r = requests.get(url, headers=headers)
        r.encoding = 'utf-8'
        if r.status_code == 200:
            return r.text
        return None
    except RequestException:
        return None


def parse_one_page(html):
    """Yield {'title', 'content'} dicts extracted from the page HTML."""
    # Earlier, stricter pattern kept for reference:
    # pattern = re.compile('<div.*?790.*?<a.*?>(.*?)</a>.*?<div.*?>"(.*?)"</div>', re.S)
    pattern = re.compile('<h3>.*?">(.*?)</a>.*?">(.*?)</div>', re.S)
    items = re.findall(pattern, html)
    for item in items:
        yield {
            'title': item[0],
            # Strip full-width (ideographic) spaces from the body text.
            'content': item[1].replace(u'\u3000', u'')
        }


def write(content):
    """Append one item as a JSON line; ensure_ascii=False keeps Chinese readable."""
    with open('1.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main(p):
    url = "http://www.neihan8.com/article/index_" + str(p) + ".html"
    html = get_one_page(url)
    if html is None:
        # Download failed; skip this page instead of crashing on None.
        return
    for item in parse_one_page(html):
        print(item)
        write(item)


if __name__ == '__main__':
    # Sequential alternative:
    # for p in range(2, 100):
    #     main(p)
    pool = Pool()
    pool.map(main, range(2, 50))
    pool.close()
    pool.join()
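
# A minimal sketch of what parse_one_page yields, assuming the listing pages
# follow the <h3><a>title</a> ... <div>body</div> shape the regex implies.
# The fragment below is illustrative, not captured from the real site:
#
#     sample = ('<h3><a href="/article/1.html">Some title</a></h3>'
#               '<div class="desc">Some\u3000body</div>')
#     list(parse_one_page(sample))
#     # -> [{'title': 'Some title', 'content': 'Somebody'}]
#
# Note the \u3000 ideographic space is removed by the .replace() call.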