# -*- coding: utf-8 -*-
# @time : 2019/7/1 14:56
import requests
import random
from multiprocessing import Process, Pool
import pymysql

'''
Crawl Baidu Baike by enumerating "https://baike.baidu.com/view/" + <number>
across multiple worker processes, storing each page's HTML in MySQL.
'''

# --- MySQL connection settings (fill in before running) ---
mysql_ip = ''
mysql_port = 3306  # was left blank (a SyntaxError as written); 3306 is the MySQL default
mysql_user = ''
mysql_passwd = ''
msyql_db = ''

# Number of worker processes in the pool.
process_num = 5

baseUrl = 'https://baike.baidu.com/view/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36',
    'Referer': 'https://www.baidu.com/',
    'Accept-Encoding': 'gzip, deflate, br'
}
# Rotating proxy pool (host:port).
ip_pool = [
    '119.98.44.192:8118',
    '111.198.219.151:8118',
    '101.86.86.101:8118',
]


def ip_proxy():
    """Pick a random proxy from ip_pool and return a requests ``proxies`` dict."""
    ip = random.choice(ip_pool)  # was randrange(0, 3): broke whenever the pool size changed
    # Map BOTH schemes. The original put an 'https://'-schemed URL under the
    # 'http' key only, so requests to the https:// target bypassed the proxy.
    proxy_url = 'http://' + ip
    return {'http': proxy_url, 'https': proxy_url}


def sprider(start_index, end_index):
    """Crawl pages [start_index, end_index) and insert each into MySQL.

    Runs inside a worker process, so it opens its OWN DB connection, cursor
    and failure-log file: the original created one module-level pymysql
    connection before the fork, and concurrent use of a single MySQL socket
    from several processes corrupts the wire protocol (writes from the one
    shared log file handle interleave the same way).

    Failed page ids are appended to filedItemUrl.txt for a later retry.
    """
    connection = pymysql.connect(host=mysql_ip, port=mysql_port, user=mysql_user,
                                 passwd=mysql_passwd, db=msyql_db)
    cursor = connection.cursor()
    filedWriter = open("filedItemUrl.txt", "a+", encoding="utf8")
    try:
        for i in range(start_index, end_index):
            try:
                response = requests.get(baseUrl + str(i), proxies=ip_proxy(),
                                        headers=headers, timeout=1)
                # Baike redirects missing ids to an error page; skip those.
                if 'error' in response.url:
                    continue
                page_id = i  # don't shadow the builtin `id`
                url = requests.utils.unquote(response.url)
                # Declare the page's real encoding instead of the original
                # .encode('ISO-8859-1').decode('utf8') round-trip, which
                # raised UnicodeDecodeError on any non-UTF-8 byte sequence.
                response.encoding = 'utf-8'
                url_content = response.text
                sql = 'insert into baikebaiku (id,url,html_content) values(%s,%s,%s)'
                cursor.execute(sql, (page_id, url, url_content))
                connection.commit()
                print("第" + str(i) + "個,添加數據庫成功")
            except Exception as e:
                # Best-effort crawler: log the failed id and keep going.
                filedWriter.write(str(i) + '\n')
                filedWriter.flush()
                print(e.args)
    finally:
        # Original leaked all three resources; release them per worker.
        filedWriter.close()
        cursor.close()
        connection.close()


if __name__ == '__main__':

    pool = Pool(processes=process_num)

    # Split the 20M-id space into contiguous, non-overlapping chunks.
    one_process_task_num = 20000000 // process_num

    for i in range(process_num):
        pool.apply_async(sprider, args=[one_process_task_num * i, one_process_task_num * (i + 1)])

    pool.close()
    pool.join()