百度百科詞條採集

 以 https://baike.baidu.com/view/? 方式儘量遍歷百科詞條
 1 # -*- coding: utf-8 -*-
 2 # @time : 2019/7/1  14:56
 3 import requests
 4 import random
 5 from multiprocessing import Process, Pool
 6 import pymysql
 7 
 8 '''
 9 經過組裝「https://baike.baidu.com/view/」+數字的方式進行多進程遍歷。
10 '''
11 
12 mysql_ip = ''
13 mysql_port =
14 mysql_user = ''
15 mysql_passwd = ''
16 msyql_db = ''
17 
18 process_num = 5
19 
20 baseUrl = 'https://baike.baidu.com/view/'
21 headers = {
22     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36',
23     'Referer': 'https://www.baidu.com/',
24     'Accept-Encoding': 'gzip, deflate, br'
25 }
26 ip_pool = [
27     '119.98.44.192:8118',
28     '111.198.219.151:8118',
29     '101.86.86.101:8118',
30 ]
31 
# Module-level MySQL connection and cursor, created as soon as the module is
# loaded. `password` / `database` are PyMySQL's current keyword names; the
# `passwd` / `db` spellings used previously are deprecated aliases.
connection = pymysql.connect(host=mysql_ip, port=mysql_port, user=mysql_user, password=mysql_passwd, database=msyql_db)
cursor = connection.cursor()
# Append-mode text file that collects, one per line, the ids whose processing
# raised an exception, so they can be retried later.
filedWriter = open("filedItemUrl.txt", "a+", encoding="utf8")
35 
36 
def ip_proxy(pool=None):
    """Pick a random proxy and return it as a ``requests`` proxies mapping.

    Args:
        pool: Optional sequence of ``'host:port'`` strings to choose from.
            When omitted, the module-level ``ip_pool`` is used.

    Returns:
        dict: ``{'http': url, 'https': url}`` where ``url`` is
        ``'http://host:port'`` for the chosen proxy.

    Raises:
        IndexError: If the pool is empty.
    """
    candidates = ip_pool if pool is None else pool
    # random.choice follows the real pool size; the previous hard-coded
    # randrange(0, 3) ignored every entry past the third and raised on
    # pools with fewer than three entries.
    address = random.choice(candidates)
    # The URL scheme here describes how to reach the proxy itself.
    # NOTE(review): plain-HTTP proxies are assumed - confirm that the listed
    # proxies do not expect a TLS connection.
    proxy_url = 'http://' + address
    # requests selects the proxy by the scheme of the *target* URL. The pages
    # fetched by this script are https://, so a mapping that only contains an
    # 'http' key is never applied; register the proxy for both schemes.
    return {'http': proxy_url, 'https': proxy_url}
42 
43 
def sprider(start_index, end_index):
    """Fetch entry pages for ids in ``[start_index, end_index)`` and store them.

    For every id the page at ``baseUrl + str(id)`` is requested through a
    randomly chosen proxy. Responses whose final URL contains ``'error'`` are
    skipped; all others are inserted into the ``baikebaiku`` table as
    ``(id, url, html_content)``. Any exception raised while handling one id is
    caught: the id is appended to the failure log and the loop continues.

    Args:
        start_index: First id to fetch (inclusive).
        end_index: Id at which to stop (exclusive).

    Returns:
        None.
    """
    # Each worker opens its own connection instead of using the module-level
    # one: a database connection must not be used by several processes at
    # once, and forked workers would otherwise all write to the same socket.
    worker_conn = pymysql.connect(host=mysql_ip, port=mysql_port, user=mysql_user,
                                  password=mysql_passwd, database=msyql_db)
    insert_sql = 'insert into baikebaiku (id,url,html_content) values(%s,%s,%s)'
    try:
        worker_cursor = worker_conn.cursor()
        for i in range(start_index, end_index):
            try:
                response = requests.get(baseUrl + str(i), proxies=ip_proxy(), headers=headers, timeout=1)
                if 'error' in response.url:
                    # Redirected to an error page: nothing to store for this id.
                    continue
                url = requests.utils.unquote(response.url)
                # Decode the raw bytes as UTF-8 directly. The previous
                # text.encode('ISO-8859-1').decode('utf8') round-trip only
                # worked when requests had fallen back to ISO-8859-1 and
                # raised for any other detected charset.
                html_content = response.content.decode('utf8')
                worker_cursor.execute(insert_sql, (i, url, html_content))
                worker_conn.commit()
                print("" + str(i) + "個,添加數據庫成功")
            except Exception as e:
                # Deliberate best-effort: record the failed id for a later
                # retry and keep going with the next one.
                filedWriter.write(str(i) + '\n')
                filedWriter.flush()
                print(e.args)
    finally:
        worker_conn.close()
62 
63 
if __name__ == '__main__':
    # Ids 0 .. total_ids - 1 are probed, split into one contiguous slice per
    # worker process.
    total_ids = 20000000
    one_process_task_num = total_ids // process_num

    pool = Pool(processes=process_num)

    pending = []
    for i in range(process_num):
        start = one_process_task_num * i
        # The last worker also takes the remainder, so no ids are dropped
        # when total_ids is not a multiple of process_num.
        end = total_ids if i == process_num - 1 else start + one_process_task_num
        pending.append((start, end, pool.apply_async(sprider, args=[start, end])))

    pool.close()
    pool.join()

    # apply_async stores a worker's exception in its result object; without
    # calling get() a crashed worker would go completely unnoticed.
    for start, end, result in pending:
        try:
            result.get()
        except Exception as e:
            print("worker for ids [%d, %d) failed: %r" % (start, end, e))
相關文章
相關標籤/搜索