實戰1:創建代理IP池

1、爬取免費代理IP(Python)

一、爬取代理IP:redis

#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author:Meng Zhaoce
import requests
from bs4 import BeautifulSoup
from multiprocessing.dummy import Pool as ThreadPool #多線程模塊
from pymongo import MongoClient
data = []  # shared result list; every getIp() call appends its records here


def getIp(page):
    """Scrape one proxy listing page and append its entries to ``data``.

    Each table row that has at least three ``<td>`` cells contributes one
    record of the form ``{'ip': '<cell 1>:<cell 2>', 'verify': False}``.
    Rows with fewer ``<td>`` cells are skipped.

    Args:
        page: Page number substituted into the listing URL.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
    """
    url = 'https://www.xicidaili.com/nt/%d' % page
    headers = {
        # Spoofed browser User-Agent sent with the request.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
    }
    # A timeout keeps one stalled connection from blocking a worker forever.
    res = requests.get(url, headers=headers, timeout=10).text
    soup = BeautifulSoup(res, 'lxml')
    for row in soup.find_all('tr'):
        cells = row.find_all('td')
        # Explicit length check instead of a bare ``except`` around indexing,
        # so unrelated errors are no longer silently swallowed.
        if len(cells) < 3:
            continue
        data.append({
            'ip': '%s:%s' % (cells[1].get_text(), cells[2].get_text()),
            'verify': False,
        })

# Fetch the listing pages concurrently with 10 worker threads.
# NOTE(review): range(100) starts at page 0 -- confirm the site does not
# number its pages from 1.
pool = ThreadPool(10)
pool.map(getIp, range(100))
pool.close()
pool.join()
print(data)
print(len(data))

db = MongoClient('127.0.0.1', 27017).test
# insert_many() raises on an empty document list, so only write when the
# scrape actually produced records.
if data:
    db.ippool.insert_many(data)

此處涉及知識點:請求庫、解析庫、多線程模塊、非關係型數據庫

2、創建代理IP池(多進程)

 

#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author:Meng Zhaoce
import multiprocessing #引入多線程模塊
import time
import requests
from pymongo import MongoClient
import redis
# Open the ``test`` database: the scraping script stores its records in
# ``test.ippool``, so reading from any other database yields no candidates.
db = MongoClient('127.0.0.1', 27017).test
url = 'http://www.baidu.com'  # page fetched through each proxy to test it
# Addresses ('host:port' strings) of every proxy not yet verified.
ippool = [doc['ip'] for doc in db.ippool.find({'verify': False})]
start = time.time()  # reference point for the elapsed-time report
def verify(ip):
    """Test one proxy and record it as verified if it works.

    Requests the module-level ``url`` through the proxy with a 2-second
    timeout. On an HTTP 200 response a new document
    ``{'ip': ip, 'verify': True}`` is inserted into ``db.ippool``.
    Any exception is printed and otherwise ignored, so a bad proxy never
    aborts the whole run.

    Args:
        ip: Proxy address as a ``'host:port'`` string.
    """
    proxies = {
        'http': 'http://%s' % ip,
    }
    try:
        res = requests.get(url, proxies=proxies, timeout=2)
        print(res.status_code)
        if res.status_code == 200:
            # Collection.insert() is deprecated in PyMongo 3 and removed in
            # PyMongo 4; insert_one() is the supported single-document call.
            db.ippool.insert_one({'ip': ip, 'verify': True})
            print('insert finished'.center(50, '*'))
    except Exception as e:
        print(e)

# Guard the pool creation: under the "spawn" start method each worker
# re-imports this module, and an unguarded Pool() would then be created
# again inside every child.
if __name__ == '__main__':
    pool = multiprocessing.Pool(processes=10)
    try:
        # Only the first 100 candidates are checked per run.
        pool.map(verify, ippool[:100])
    finally:
        # Release the worker processes even if map() raised.
        pool.close()
        pool.join()
    print(time.time() - start)
    print('finshed')
相關文章
相關標籤/搜索