1. 使用普通方式爬取 HTML
#!/usr/bin/python3
# coding:utf8
"""Sequentially scrape listing titles and links from kan12345.com.

Fetches pages 1-68 of class id=27 one after another and appends one
"title=...,url=..." line per entry to the local file 'ut'.
"""
from bs4 import BeautifulSoup
import requests
import time
from concurrent.futures import ProcessPoolExecutor
from threading import Thread

BASE_URL = 'http://www.kan12345.com/class.asp?id=27&page='

# Kept module-level so get() can reach it, as in the original script;
# explicit encoding so Chinese titles are written consistently everywhere.
w = open('ut', 'w', encoding='utf-8')


def get(url):
    """Fetch one listing page and append each entry's title/url to `w`.

    Prints the url, each title and href to stdout as progress output.
    Entries whose anchor has no plain text are skipped instead of
    crashing (BeautifulSoup's .string is None for nested markup, which
    made the original string concatenation raise TypeError).
    """
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'html.parser')
    print(url)
    # One CSS selector replaces the original triple parse:
    # find_all('div', class_='box') -> re-soup -> 'h4' -> re-soup -> 'a'.
    for n in soup.select('div.box h4 a'):
        title = n.get_text(strip=True)
        href = n.get('href', '')
        print(title)
        print(href)
        if title:  # guard: anchor may contain nested tags and no direct text
            w.write("title=" + title + ",url=" + href + '\n')
    print('\n')


if __name__ == "__main__":
    start = time.time()
    for i in range(1, 69):
        get(BASE_URL + str(i))
    end = time.time()
    w.close()
    print('Cost {} seconds'.format(end - start))
發現用了 22.88 秒,接下來改用多線程方式爬取。
#!/usr/bin/python3
# coding:utf8
"""Scrape listing pages from kan12345.com concurrently with threads.

Spawns one thread per page (1-68); each thread appends
"title=...,url=..." lines for its page to the shared file 'ut'.
"""
from bs4 import BeautifulSoup
import requests
import time
from concurrent.futures import ProcessPoolExecutor
from threading import Thread, Lock

BASE_URL = 'http://www.kan12345.com/class.asp?id=27&page='

# Explicit encoding so Chinese titles are written consistently everywhere.
w = open('ut', 'w', encoding='utf-8')
# All worker threads write to this single handle. The original issued
# several small w.write() calls per entry with no synchronization, so
# lines from different pages could interleave mid-line; serialize the
# writes with a lock instead.
w_lock = Lock()


def get(url):
    """Fetch one listing page and append its title/url lines to `w`.

    Thread-safe: the page's lines are buffered locally and flushed to
    the shared file in a single locked writelines() call. Entries whose
    anchor has no plain text are skipped instead of crashing
    (.string is None for nested markup).
    """
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'html.parser')
    print(url)
    lines = []
    # One CSS selector replaces the original triple parse:
    # find_all('div', class_='box') -> re-soup -> 'h4' -> re-soup -> 'a'.
    for n in soup.select('div.box h4 a'):
        title = n.get_text(strip=True)
        href = n.get('href', '')
        print(title)
        print(href)
        if title:  # guard: anchor may contain nested tags and no direct text
            lines.append("title=" + title + ",url=" + href + '\n')
    with w_lock:
        w.writelines(lines)
    print('\n')


if __name__ == "__main__":
    start = time.time()
    threads = []
    for i in range(1, 69):
        t = Thread(target=get, args=[BASE_URL + str(i)])
        t.start()
        threads.append(t)
    for t in threads:
        t.join()
    end = time.time()
    w.close()
    print('Cost {} seconds'.format(end - start))
只用了 16 秒,使用多線程後速度有很大的提高。