1. 使用普通方式爬取 HTML
#!/usr/bin/python3
# coding:utf8
"""Sequentially scrape listing titles and links from kan12345.com.

Fetches pages 1-68 of class id=27 one after another and appends one
"title=...,url=..." line per entry to the local file 'ut'.
"""
from bs4 import BeautifulSoup
import requests
import time
from concurrent.futures import ProcessPoolExecutor
from threading import Thread

BASE_URL = 'http://www.kan12345.com/class.asp?id=27&page='

# Kept module-level so get() can reach it, as in the original script;
# explicit encoding so Chinese titles are written consistently everywhere.
w = open('ut', 'w', encoding='utf-8')


def get(url):
    """Fetch one listing page and append each entry's title/url to `w`.

    Prints the url, each title and href to stdout as progress output.
    Entries whose anchor has no plain text are skipped instead of
    crashing (BeautifulSoup's .string is None for nested markup, which
    made the original string concatenation raise TypeError).
    """
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'html.parser')
    print(url)
    # One CSS selector replaces the original triple parse:
    # find_all('div', class_='box') -> re-soup -> 'h4' -> re-soup -> 'a'.
    for n in soup.select('div.box h4 a'):
        title = n.get_text(strip=True)
        href = n.get('href', '')
        print(title)
        print(href)
        if title:  # guard: anchor may contain nested tags and no direct text
            w.write("title=" + title + ",url=" + href + '\n')
    print('\n')


if __name__ == "__main__":
    start = time.time()
    for i in range(1, 69):
        get(BASE_URL + str(i))
    end = time.time()
    w.close()
    print('Cost {} seconds'.format(end - start))
發現用了 22.88 秒,接下來改用多線程方式爬取。
#!/usr/bin/python3
# coding:utf8
"""Scrape listing pages from kan12345.com concurrently with threads.

Spawns one thread per page (1-68); each thread appends
"title=...,url=..." lines for its page to the shared file 'ut'.
"""
from bs4 import BeautifulSoup
import requests
import time
from concurrent.futures import ProcessPoolExecutor
from threading import Thread, Lock

BASE_URL = 'http://www.kan12345.com/class.asp?id=27&page='

# Explicit encoding so Chinese titles are written consistently everywhere.
w = open('ut', 'w', encoding='utf-8')
# All worker threads write to this single handle. The original issued
# several small w.write() calls per entry with no synchronization, so
# lines from different pages could interleave mid-line; serialize the
# writes with a lock instead.
w_lock = Lock()


def get(url):
    """Fetch one listing page and append its title/url lines to `w`.

    Thread-safe: the page's lines are buffered locally and flushed to
    the shared file in a single locked writelines() call. Entries whose
    anchor has no plain text are skipped instead of crashing
    (.string is None for nested markup).
    """
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'html.parser')
    print(url)
    lines = []
    # One CSS selector replaces the original triple parse:
    # find_all('div', class_='box') -> re-soup -> 'h4' -> re-soup -> 'a'.
    for n in soup.select('div.box h4 a'):
        title = n.get_text(strip=True)
        href = n.get('href', '')
        print(title)
        print(href)
        if title:  # guard: anchor may contain nested tags and no direct text
            lines.append("title=" + title + ",url=" + href + '\n')
    with w_lock:
        w.writelines(lines)
    print('\n')


if __name__ == "__main__":
    start = time.time()
    threads = []
    for i in range(1, 69):
        t = Thread(target=get, args=[BASE_URL + str(i)])
        t.start()
        threads.append(t)
    for t in threads:
        t.join()
    end = time.time()
    w.close()
    print('Cost {} seconds'.format(end - start))
只用了 16 秒,使用多線程後速度有很大的提高。