python爬蟲學習第六章

時間 2019-11-11

原文鏈接

<!DOCTYPE html>

sixth

javascript

手寫python爬蟲

圖片爬蟲實戰

思路

創建一個爬取圖片的自定義函數，該函數負責爬取一個頁面下我們想爬取的圖片。爬取過程爲：首先通過urllib.request.urlopen(url).read()讀取對應網頁的全部源代碼，然後根據第一個正則表達式進行第一次信息過濾；過濾完成之後，在第一次過濾結果的基礎上，根據第二個正則表達式進行第二次信息過濾，提取出該網頁上全部目標圖片的鏈接，並將這些鏈接地址存儲到一個列表中；隨後遍歷該列表，分別將對應鏈接通過urllib.request.urlretrieve(imageurl,filename=imagename)存儲到本地。爲了避免程序中途異常崩潰，我們可以建立異常處理：若不能爬取某個圖片，則會通過x+=1自動跳到下一個圖片。

通過for循環將該分類下的全部網頁都爬取一遍，鏈接可以構造爲url="http://list.jd.com/list.html?cat=23413143151&page="+str(i)。在for循環裏面，每一次循環對應的i會自動加1，每次循環時通過調用1）中的函數實現該頁圖片的爬取。

   
   
   
   
    
    
    
   
   
   
   


import re


import urllib.request


def craw(url, page, savedir="D:/shoujitupian/img/"):
    """Download every product thumbnail found on one listing page.

    The page is fetched, narrowed to the product-list section with a first
    regular expression, and then scanned for lazy-loaded image URLs with a
    second one.  Each image is saved as ``<page><index>.jpg`` in *savedir*.

    Args:
        url: Address of the listing page to fetch.
        page: Page number; used as the prefix of the saved file names.
        savedir: Directory that receives the images (created on demand).

    Returns:
        None.  A page without a product-list section is skipped silently.
    """
    import os
    import urllib.error

    # Decode to real text.  str(bytes) would yield the "b'...'" repr instead.
    # NOTE(review): assumes the markup of interest survives UTF-8 decoding.
    with urllib.request.urlopen(url) as response:
        html1 = response.read().decode("utf-8", errors="ignore")

    # First pass: isolate the product list.  The end anchor was missing from
    # the listing, which made the lazy ".+?" capture a single character.
    # NOTE(review): anchor restored to the pagination container; confirm it
    # against the live page markup.
    pat1 = r'<div id="plist".+?<div class="page clearfix">'
    sections = re.compile(pat1, re.S).findall(html1)
    if not sections:
        return

    # Second pass: pull the protocol-relative image URLs out of that section.
    pat2 = (r'<img width="220" height="220" data-img="1" '
            r'data-lazy-img="//(.+?\.jpg)">')
    imagelist = re.compile(pat2).findall(sections[0])
    if not imagelist:
        return

    os.makedirs(savedir, exist_ok=True)
    x = 1
    for imageurl in imagelist:
        imagename = os.path.join(savedir, str(page) + str(x) + ".jpg")
        imageurl = "http://" + imageurl
        try:
            urllib.request.urlretrieve(imageurl, filename=imagename)
        except urllib.error.URLError as e:
            # Best effort: report the failure and carry on with the next image.
            print(getattr(e, "code", ""), getattr(e, "reason", ""))
        x += 1

# Crawl listing pages 1-5 of the category, one craw() call per page.
# The page number doubles as the prefix of the saved image names.
for i in range(1, 6):
    url = "https://list.jd.com/list.html?cat=9987,653,655&page=" + str(i)
    craw(url, i)

鏈接爬蟲

1. 確定好要爬取的入口鏈接。
2. 根據需求構建好鏈接提取的正則表達式。
3. 模擬成瀏覽器並爬取對應網頁。
4. 根據2中的正則表達式提取出該網頁中包含的鏈接。
5. 過濾掉重複的鏈接。
6. 後續操作。比如打印這些鏈接到屏幕上等。
author = 'My'html

import re#爬取全部頁面連接


import urllib.request


def getlinks(url):
    """Return the distinct http(s) links found in the page at *url*.

    The request is sent with a browser User-Agent header, and the opener
    carrying that header is installed globally as a side effect.

    Args:
        url: Address of the page to scan.

    Returns:
        A list of ``(link, last_char)`` tuples in arbitrary order.  The
        pattern has two capture groups, so ``re.findall`` yields tuples;
        element 0 is the complete link.
    """
    # Pretend to be a desktop browser.
    headers = ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36')
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    # Make the opener the process-wide default used by urlopen().
    urllib.request.install_opener(opener)

    with urllib.request.urlopen(url) as file:
        # Decode to text; str(bytes) would scan the "b'...'" repr instead.
        data = file.read().decode("utf-8", errors="ignore")

    # A link runs until whitespace, ')', '"' or ';' and must contain a dot.
    # The listing had the quote HTML-escaped and the backslashes stripped.
    pat = r'(https?://[^\s)";]+\.(\w|/)*)'
    link = re.compile(pat).findall(data)
    # A set removes duplicates; order is not preserved.
    return list(set(link))


# Page whose links we want to list.
url = "http://blog.csdn.net/"
# Collect the distinct links contained in that page.
linklist = getlinks(url)
# Each entry is a tuple of regex groups; element 0 is the complete link.
for link in linklist:
    print(link[0])

糗事百科爬蟲實戰

分析各頁間的網址規律，構造網址變量，並能夠經過for循環實現多頁內容的抓取

構建一個自定義函數，專門用來實現抓取某個網頁上的段子，包括兩部分內容，一部分是對應用戶，一部分是用戶發表的段子內容。該函數功能實現的過程爲：首先，模擬成瀏覽器訪問，觀察對應網頁源代碼中的內容，將用戶信息部分與段子內容部分的格式寫成正則表達式。隨後，根據正則表達式分別提取出該頁中全部用戶與全部內容，然後通過for循環遍歷段子內容並將內容分別賦給對應的變量，這裏變量名是有規律的，格式爲「content+順序號」，接下來再通過for循環遍歷對應用戶，並輸出該用戶對應的內容。

經過for循環分別獲取多頁的各頁URL連接，每頁分別調用一次getcontent（url，page）函數。

author = 'My' 


import re 


import urllib.request 


def getcontent(url, page):
    """Print every author and joke found on one listing page.

    The page is fetched with a browser User-Agent header (the opener is
    installed globally as a side effect).  Authors and joke bodies are
    extracted with two regular expressions and printed pairwise.

    Args:
        url: Address of the page to fetch.
        page: Page number; only used in the printed label.

    Returns:
        None.  Output goes to stdout.
    """
    # Pretend to be a desktop browser.
    headers = ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36')
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    urllib.request.install_opener(opener)

    with urllib.request.urlopen(url) as file:
        data = file.read().decode("utf-8")

    # The listing had lost the '*' of both lazy groups ("(.?)").
    userpat = r'target="_blank" title="(.*?)">'
    contentpat = r'<div class="content">(.*?)</div>'
    userlist = re.compile(userpat, re.S).findall(data)
    contentlist = re.compile(contentpat, re.S).findall(data)

    # Strip line breaks and inline markup from every joke body.
    cleaned = []
    for content in contentlist:
        for fragment in ("\n", "<span>", "</span>", "<br/>"):
            content = content.replace(fragment, "")
        cleaned.append(content)

    # Pair users with jokes by position.  A plain list replaces the
    # exec()-generated "contentN" variables, which are not reliable function
    # locals and raised NameError when users outnumbered contents.
    for y, (user, content) in enumerate(zip(userlist, cleaned), start=1):
        print("用戶" + str(page) + str(y) + "是:" + user)
        print("內容是:")
        print(content)
        print("\n")


# Fetch and print listing pages 1-9, one getcontent() call per page.
for i in range(1, 10):
    url = "https://www.qiushibaike.com/8hr/page/" + str(i)
    getcontent(url, i)

微信爬蟲實現

創建3個自定義函數：一個函數實現使用代理服務器爬取指定網址並返回爬取到的數據的功能，一個函數實現獲取多個頁面的全部文章連接的功能，另一個函數實現根據文章連接爬取指定標題和內容並寫入文件的功能。
使用代理服務器爬取指定網址內容的功能在第4章提過；爲避免異常導致程序中斷，需要建立異常處理機制。
要實現獲取多個頁面的全部文章連接，咱們須要對關鍵詞使用urllib.request.quote(key)進行編碼，編碼後構造出對應的文章列表頁網址，並經過for循環依次爬取各頁的文章連接，爬取時，經過調用2中設置的代理服務器實現。
要實現根據文章連接爬取指定標題和內容並寫入對應文件中，可使用for循環依次爬取3中所提供的網址（真實網址),爬取後根據正則表達式提取出咱們關注的內容並寫入對應的文件中。
代碼中如果發生異常，需要進行延時處理，即等待一段時間之後再嘗試下一次操作。要實現延時處理，我們可以導入time模塊，使用time.sleep()實現，比如time.sleep(7)延時7秒。
```
author = 'My'

import re

import urllib.request

import time 

import urllib.error
```

 # Pretend to be a desktop browser for every request made via urlopen().
headers = ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36')
opener = urllib.request.build_opener()
opener.addheaders = [headers]
# Install the opener as the process-wide default.
urllib.request.install_opener(opener)
# listurl collects, for each result page, the list of article links found on it.
listurl = []
 # Fetch a page through an HTTP proxy.
def useproxy(proxyaddr, url):
    """Fetch *url* through the HTTP proxy *proxyaddr* and return its text.

    A new opener routing ``http`` traffic through the proxy is installed
    globally.  It replaces any previously installed opener, including one
    carrying custom headers.

    Args:
        proxyaddr: Proxy address as ``"host:port"``.
        url: Address to fetch.

    Returns:
        The response body decoded as UTF-8, or None when the request failed.
        Failures are printed, and the call then sleeps (10 s after a
        URLError, 1 s after any other exception) before returning.
    """
    try:
        proxy = urllib.request.ProxyHandler({'http': proxyaddr})
        opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
        urllib.request.install_opener(opener)
        data = urllib.request.urlopen(url).read().decode('utf-8')
        return data
    except urllib.error.URLError as e:
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, 'reason'):
            print(e.reason)
        # Back off before the caller tries the next request.
        time.sleep(10)
    except Exception as e:
        print('exception:' + str(e))
        time.sleep(1)
    return None


# The search-page collector in this listing calls the helper as use_proxy.
use_proxy = useproxy
 # Collect article links from the search result pages.
def getlisturl(key, pagestart, pageend, proxy):
    """Gather article links for *key* from a range of search result pages.

    Each page is fetched through the proxy and its links are appended to the
    module-level ``listurl`` as one list per page.

    Args:
        key: Search keyword.
        pagestart: First result page to fetch (inclusive).
        pageend: Last result page to fetch (inclusive).
        proxy: Proxy address passed on to ``useproxy``.

    Returns:
        The module-level ``listurl``, also when an error interrupted the
        loop, so callers always receive a list.
    """
    import urllib.parse

    # NOTE(review): the markup inside this pattern was stripped from the
    # listing; restored as the result-box container. Confirm against a live
    # result page.
    listurlpat = r'<div class="txt-box">.*?(http://.*?)"'
    pattern = re.compile(listurlpat, re.S)
    try:
        for page in range(pagestart, pageend + 1):
            # Build the query with urlencode.  The listing quoted "&page"
            # into "%26page" and appended the bare number, so the page
            # parameter was never sent.
            query = urllib.parse.urlencode({"type": 2, "query": key, "page": page})
            url = "http://weixin.sogou.com/weixin?" + query
            # Fetch through the proxy to avoid IP bans.
            data1 = useproxy(proxy, url)
            print(data1)
            if data1 is None:
                # The fetch failed and was already reported; skip this page.
                continue
            links = pattern.findall(data1)
            print(links)
            listurl.append(links)
        print("獲取到", str(len(listurl)), '頁')
    except urllib.error.URLError as e:
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, 'reason'):
            print(e.reason)
        time.sleep(10)
    except Exception as e:
        print('exception:' + str(e))
        time.sleep(1)
    return listurl
 # Save the title and body of every collected article into one HTML file.
def getcontent(listurl, proxy):
    """Fetch every article in *listurl* and write it to ``D:/111.html``.

    Args:
        listurl: List of per-page lists of article URLs.  A None or empty
            value produces a file containing only the page skeleton.
        proxy: Proxy address passed on to ``useproxy``.

    Returns:
        None.  Errors on individual articles are printed and skipped.
    """
    # NOTE(review): the tags in the skeleton, the paragraph markup and both
    # patterns were stripped from the listing; they are restored here and
    # should be checked against a real article page.
    html1 = '''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title>微信文章頁面</title>
</head>
<body>'''
    html2 = '''</body>
</html>'''
    titlepat = re.compile(r"<title>(.*?)</title>")
    # re.S lets the article body span several lines.
    contentpat = re.compile(r'id="js_content">(.*?)id="js_sg_bar"', re.S)

    print(listurl)
    # One handle for header, articles and footer; "with" closes it even when
    # an unexpected error escapes.
    with open("D:/111.html", "wb") as fh:
        fh.write(html1.encode('utf-8'))
        for i, pagelinks in enumerate(listurl or []):
            for j, url in enumerate(pagelinks):
                try:
                    # Links scraped from HTML carry "&amp;"; turn it into "&".
                    url = url.replace('amp;', "")
                    data = useproxy(proxy, url)
                    if data is None:
                        # Fetch failed and was already reported.
                        continue
                    title = titlepat.findall(data)
                    content = contentpat.findall(data)
                    thistitle = "這次沒有獲取到"
                    thiscontent = "這次沒有獲取到"
                    if title:
                        thistitle = title[0]
                    if content:
                        thiscontent = content[0]
                    dataall = "<p>標題爲:" + thistitle + "</p><p>內容爲：" + thiscontent + "</p><br>"
                    fh.write(dataall.encode('utf-8'))
                    print("第" + str(i) + "個網頁第" + str(j) + " 次處理")
                except urllib.error.URLError as e:
                    if hasattr(e, 'code'):
                        print(e.code)
                    if hasattr(e, 'reason'):
                        print(e.reason)
                    time.sleep(10)
                except Exception as e:
                    print('exception:' + str(e))
                    time.sleep(1)
        fh.write(html2.encode('utf-8'))

 # Single-threaded run: collect the article links, then save the articles.
key = '物聯網'
# The listing had a stray space inside the address ("125.115.183.26 :808").
proxy = "125.115.183.26:808"
proxy2 = ""
pagestart = 1
pageend = 2
listurl = getlisturl(key, pagestart, pageend, proxy)
getcontent(listurl, proxy)

多線程爬蟲

多線程小程序



import threading


class A(threading.Thread):
    """Demo thread that prints its banner ten times."""

    def __init__(self):
        # The listing had lost the double underscores, so the initializer
        # was an ordinary method named "init" that was never called.
        super().__init__()

    def run(self):
        """Thread body executed by start()."""
        for _ in range(10):
            print('我是線程A')


class B(threading.Thread):
    """Demo thread that prints its banner ten times."""

    def __init__(self):
        # The listing had lost the double underscores, so the initializer
        # was an ordinary method named "init" that was never called.
        super().__init__()

    def run(self):
        """Thread body executed by start()."""
        for _ in range(10):
            print('我是線程B')



# Create both demo threads and let them run concurrently; their output may
# interleave.
t1 = A()
t2 = B()
t1.start()
t2.start()

 隊列的使用
 # Queue walkthrough: put four items, take them out in FIFO order.
import queue

a = queue.Queue()
a.put('hello')
a.put('wj')
a.put('like')
a.put('study')
print(a.qsize())

# task_done() belongs after the get() it acknowledges; the listing called it
# once before anything had been taken out.
for _ in range(4):
    print(a.get())
    a.task_done()
print(a.qsize())

# The queue is now empty.  A plain get() would block forever, so probe
# without blocking and report the empty queue instead.
try:
    print(a.get_nowait(), '----')
except queue.Empty:
    print('隊列已空', '----')

 微信小程序改爲多線程提高效率


# -*- coding: utf-8 -*-


"""

 Created on Sat Apr 22 10:25:08 2017
 


@author: My

 """
 


import threading

 import queue

 import re

 import urllib.request

 import time

 import urllib.error

 # Shared state for the threaded crawler.
# Article URLs travel from the producer thread to the consumer through this queue.
urlqueue = queue.Queue()
# Pretend to be a desktop browser.
headers = ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36')
opener = urllib.request.build_opener()
opener.addheaders = [headers]
# Install the opener as the process-wide default.
urllib.request.install_opener(opener)
# One list of article links per fetched result page.
listurl = []

 # Fetch a page through an HTTP proxy.
def useproxy(proxyaddr, url):
    """Fetch *url* through the HTTP proxy *proxyaddr* and return its text.

    A new opener routing ``http`` traffic through the proxy is installed
    globally, replacing any previously installed opener.

    Args:
        proxyaddr: Proxy address as ``"host:port"``.
        url: Address to fetch; it is echoed to stdout first.

    Returns:
        The response body decoded as UTF-8, or None when the request failed.
        Failures are printed, and the call then sleeps (10 s after a
        URLError, 1 s after any other exception) before returning.
    """
    try:
        # ProxyHandler keys are bare scheme names.  The listing used
        # 'http://', which never matches, so the proxy was silently skipped.
        proxy = urllib.request.ProxyHandler({'http': proxyaddr})
        opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
        urllib.request.install_opener(opener)
        print(url)
        data = urllib.request.urlopen(url).read().decode('utf-8')
        return data
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        # Back off before the caller tries the next request.
        time.sleep(10)
    except Exception as e:
        print("exception:" + str(e))
        time.sleep(1)
    return None

 # Producer thread: finds article links and feeds them into the queue.
class geturl(threading.Thread):
    """Collect article links for a keyword and enqueue them.

    Args:
        key: Search keyword.
        pagestart: First result page to fetch (inclusive).
        pageend: Last result page to fetch (inclusive).
        proxy: Proxy address passed on to ``useproxy``.
        urlqueue: Queue that receives the article URLs.
    """

    def __init__(self, key, pagestart, pageend, proxy, urlqueue):
        # The listing had lost the double underscores, so this initializer
        # never ran and constructing the thread with arguments failed.
        super().__init__()
        self.pagestart = pagestart
        self.pageend = pageend
        self.proxy = proxy
        self.urlqueue = urlqueue
        self.key = key

    def run(self):
        """Fetch the result pages, then enqueue every article link."""
        import urllib.parse

        # NOTE(review): the markup inside this pattern was stripped from the
        # listing; restored as the result-box container. Confirm against a
        # live result page.
        listurlpat = r'<div class="txt-box">.*?(http://.*?)"'
        pattern = re.compile(listurlpat, re.S)

        for page in range(self.pagestart, self.pageend + 1):
            # Use self.key (the listing read a module global) and build the
            # query with urlencode.  The listing had a space in the URL and
            # never sent the page parameter ("%26page" plus a bare number).
            query = urllib.parse.urlencode(
                {"type": 2, "query": self.key, "page": page})
            url = "http://weixin.sogou.com/weixin?" + query
            print(url)
            data1 = useproxy(self.proxy, url)
            if data1 is None:
                # Fetch failed and was already reported; skip this page.
                continue
            listurl.append(pattern.findall(data1))
        print("獲取到：" + str(len(listurl)) + "頁")

        for i in range(len(listurl)):
            # Pause between pages so the consumer is not flooded.
            time.sleep(7)
            for j in range(len(listurl[i])):
                # Links scraped from HTML carry "&amp;"; turn it into "&".
                url = listurl[i][j].replace("amp;", "")
                print("第" + str(i) + "i" + str(j) + "j次入隊")
                # task_done() is a consumer-side call, so the producer only
                # puts; the listing called it right after put().
                self.urlqueue.put(url)

 # Consumer thread: downloads queued articles and writes them to one file.
class getcontent(threading.Thread):
    """Take article URLs from the queue and save them to one HTML file.

    Args:
        urlqueue: Queue that delivers the article URLs.
        proxy: Proxy address passed on to ``useproxy``.
    """

    # Seconds to wait for a new URL before assuming the producer is done.
    # NOTE(review): 60 s is a guess, chosen to be longer than the producer's
    # 7-second pause between pages; tune as needed.
    idle_timeout = 60

    def __init__(self, urlqueue, proxy):
        # The listing had lost the double underscores, so this initializer
        # never ran.
        super().__init__()
        self.urlqueue = urlqueue
        self.proxy = proxy

    def run(self):
        """Write header, one entry per fetched article, then the footer."""
        # NOTE(review): the tags in the skeleton, the paragraph markup and
        # both patterns were stripped from the listing; they are restored
        # here and should be checked against a real article page.
        html1 = '''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
<title>微信文章頁面</title>
</head>
<body>'''
        html2 = '''</body>
</html>'''
        titlepat = re.compile(r"<title>(.*?)</title>")
        contentpat = re.compile(r'id="js_content">(.*?)id="js_sg_bar"', re.S)

        with open("D:/pythontest/7.html", 'wb') as fh:
            fh.write(html1.encode('utf-8'))
            i = 1
            while True:
                try:
                    # A plain get() blocks forever once the producer stops,
                    # which left the footer code unreachable.
                    url = self.urlqueue.get(timeout=self.idle_timeout)
                except queue.Empty:
                    break
                try:
                    data = useproxy(self.proxy, url)
                    if data is None:
                        # Fetch failed and was already reported.
                        continue
                    title = titlepat.findall(data)
                    content = contentpat.findall(data)
                    thistitle = "這次沒有獲取到"
                    thiscontent = "這次沒有獲取到"
                    if title:
                        thistitle = title[0]
                    if content:
                        thiscontent = content[0]
                    dataall = "<p>標題爲：" + thistitle + "</p><p>內容爲：" + thiscontent + "</p><br>"
                    fh.write(dataall.encode("utf-8"))
                    print("第" + str(i) + "個網頁處理")
                    i += 1
                except urllib.error.URLError as e:
                    if hasattr(e, 'code'):
                        print(e.code)
                    if hasattr(e, "reason"):
                        print(e.reason)
                    time.sleep(10)
                except Exception as e:
                    print("exception:" + str(e))
                    time.sleep(1)
            fh.write(html2.encode('utf-8'))


class conrl(threading.Thread):
    """Monitor thread that reports progress until the URL queue is empty.

    Args:
        urlqueue: Queue shared with the crawler threads.
        interval: Seconds to sleep between two checks (default 60).
    """

    def __init__(self, urlqueue, interval=60):
        # The listing had lost the double underscores, so this initializer
        # never ran.
        super().__init__()
        self.urlqueue = urlqueue
        self.interval = interval

    def run(self):
        """Print a heartbeat, sleep, and stop once the queue is empty."""
        while True:
            print("程序執行中")
            time.sleep(self.interval)
            if self.urlqueue.empty():
                print("程序執行完畢！")
                # exit() inside a thread only ends that thread and needs the
                # site module; returning has the same effect.
                return


# Settings for the threaded run.
key = "物聯網"
proxy = "59.61.92.205:8118"
proxy2 = ""
pagestart, pageend = 1, 2

# Producer: collects article links and feeds them into the queue.
t1 = geturl(key, pagestart, pageend, proxy, urlqueue)
t1.start()

# Consumer: downloads the queued articles.
t2 = getcontent(urlqueue, proxy)
t2.start()

# Monitor: reports progress until the queue is empty.
t3 = conrl(urlqueue)
t3.start()


 
      

  

   
   
   

  

   
   
   

  

   
   
   

   
 
    
 
    
 
    

  
隊列的使用微信小程序改爲多線程提高效率

 fh=open("D:/pythontest/7.html",'wb')

 fh.write(html1.encode('utf-8'))

 fh.close()

 fh=open("D:/pythontest/7.html",'ab')

 i=1 

 while(True):

 try:

 url=self.urlqueue.get()

 data=useproxy(self.proxy,url)

 titlepat="(.</em>?)"

 contentpat='id="jscontent">(.?) id="jssgbar"'


title=re.compile(titlepat).findall(data)


content=re.compile (contentpat,re.S).findall(data)


thistitle="這次沒有獲取到"


thiscontent="這次沒有獲取到"


if(title!=[]):


thistitle=title[0]


if(content!=[]):


thiscontent=content[0]


dataall="標題爲："+thistitle+"
內容爲："+thiscontent+"

"

fh.write(dataall.encode("utf-8"))

print("第"+str(i)+"個網頁處理")


i+=1


except urllib.error.URLError as e:


if hasattr(e,'code'):


print(e.code)


if hasattr(e,"reason"):


print(e.reason)


time.sleep(10)


except Exception as e:


print("exception:"+str(e))


time.sleep(1)


fh.close()


html2='''





'''


fh=open("D:/pythontest/7.html",'ab')


fh.write(html2.encode('utf-8'))


fh.close()


class conrl(threading.Thread):
    """Monitor thread that reports progress until the URL queue is empty.

    Args:
        urlqueue: Queue shared with the crawler threads.
        interval: Seconds to sleep between two checks (default 60).
    """

    def __init__(self, urlqueue, interval=60):
        # The listing had lost the double underscores, so this initializer
        # never ran.
        super().__init__()
        self.urlqueue = urlqueue
        self.interval = interval

    def run(self):
        """Print a heartbeat, sleep, and stop once the queue is empty."""
        while True:
            print("程序執行中")
            time.sleep(self.interval)
            if self.urlqueue.empty():
                print("程序執行完畢！")
                # exit() inside a thread only ends that thread and needs the
                # site module; returning has the same effect.
                return


# Settings for the threaded run.
key = "物聯網"
proxy = "59.61.92.205:8118"
proxy2 = ""
pagestart, pageend = 1, 2

# Producer: collects article links and feeds them into the queue.
t1 = geturl(key, pagestart, pageend, proxy, urlqueue)
t1.start()

# Consumer: downloads the queued articles.
t2 = getcontent(urlqueue, proxy)
t2.start()

# Monitor: reports progress until the queue is empty.
t3 = conrl(urlqueue)
t3.start()

 
 
   posted @ 
  2018-01-13 14:33  
  Doctor_Bool 閱讀( 
  ...) 評論( 
  ...) 
   編輯 
  收藏

相關標籤/搜索

每日一句

每一个你不满意的现在，都有一个你没有努力的曾经。