<!DOCTYPE html>
# 思路 (approach)
import re
import urllib.request
def craw(url, page):
    """Crawl one JD list page and download every product image on it.

    Args:
        url:  list-page URL to fetch.
        page: page number, used only to build unique local file names.
    """
    html1 = str(urllib.request.urlopen(url).read())
    # Narrow the document to the product-list <div> first, then pull the
    # lazy-loaded image URLs out of it.  NOTE(review): the original pattern
    # '<div id="plist".+?' lost its closing anchor in the blog export; a
    # non-greedy '.+?' with no end anchor matches almost nothing, so anchor
    # on the pagination block that follows the list — confirm against the
    # live page markup.
    pat1 = '<div id="plist".+?<div class="page clearfix">'
    result1 = re.compile(pat1, re.S).findall(html1)
    if not result1:
        # Layout changed or fetch failed; original would raise IndexError here.
        return
    result1 = result1[0]
    # '.' before 'jpg' must be escaped, otherwise it matches any character.
    pat2 = r'<img width="220" height="220" data-img="1" data-lazy-img="//(.+?\.jpg)">'
    imagelist = re.compile(pat2).findall(result1)
    x = 1
    for imageurl in imagelist:
        imagename = "D:/shoujitupian/img/" + str(page) + str(x) + ".jpg"
        imageurl = "http://" + imageurl
        try:
            urllib.request.urlretrieve(imageurl, filename=imagename)
        except urllib.error.URLError as e:
            # Keep the counter moving on failure so later images do not
            # overwrite earlier ones (mirrors the original's bookkeeping).
            if hasattr(e, "code"):
                x += 1
            if hasattr(e, "reason"):
                x += 1
        x += 1
# Fetch pages 1-5 of the JD mobile-phone category list.
for i in range(1, 6):
    page_url = "https://list.jd.com/list.html?cat=9987,653,655&page=" + str(i)
    url = page_url
    craw(url, i)
</p> <h3>連接爬蟲</h3> <ol> <li>肯定好要爬取的入口連接。</li> <li>根據需求構建好連接提取的正則表達式。</li> <li>模擬成瀏覽器並爬取對應網頁。</li> <li>根據2中的正則表達式提取出該網頁中包含的連接。</li> <li>過濾掉重複的連接。</li> <li>後續操做。好比打印這些連接到屏幕上等。<br />
author = 'My'html
import re#爬取全部頁面連接
import urllib.request
def getlinks(url):
    """Fetch *url* and return the de-duplicated links found in its HTML.

    Each element is a regex match tuple; index [0] holds the full URL.
    """
    # Pose as a desktop Chrome browser so the site serves normal HTML.
    headers = ('User-Agent',
               'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
               '(KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36')
    opener = urllib.request.build_opener()   # was buildopener: AttributeError
    opener.addheaders = [headers]
    urllib.request.install_opener(opener)    # was installopener; install globally
    file = urllib.request.urlopen(url)
    data = str(file.read())
    # Link pattern.  The backslashes were lost in the blog export: '\.' and
    # '\w' are required, otherwise '.' and 'w' are matched literally.
    pat = r'(https?://[^\s)";]+\.(\w|/)*)'
    link = re.compile(pat).findall(data)
    link = list(set(link))                   # drop duplicate links
    return link
url = "http://blog.csdn.net/"   # entry page whose links we want
linklist = getlinks(url)        # every link tuple found on that page
for link in linklist:
    # index 0 of each match tuple is the complete URL
    print(link[0])
author = 'My'
import re
import urllib.request
def getcontent(url, page):
    """Fetch one qiushibaike list page and print each user with their post.

    Args:
        url:  page URL to fetch.
        page: page number, used only in the printed label.
    """
    headers = ('User-Agent',
               'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
               '(KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36')
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    urllib.request.install_opener(opener)  # install the opener globally
    file = urllib.request.urlopen(url)
    data = str(file.read().decode("utf-8"))
    # '(.*?)' was exported as '(.?)' on the blog; '.?' captures at most one
    # character, so nothing useful would ever match.
    userpat = 'target="_blank" title="(.*?)">'        # user names
    contentpat = '<div class="content">(.*?)</div>'   # post bodies
    userlist = re.compile(userpat, re.S).findall(data)
    contentlist = re.compile(contentpat, re.S).findall(data)
    # The original built content1, content2, ... via exec(); pairing the two
    # lists directly is equivalent and avoids dynamic code execution.
    y = 1
    for user, content in zip(userlist, contentlist):
        # Strip markup noise from the post body.
        for junk in ("\n", "<span>", "</span>", "<br/>"):
            content = content.replace(junk, "")
        print("用戶" + str(page) + str(y) + "是:" + user)
        print("內容是:")
        print(content)
        print("\n")
        y += 1
# Walk list pages 1 through 9, printing every user/post pair on each.
for i in range(1, 10):
    url = "https://www.qiushibaike.com/8hr/page/" + str(i)
    getcontent(url, i)
author = 'My'
import re
import urllib.request
import time
import urllib.error
# 模擬瀏覽器 — pretend to be a desktop Chrome browser.  (The stray tag
# words "java"/"python"/"git" fused onto these lines by the blog export
# made the code unrunnable; the prose headings are restored as comments.)
headers = ('User-Agent',
           'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
           '(KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36')
opener = urllib.request.build_opener()
opener.addheaders = [headers]
# 將opener安裝為全局 — install the opener globally.
urllib.request.install_opener(opener)
# 設置一個列表listurl存儲文章網頁列表 — per-page lists of article links.
listurl = []
# 設置代理ip — fetch pages through an HTTP proxy.
def useproxy(proxyaddr, url):
    """Fetch *url* through the HTTP proxy at *proxyaddr*.

    Returns the decoded page text, or None when the request fails
    (failures are printed and swallowed so the caller can carry on).
    """
    try:
        # was 'proxy_addr' — a NameError, the parameter is 'proxyaddr'
        proxy = urllib.request.ProxyHandler({'http': proxyaddr})
        # was buildopener / installopener (AttributeError typos)
        opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
        urllib.request.install_opener(opener)
        data = urllib.request.urlopen(url).read().decode('utf-8')
        return data
    except urllib.error.URLError as e:
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, 'reason'):
            print(e.reason)
        time.sleep(10)   # back off: the proxy or site is refusing us
    except Exception as e:
        print('exception:' + str(e))
        time.sleep(1)
# 設置搜索 — search Sogou-Weixin and collect article links.
def getlisturl(key, pagestart, pageend, proxy):
    """Search Sogou-Weixin for *key* and collect article links.

    Fetches result pages pagestart..pageend through *proxy*, appending
    each page's list of article URLs to the module-level ``listurl``.

    Returns the (shared) ``listurl`` list, or None on failure.
    """
    try:
        keycode = urllib.request.quote(key)        # URL-encode the keyword
        pagecode = urllib.request.quote('&page')   # encoded '&page' separator
        for page in range(pagestart, pageend + 1):
            # One search-result URL per page, e.g.
            # http://weixin.sogou.com/weixin?query=物聯網&type=2&page=2
            url = ("http://weixin.sogou.com/weixin?type=2&query="
                   + keycode + pagecode + str(page))
            # Fetch through the proxy to dodge IP bans.
            data1 = useproxy(proxy, url)           # was use_proxy: NameError
            print(data1)
            # Article-link pattern.  NOTE(review): the pattern was mangled
            # by the blog export; reconstructed as the result-box anchor —
            # confirm against the live result-page markup.
            listurlpat = '<div class="txt-box">.*?(http://.*?)"'
            print(re.compile(listurlpat, re.S).findall(data1))
            listurl.append(re.compile(listurlpat, re.S).findall(data1))
        print("獲取到", str(len(listurl)), '頁')
        return listurl
    except urllib.error.URLError as e:
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, 'reason'):
            print(e.reason)
        time.sleep(10)
    except Exception as e:
        print('exception:' + str(e))
        time.sleep(1)
    # 設置保存網頁 (section heading for the saver that follows in the post)
def getcontent(listurl, proxy):
    """Download every article in *listurl* and append it to D:/111.html.

    *listurl* is a list of pages, each itself a list of article URLs
    (the structure produced by getlisturl).
    """
    # Write the HTML skeleton head first.  NOTE(review): the original html1
    # literal was reduced to its title text by the blog export; reconstructed
    # as a minimal page header.
    html1 = '''<!DOCTYPE html>
<html><head><meta charset="utf-8">
<title>微信文章頁面</title></head><body>'''
    fh = open("D:/111.html", "wb")
    fh.write(html1.encode('utf-8'))
    fh.close()
    # Reopen in append mode to stream each article's title/body in.
    fh = open('D:/111.html', 'ab')
    print(listurl)
    for i in range(0, len(listurl)):          # was range(0, listurl): TypeError
        for j in range(0, len(listurl[i])):   # was range(0, listurl[i]): same
            try:
                url = listurl[i][j]
                url = url.replace('amp;', "")  # undo &amp; entity escaping
                data = useproxy(proxy, url)
                # Patterns reconstructed from the garbled export — markdown
                # ate the underscores in js_content/js_sg_bar and the '*' in
                # '(.*?)'; confirm against the live article markup.
                titlepat = "<title>(.*?)</title>"
                contentpat = 'id="js_content">(.*?)id="js_sg_bar"'
                title = re.compile(titlepat).findall(data)
                content = re.compile(contentpat, re.S).findall(data)
                thistitle = "這次沒有獲取到"
                thiscontent = "這次沒有獲取到"
                if title != []:
                    thistitle = title[0]
                if content != []:
                    thiscontent = content[0]
                dataall = ("<p>標題爲:" + thistitle + "</p><p>內容爲:"
                           + thiscontent + "</p><br>")
                fh.write(dataall.encode('utf-8'))
                print("第" + str(i) + "個網頁第" + str(j) + " 次處理")
            except urllib.error.URLError as e:
                if hasattr(e, 'code'):
                    print(e.code)
                if hasattr(e, 'reason'):
                    print(e.reason)
                time.sleep(10)
            except Exception as e:
                print('exception:' + str(e))
                time.sleep(1)
    fh.close()
    # Close the HTML document.
    html2 = '''</body></html>'''
    fh = open("D:/111.html", 'ab')    # was fh.open(...): AttributeError
    fh.write(html2.encode('utf-8'))
    fh.close()
key = '物聯網'                 # search keyword
# Proxy address.  The stray space before the port in the original
# ("125.115.183.26 :808") would make the proxy address invalid.
proxy = "125.115.183.26:808"
proxy2 = ""                    # spare proxy slot (unused)
pagestart = 1                  # first result page to crawl
pageend = 2                    # last result page to crawl
listurl = getlisturl(key, pagestart, pageend, proxy)
getcontent(listurl, proxy)
# ── 多線程小程序 (multi-threading demo) ──
import threading
class A(threading.Thread):
    """Demo thread that prints its label ten times."""

    def __init__(self):
        # The blog export stripped the dunder underscores; a plain 'init'
        # method is never called, so Thread was left uninitialised.
        threading.Thread.__init__(self)

    def run(self):
        for i in range(10):
            print('我是線程A')
class B(threading.Thread):
    """Demo thread that prints its label ten times."""

    def __init__(self):
        # The blog export stripped the dunder underscores; restore __init__
        # so Thread is initialised properly.
        threading.Thread.__init__(self)

    def run(self):
        for i in range(10):
            print('我是線程B')
# Start both demo threads; their output interleaves nondeterministically.
t1 = A()
t2 = B()
for worker in (t1, t2):
    worker.start()
# ── 隊列的使用 (queue usage demo) ──
import queue

# Queue demo: FIFO ordering, qsize bookkeeping, non-blocking retrieval.
a = queue.Queue()
a.put('hello')
a.put('wj')
a.put('like')
a.put('study')
a.task_done()        # marks one queued item as processed (demo only)
print(a.qsize())     # 4
print(a.get())       # hello — FIFO order
print(a.get())       # wj
print(a.get())       # like
print(a.get())       # study
print(a.qsize())     # 0
# The original called a.get() once more here; a blocking get() on an empty
# queue waits forever, so use the non-blocking variant instead.
try:
    print(a.get_nowait(), '----')
except queue.Empty:
    print('queue is empty', '----')
# ── 微信小程序改爲多線程提高效率 (WeChat crawler, multi-threaded version) ──
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 22 10:25:08 2017
@author: My
"""
import threading
import queue
import re
import urllib.request
import time
import urllib.error
# Queue shared between the URL-collector thread and the downloader thread.
urlqueue = queue.Queue()
# Pose as a desktop Chrome browser.
headers = ('User-Agent',
           'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
           '(KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36')
opener = urllib.request.build_opener()    # was buildopener: AttributeError
opener.addheaders = [headers]
urllib.request.install_opener(opener)     # was installopener: AttributeError
# Per-page lists of collected article URLs.
listurl = []
def useproxy(proxyaddr, url):
    """Fetch *url* through the HTTP proxy at *proxyaddr*.

    Returns the decoded page text, or None on failure (errors are printed
    and swallowed so the calling thread keeps running).
    """
    try:
        # The scheme key must be 'http', not 'http://', otherwise the
        # handler silently ignores the proxy.
        proxy = urllib.request.ProxyHandler({'http': proxyaddr})
        # was buildopener (AttributeError typo)
        opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
        urllib.request.install_opener(opener)
        print(url)
        data = urllib.request.urlopen(url).read().decode('utf-8')
        return data
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        time.sleep(10)   # back off: the proxy or site is refusing us
    except Exception as e:
        print("exception:" + str(e))
        time.sleep(1)
class geturl(threading.Thread):
    """Producer thread: collects article URLs and feeds them to urlqueue."""

    def __init__(self, key, pagestart, pageend, proxy, urlqueue):
        # The blog export stripped the dunder underscores from __init__.
        threading.Thread.__init__(self)
        self.pagestart = pagestart
        self.pageend = pageend
        self.proxy = proxy
        self.urlqueue = urlqueue
        self.key = key

    def run(self):
        keycode = urllib.request.quote(self.key)   # was bare 'key' (global)
        pagecode = urllib.request.quote("&page")
        for page in range(self.pagestart, self.pageend + 1):
            # The stray space in "weixin? type=2" came from the blog export.
            url = ("http://weixin.sogou.com/weixin?type=2&query="
                   + keycode + pagecode + str(page))
            print(url)
            data1 = useproxy(self.proxy, url)
            # Pattern reconstructed from the garbled export — confirm
            # against the live result-page markup.
            listurlpat = '<div class="txt-box">.*?(http://.*?)"'
            listurl.append(re.compile(listurlpat, re.S).findall(data1))
        print("獲取到:" + str(len(listurl)) + "頁")
        for i in range(0, len(listurl)):
            time.sleep(7)   # throttle so the site does not ban the proxy
            for j in range(0, len(listurl[i])):
                try:
                    url = listurl[i][j]
                    url = url.replace("amp;", "")   # undo &amp; escaping
                    print("第" + str(i) + "i" + str(j) + "j次入隊")
                    self.urlqueue.put(url)
                    self.urlqueue.task_done()       # was taskdone (typo)
                except urllib.error.URLError as e:
                    if hasattr(e, "code"):
                        print(e.code)
                    if hasattr(e, "reason"):
                        print(e.reason)
                    time.sleep(10)
                except Exception as e:
                    print("exception:" + str(e))
                    time.sleep(1)
class getcontent(threading.Thread):
    """Consumer thread: pops article URLs off urlqueue and appends each
    article's title/body to D:/pythontest/7.html."""

    def __init__(self, urlqueue, proxy):
        # The blog export stripped the dunder underscores from __init__.
        threading.Thread.__init__(self)
        self.urlqueue = urlqueue
        self.proxy = proxy

    def run(self):
        # Page skeleton.  NOTE(review): the original literal lost its
        # closing quotes in the blog export ('、' also replaced '/' in the
        # DTD URL); reconstructed as a minimal XHTML header.
        html1 = '''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html><head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
<title>微信文章頁面</title></head><body>'''
        fh = open("D:/pythontest/7.html", 'wb')
        fh.write(html1.encode('utf-8'))
        fh.close()
        fh = open("D:/pythontest/7.html", 'ab')
        i = 1
        while True:
            try:
                url = self.urlqueue.get()
                data = useproxy(self.proxy, url)
                # Patterns reconstructed from the garbled export; markdown
                # ate the underscores and '*' — confirm against the live
                # article markup.
                titlepat = "<title>(.*?)</title>"
                contentpat = 'id="js_content">(.*?)id="js_sg_bar"'
                title = re.compile(titlepat).findall(data)
                content = re.compile(contentpat, re.S).findall(data)
                thistitle = "這次沒有獲取到"
                thiscontent = "這次沒有獲取到"
                if title != []:
                    thistitle = title[0]
                if content != []:
                    thiscontent = content[0]
                dataall = ("<p>標題爲:" + thistitle + "</p><p>內容爲:"
                           + thiscontent + "</p><br>")
                fh.write(dataall.encode("utf-8"))
                print("第" + str(i) + "個網頁處理")
                i += 1
            except urllib.error.URLError as e:
                if hasattr(e, 'code'):
                    print(e.code)
                if hasattr(e, "reason"):
                    print(e.reason)
                time.sleep(10)
            except Exception as e:
                print("exception:" + str(e))
                time.sleep(1)
        # NOTE(review): unreachable — the `while True` above never breaks.
        # Kept to mirror the original's intent of closing the document.
        fh.close()
        html2 = '''</body></html>'''
        fh = open("D:/pythontest/7.html", 'ab')
        fh.write(html2.encode('utf-8'))
        fh.close()
class conrl(threading.Thread):
    """Control thread: polls once a minute and exits when the queue drains."""

    def __init__(self, urlqueue):
        # The blog export stripped the dunder underscores from __init__.
        threading.Thread.__init__(self)
        self.urlqueue = urlqueue

    def run(self):
        while True:
            print("程序執行中")
            time.sleep(60)          # check once a minute
            if self.urlqueue.empty():
                print("程序執行完畢!")
                exit()
key = "物聯網"                # search keyword
proxy = "59.61.92.205:8118"   # HTTP proxy used for every fetch
proxy2 = ""                   # spare proxy slot (unused)
pagestart = 1                 # first result page
pageend = 2                   # last result page

# Producer, consumer and controller threads, started in order.
t1 = geturl(key, pagestart, pageend, proxy, urlqueue)
t1.start()
t2 = getcontent(urlqueue, proxy)
t2.start()
t3 = conrl(urlqueue)
t3.start()
# ── 隊列的使用 / 微信小程序改爲多線程提高效率 (duplicated section heading from the export) ──
# NOTE(review): everything from here to the second fh.close() below is an
# accidental module-level duplicate of getcontent.run() introduced by the
# blog export.  It is not valid Python as it stands: `self` and `html1`
# are undefined at module level, and the `dataall` string literal is split
# across physical lines.  Preserved verbatim for reference; the working
# logic is the getcontent thread class defined earlier in this file.
fh=open("D:/pythontest/7.html",'wb')
fh.write(html1.encode('utf-8'))
fh.close()
fh=open("D:/pythontest/7.html",'ab')
i=1
while(True):
try:
url=self.urlqueue.get()
data=useproxy(self.proxy,url)
titlepat="(.</em>?) "
contentpat='id="jscontent">(.?) id="jssgbar"'
title=re.compile(titlepat).findall(data)
content=re.compile (contentpat,re.S).findall(data)
thistitle="這次沒有獲取到"
thiscontent="這次沒有獲取到"
if(title!=[]):
thistitle=title[0]
if(content!=[]):
thiscontent=content[0]
dataall="標題爲:"+thistitle+"
內容爲:"+thiscontent+"
"
fh.write(dataall.encode("utf-8"))
print("第"+str(i)+"個網頁處理")
i+=1
except urllib.error.URLError as e:
if hasattr(e,'code'):
print(e.code)
if hasattr(e,"reason"):
print(e.reason)
time.sleep(10)
except Exception as e:
print("exception:"+str(e))
time.sleep(1)
fh.close()
html2='''
'''
fh=open("D:/pythontest/7.html",'ab')
fh.write(html2.encode('utf-8'))
fh.close()
# NOTE(review): duplicate definition of conrl from the blog export; it
# shadows the earlier one.  Fixed the same way (__init__ underscores).
class conrl(threading.Thread):
    """Control thread: polls once a minute and exits when the queue drains."""

    def __init__(self, urlqueue):
        threading.Thread.__init__(self)
        self.urlqueue = urlqueue

    def run(self):
        while True:
            print("程序執行中")
            time.sleep(60)          # check once a minute
            if self.urlqueue.empty():
                print("程序執行完畢!")
                exit()
# Duplicated driver section from the blog export: wire up and start the
# producer, consumer and controller threads.
key = "物聯網"                # search keyword
proxy = "59.61.92.205:8118"   # HTTP proxy used for every fetch
proxy2 = ""                   # spare proxy slot (unused)
pagestart = 1                 # first result page
pageend = 2                   # last result page

t1 = geturl(key, pagestart, pageend, proxy, urlqueue)
t1.start()
t2 = getcontent(urlqueue, proxy)
t2.start()
t3 = conrl(urlqueue)
t3.start()