python模擬瀏覽器爬取相關頁面html
# Download one blog page while sending a browser-like User-Agent header and
# save the raw response bytes to a local HTML file.
import urllib.request

url = "https://blog.51cto.com/itstyle/2146899"

# Mimic a browser: every request made through this opener carries a
# Chrome-like User-Agent header.
headers = ("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36")
opener = urllib.request.build_opener()
opener.addheaders = [headers]

# Context managers close the HTTP response and the output file even when the
# read or the write raises.
with opener.open(url) as response:
    data = response.read()

# Binary mode: the bytes are written exactly as received, without decoding.
with open("D:/5.html", "wb") as fh:
    fh.write(data)
python爬取新聞網站並將文章下載到本地
# Crawl the Sina news front page: collect every link that points back into
# news.sina.com.cn and save each linked page as a numbered local HTML file.
import os
import re
import urllib.error
import urllib.request

# Close the front-page response as soon as its body has been read.
with urllib.request.urlopen("http://news.sina.com.cn/") as response:
    data = response.read()

# "ignore" drops undecodable bytes instead of raising UnicodeDecodeError.
data2 = data.decode("utf-8", "ignore")

pat = 'href="(http://news.sina.com.cn/.*?)">'
allurl = re.compile(pat).findall(data2)

# urlretrieve opens the target path for writing but does not create missing
# directories, so make sure the output directory exists first.
os.makedirs("D:/pac/sinanews/", exist_ok=True)

for i, thisurl in enumerate(allurl):
    try:
        print("第" + str(i + 1) + "次爬取")
        file = "D:/pac/sinanews/" + str(i) + ".html"
        urllib.request.urlretrieve(thisurl, file)
        print("-----成功-----")
    except urllib.error.URLError as e:
        # Print whichever of the status code / reason the error carries.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    except OSError as e:
        # Other I/O failures (e.g. a dropped connection or a local write
        # error) should skip this link rather than abort the whole crawl.
        print(e)
python爬取月光博客文章下載到本地瀏覽器
# Crawl the williamlong.info front page: extract each post's title and
# archive URL, then save every post as "<title>.html" in a local directory.
import os
import re
import urllib.error
import urllib.request

# Close the front-page response as soon as its body has been read.
with urllib.request.urlopen("http://www.williamlong.info/").read_ctx() if False else urllib.request.urlopen("http://www.williamlong.info/") as response:
    data = response.read()

pat = 'rel="bookmark">(.*?)</a>'
pat_url = 'class="post-title"><a href="(http://www.williamlong.info/archives/.*?)"'
data = data.decode("utf-8")
mydata = re.compile(pat).findall(data)      # all blog post titles
allurl = re.compile(pat_url).findall(data)  # all post URLs

# urlretrieve does not create missing directories, so make sure the output
# directory exists first.
os.makedirs("E:/PAS/yueguang/", exist_ok=True)

# Characters that Windows does not allow in a file name; a title containing
# any of them could not be used as the output file name as-is.
unsafe_chars = re.compile(r'[\\/:*?"<>|]')

# Titles and URLs are paired by position.
# NOTE(review): this assumes both patterns match the posts in the same order
# — confirm against the page markup. zip stops at the shorter list, so a
# count mismatch no longer raises IndexError.
for i, (title, thisurl) in enumerate(zip(mydata, allurl)):
    try:
        print("正在生產第" + str(i + 1) + "次文件")
        file = "E:/PAS/yueguang/" + unsafe_chars.sub("_", title) + ".html"
        urllib.request.urlretrieve(thisurl, file)
        print("生產成功")
    except urllib.error.URLError as e:
        # Print whichever of the status code / reason the error carries.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    except OSError as e:
        # Other I/O failures (e.g. a dropped connection or a local write
        # error) should skip this post rather than abort the whole crawl.
        print(e)