Part 1 - Building the Scraper
1.urllib
1)urllib.request
request.urlopen(url)
request.urlretrieve can download a file given its URL
2)urllib.parse
3)urllib.error
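A minimal sketch tying the three submodules together (the sample URL is one of the book's test pages, assumed reachable):
from urllib import request, parse, error
# urllib.parse: percent-encode a value for use inside a URL
print(parse.quote('web scraping'))      # 'web%20scraping'
try:
    # urllib.request: open a page and read the raw bytes
    html = request.urlopen('http://www.pythonscraping.com/pages/page1.html')
    print(html.read()[:100])
    # request.urlretrieve(url, filename) would save the same resource to disk
except error.URLError as e:             # urllib.error: raised when the request fails
    print('request failed:', e)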
2.Exceptions:
try...except...else...
Common exceptions: HTTPError, AttributeError
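A minimal sketch of the usual try...except...else pattern (book sample URL assumed):
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup
try:
    html = urlopen('http://www.pythonscraping.com/pages/page1.html')
except HTTPError as e:
    print(e)                     # the server returned an error status (404, 500, ...)
else:
    try:
        bsObj = BeautifulSoup(html.read(), 'lxml')
        # a missing tag evaluates to None; touching a child of None raises AttributeError
        badContent = bsObj.nonExistingTag.anotherTag
    except AttributeError:
        print('Tag was not found')
    else:
        print(badContent)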
3.BeautifulSoup
from bs4 import BeautifulSoup
bsObj=BeautifulSoup(html,'lxml')
1)
print(bsObj.text)
print(bsObj.html)
print(bsObj.p.a)
...
2)
findAll(tag, attributes, recursive, text, limit, keywords) #returns a ResultSet
find(tag, attributes, recursive, text, keywords) #returns a Tag
With these you can easily filter an HTML page by tag attributes and pull out the group of tags, or the single tag, you need.
Examples:
.findAll({"h1","h2","h3","h4","h5","h6"})
.findAll("span", {"class":{"green", "red"}})
.findAll(id="text") same as .findAll("", {"id":"text"})
.findAll(src=True) tags that have a src attribute
3)
get_text() strips every tag out of the HTML document you are working with and returns a str containing only the text
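Tiny illustration (inline HTML made up for the example):
from bs4 import BeautifulSoup
snippet = BeautifulSoup('<p>Hello <b>big</b> world</p>', 'lxml')
print(snippet.p.get_text())   # 'Hello big world' -- tags removed, text kept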
4)Navigating the tree (items are Tag / NavigableString objects)
.children (all child tags)
.next_sibling (next sibling tag)  .next_siblings (all following siblings)
.previous_sibling (previous sibling tag)  .previous_siblings (all preceding siblings)
.parent (direct parent tag)  .parents (all ancestors)
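A small self-contained illustration (hypothetical two-row table):
from bs4 import BeautifulSoup
html = '<table><tr id="row1"><td>a</td></tr><tr id="row2"><td>b</td></tr></table>'
bsObj = BeautifulSoup(html, 'lxml')
row1 = bsObj.find("tr", {"id": "row1"})
print(list(row1.children))    # the child tags of row1
print(row1.next_sibling)      # the tr with id="row2"
print(row1.parent.name)       # name of the direct parent tag
for p in row1.parents:        # every ancestor up to the document root
    print(p.name)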
5)
.attrs gets all attributes of a tag (dict)
.attrs['src'] gets the src value
6)Regular expressions
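findAll accepts a compiled regex wherever it accepts a string; a sketch with made-up img paths:
import re
from bs4 import BeautifulSoup
bsObj = BeautifulSoup('<img src="logo.png"><img src="/img/gifts/img1.jpg">', 'lxml')
# keep only the images whose src ends in .jpg
for img in bsObj.findAll("img", {"src": re.compile(r".*\.jpg$")}):
    print(img["src"])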
7)lambda expressions
#get the tags that have exactly two attributes:
bsObj.findAll(lambda tag: len(tag.attrs) == 2)
4.Scrapy
//TODO
5.JSON
Converts JSON objects into dicts,
JSON arrays into lists,
and JSON strings into Python strings.
Common functions: loads, get
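Quick sketch of those conversions (see also example 5-1 below):
import json
obj = json.loads('{"nums": [1, 2, 3], "name": "pi"}')
print(type(obj))               # dict -- JSON object -> dict
print(type(obj.get("nums")))   # list -- JSON array  -> list
print(type(obj.get("name")))   # str  -- JSON string -> str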
6.Storing data
1)Downloading
from urllib.request import urlretrieve
urlretrieve(resourceLocation,fileName)
2)CSV(Comma-Separated Values)
import csv
csvFile = open("test.csv", "w+")
try:
    writer = csv.writer(csvFile)
    writer.writerow(('青山隱隱水迢迢 秋盡江南草未凋', '24橋明月夜'))
    for i in range(1, 5):
        writer.writerow((i, i+2, i*2))
finally:
    csvFile.close()
3)MySQL
import pymysql
#get a connection and a cursor
conn = pymysql.connect(host='localhost', user='root', passwd=None)
cur = conn.cursor()
#execute SQL statements
cur.execute('use ssm01')
cur.execute('select * from user')
print(cur.fetchone())  #fetch one row
#release resources
cur.close()
conn.close()
4)Email
//TODO
7.Reading documents
1)Reading txt
from urllib.request import urlopen
txt=urlopen('http://www.pythonscraping.com/pages/warandpeace/chapter1.txt')
print(txt.read())
2)Reading csv
#read the file from the web into a string, then wrap it in a StringIO object so it behaves like a file
from urllib.request import urlopen
from io import StringIO
import csv
data = urlopen('http://pythonscraping.com/files/MontyPythonAlbums.csv').read().decode('utf-8')
dataFile=StringIO(data)
csvFile=csv.reader(dataFile)
for row in csvFile:
    print(row)
3)Reading PDF
#PDFMiner3K
#read an arbitrary PDF into a string, then wrap it in a StringIO file object
from urllib.request import urlopen
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from io import StringIO
def readPDF(pdfFile):
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, laparams=laparams)
    process_pdf(rsrcmgr, device, pdfFile)
    device.close()
    content = retstr.getvalue()
    retstr.close()
    return content
pdfFile = urlopen("http://pythonscraping.com/pages/warandpeace/chapter1.pdf")
outputString = readPDF(pdfFile)
print(outputString)
pdfFile.close()
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
Examples
3-1.Web crawling
#start at http://oreilly.com, then hop randomly from one external link to another
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import datetime
import random
pages = set()
random.seed(datetime.datetime.now())
# get a list of all internal links found on the page
def getInternalLinks(bsObj, includeUrl):
    internalLinks = []
    # find all links that begin with "/" or contain the current URL
    for link in bsObj.findAll("a", href=re.compile("^(/|.*"+includeUrl+")")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                internalLinks.append(link.attrs['href'])
    return internalLinks
# get a list of all external links found on the page
def getExternalLinks(bsObj, excludeUrl):
    externalLinks = []
    # find all links that start with "http" or "www" and do not contain the current URL
    for link in bsObj.findAll("a", href=re.compile("^(http|www)((?!"+excludeUrl+").)*$")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in externalLinks:
                externalLinks.append(link.attrs['href'])
    return externalLinks
def splitAddress(address):
    addressParts = address.replace("http://", "").split("/")
    return addressParts
def getRandomExternalLink(startingPage):
    html = urlopen(startingPage)
    bsObj = BeautifulSoup(html, 'lxml')
    externalLinks = getExternalLinks(bsObj, splitAddress(startingPage)[0])
    if len(externalLinks) == 0:
        # no external links here: pick a random internal link and try again from there
        internalLinks = getInternalLinks(bsObj, splitAddress(startingPage)[0])
        return getRandomExternalLink(internalLinks[random.randint(0, len(internalLinks)-1)])
    else:
        return externalLinks[random.randint(0, len(externalLinks)-1)]
def followExternalOnly(startingSite):
    externalLink = getRandomExternalLink(startingSite)
    print("Random external link: " + externalLink)
    followExternalOnly(externalLink)
followExternalOnly("http://oreilly.com")
5-1.JSON
import json
jsonString='{\
"arrayOfNums":[{"number":0},{"number":1},{"number":2}],\
"arrayOfFruits":[{"fruit":"apple"},{"fruit":"banana"},{"fruit":"pear"}]\
}'
jsonObj=json.loads(jsonString)
print(jsonObj.get("arrayOfFruits")[2].get("fruit"))
6-1.Download all the images from http://pythonscraping.com
from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
def pageSrc(url):
    html = urlopen(url)
    bsObj = BeautifulSoup(html, 'lxml')
    srcList = bsObj.findAll("img", src=True)
    urlList = []
    for i in srcList:
        urlList.append(i['src'])
    return urlList
def getInternalLinks(bsObj, includeUrl):
    internalLinks = []
    # find all links that begin with "/" or contain the current URL
    for link in bsObj.findAll("a", href=re.compile("^(/|.*"+includeUrl+")")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                internalLinks.append(link.attrs['href'])
    return internalLinks
def allimgs(url):
    # find every img src on this page and download it
    srcset = set()
    for i in pageSrc(url):
        if i not in srcset:
            print(i)
            srcset.add(i)
            name = i.split('/').pop()
            urlretrieve(i, name)
    # then follow every internal link on the page and download its images too
    html = urlopen(url)
    bsObj = BeautifulSoup(html, 'lxml')
    for i in getInternalLinks(bsObj, url):
        newUrl = url + i
        for j in pageSrc(newUrl):
            if j not in srcset:
                srcset.add(j)
                print(j)
                name = j.split('/').pop()
                urlretrieve(j, name)
url="http://pythonscraping.com"
allimgs(url)
6-2.Store to CSV
#grab an HTML table and write it to a CSV file
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen("http://en.wikipedia.org/wiki/Comparison_of_text_editors")
bsObj = BeautifulSoup(html,'lxml')
# the main comparison table is the first table on the page
table = bsObj.findAll("table",{"class":"wikitable"})[0]
rows = table.findAll("tr")
csvFile = open("editors.csv", 'wt', newline='',encoding='utf-8')
writer = csv.writer(csvFile)
try:
    for row in rows:
        csvRow = []
        for cell in row.findAll(['td', 'th']):
            csvRow.append(cell.get_text()[:-1])
        print(csvRow)
        writer.writerow(csvRow)
finally:
    csvFile.close()
6-3.Store to MySQL
#store Wikipedia article data
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import datetime
import random
import pymysql
conn = pymysql.connect(host='127.0.0.1',user='root', passwd=None, charset='utf8')
cur = conn.cursor()
cur.execute("USE ssm01")
cur.execute("CREATE TABLE pages(title varchar(200),content varchar(3000))")
random.seed(datetime.datetime.now())
#save a page to the database
def store(title, content):
    # use parameterized %s placeholders so pymysql handles quoting/escaping
    cur.execute("INSERT INTO pages (title, content) VALUES (%s, %s)", (title, content))
    cur.connection.commit()
#scrape one article, store it, and return its internal /wiki/ links
def getLinks(articleUrl):
    html = urlopen("http://en.wikipedia.org"+articleUrl)
    bsObj = BeautifulSoup(html, 'lxml')
    title = bsObj.find("h1").get_text()
    content = bsObj.find("div", {"id":"mw-content-text"}).find("p").get_text()
    store(title, content)
    return bsObj.find("div", {"id":"bodyContent"}).findAll("a", href=re.compile("^(/wiki/)((?!:).)*$"))
links = getLinks("/wiki/Kevin_Bacon")
try:
    while len(links) > 0:
        newArticle = links[random.randint(0, len(links)-1)].attrs["href"]
        print(newArticle)
        links = getLinks(newArticle)
finally:
    cur.close()
    conn.close()