"Web Scraping with Python" Notes (1)

Part I: Building Scrapers

1. urllib

       1) urllib.request

       request.urlopen(url)

       request.urlretrieve can download a file given the file's URL

       2) urllib.parse

       3) urllib.error
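       A minimal sketch of the three submodules above, using the book's demo site as the target (the exact URLs are only illustrative):

       from urllib.request import urlopen, urlretrieve   # fetching pages and downloading files
       from urllib.parse import urljoin                  # building absolute URLs
       from urllib.error import HTTPError, URLError      # errors raised by urlopen

       try:
           html = urlopen("http://www.pythonscraping.com")   # returns a file-like response object
           print(html.read()[:200])                          # first 200 bytes of the page
       except (HTTPError, URLError) as e:
           print("Could not fetch the page:", e)

       # urljoin resolves a relative link against a base URL
       print(urljoin("http://www.pythonscraping.com", "/pages/page1.html"))

       # urlretrieve downloads a resource straight into a local file
       urlretrieve("http://www.pythonscraping.com/pages/page1.html", "page1.html")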

      

             

2. Exceptions:

       try...except...else...

       Common exceptions: HTTPError, AttributeError
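       A sketch along the lines of the book's getTitle() example: HTTPError covers a failed request, AttributeError covers a tag that is missing from the page (BeautifulSoup is introduced in section 3 below):

       from urllib.request import urlopen
       from urllib.error import HTTPError
       from bs4 import BeautifulSoup

       def getTitle(url):
           try:
               html = urlopen(url)
           except HTTPError:
               return None            # the server returned an error
           try:
               bsObj = BeautifulSoup(html.read(), 'lxml')
               title = bsObj.body.h1
           except AttributeError:
               return None            # the tag we wanted does not exist
           return title

       title = getTitle("http://www.pythonscraping.com/pages/page1.html")
       if title is None:
           print("Title could not be found")
       else:
           print(title)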

      

3.BeautifulSoup

       from bs4 import BeautifulSoup

       bsObj=BeautifulSoup(html,'lxml')

       1) Accessing tags as attributes:

       print(bsObj.text)

       print(bsObj.html)

       print(bsObj.p.a)

       ...

       2) find() and findAll():

       findAll(tag, attributes, recursive, text, limit, keywords)     # returns a ResultSet

       find(tag, attributes, recursive, text, keywords)   # returns a Tag

       With these you can filter an HTML page by the various attributes of its tags and pull out the group of tags, or the single tag, that you need.

       Examples:

       .findAll({"h1","h2","h3","h4","h5","h6"})

       .findAll("span", {"class":{"green", "red"}})

       .findAll(id="text")   same as .findAll("", {"id":"text"})

       .findAll(src=True)    all tags that have a src attribute

       3) Extracting text:

       get_text() strips every tag out of the HTML document you are working with and returns a str containing only the text

       4) Navigating the tree (returns Tag / NavigableString objects)

       .children (all direct children)

       .next_sibling (the next sibling tag) / .next_siblings (all following siblings)

       .previous_sibling (the previous sibling tag) / .previous_siblings (all preceding siblings)

       .parent (the direct parent tag) / .parents (all ancestors)
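       Sketches of these navigation attributes, using the book's gift-list demo page (the table id and image path below come from that page; adjust them for other sites):

       from urllib.request import urlopen
       from bs4 import BeautifulSoup

       html = urlopen("http://www.pythonscraping.com/pages/page3.html")
       bsObj = BeautifulSoup(html, 'lxml')

       # .children: only the direct children of the table (its rows, not every nested tag)
       for child in bsObj.find("table", {"id": "giftList"}).children:
           print(child)

       # .next_siblings: every row that follows the header row
       for sibling in bsObj.find("table", {"id": "giftList"}).tr.next_siblings:
           print(sibling)

       # .parent / .previous_sibling: from an image, go up to its cell, then across to the price cell
       print(bsObj.find("img", {"src": "../img/gifts/img1.jpg"}).parent.previous_sibling.get_text())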

       5) Tag attributes:

       .attrs      all attributes of the tag (as a dict)

       .attrs['src']      the value of the src attribute
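       A quick sketch of .attrs on the same demo page (src=True just grabs the first tag that actually has a src attribute):

       from urllib.request import urlopen
       from bs4 import BeautifulSoup

       bsObj = BeautifulSoup(urlopen("http://www.pythonscraping.com/pages/page3.html"), 'lxml')
       img = bsObj.find("img", src=True)   # first tag with a src attribute
       print(img.attrs)                    # dict of every attribute on the tag
       print(img.attrs['src'])             # just the value of src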

       6) Regular expressions (a compiled regex can be passed into findAll; see the sketch below)
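       A sketch of passing a compiled regular expression into findAll, again on the gift-list demo page (the src pattern follows the book's example):

       import re
       from urllib.request import urlopen
       from bs4 import BeautifulSoup

       bsObj = BeautifulSoup(urlopen("http://www.pythonscraping.com/pages/page3.html"), 'lxml')
       # Match only the product images by their src path
       images = bsObj.findAll("img", {"src": re.compile(r"\.\./img/gifts/img.*\.jpg")})
       for image in images:
           print(image["src"])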

       7) Lambda expressions

       # Get tags that have exactly two attributes:

       bsObj.findAll(lambda tag: len(tag.attrs) == 2)

      

 

4.Scrapy

       //TODO

 

5.JSON

       JSON objects convert to dicts,

       JSON arrays convert to lists,

       JSON strings convert to Python strings.

       Commonly used functions: loads, get (see example 5-1 below)

 

6. Storing Data

       1) Downloading files

              from urllib.request import urlretrieve

              urlretrieve(resourceLocation,fileName)

       2)CSV(Comma-Separated Values)

              import csv

              csvFile = open("test.csv", "w+", newline='', encoding='utf-8')  # newline='' keeps csv from writing blank rows on Windows

              try:

                     writer=csv.writer(csvFile)

                     writer.writerow(('青山隱隱水迢迢 秋盡江南草未凋','24橋明月夜'))

                     for i in range(1,5):

                            writer.writerow((i,i+2,i*2))

              finally:

                     csvFile.close()

       3)MySQL

              import pymysql

              # Get a connection and a cursor

              conn=pymysql.connect(host='localhost',user='root',passwd=None)

              cur=conn.cursor()

              # Execute SQL statements

              cur.execute('use ssm01')

              cur.execute('select * from user')

              print(cur.fetchone())  # fetch one row

              # Close resources

              cur.close()

              conn.close()

       4)Email

       //TODO

 

7. Reading Documents

       1) Reading txt files

       from urllib.request import urlopen    

       txt=urlopen('http://www.pythonscraping.com/pages/warandpeace/chapter1.txt')

       print(txt.read())

      

       2) Reading CSV files

       # Read the file from the web straight into a string, then wrap it in a StringIO object so it behaves like a file.

       from urllib.request import urlopen

       from io import StringIO

       import csv

       data = urlopen('http://pythonscraping.com/files/MontyPythonAlbums.csv').read().decode('utf-8')

       dataFile=StringIO(data)

       csvFile=csv.reader(dataFile)

       for row in csvFile:

              print(row)

      

       3) Reading PDF files

       # PDFMiner3K

       # Read any PDF into a string, then use StringIO to turn it into a file object

       from urllib.request import urlopen

       from pdfminer.pdfinterp import PDFResourceManager, process_pdf

       from pdfminer.converter import TextConverter

       from pdfminer.layout import LAParams

       from io import StringIO

       def readPDF(pdfFile):

              rsrcmgr = PDFResourceManager()

              retstr = StringIO()

              laparams = LAParams()

              device = TextConverter(rsrcmgr, retstr, laparams=laparams)

              process_pdf(rsrcmgr, device, pdfFile)

              device.close()

              content = retstr.getvalue()

              retstr.close()

              return content

       pdfFile = urlopen("http://pythonscraping.com/pages/warandpeace/chapter1.pdf")

       outputString = readPDF(pdfFile)

       print(outputString)

       pdfFile.close()

 

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 

 

3-1. Crawling across the internet

       # Start at http://oreilly.com and hop randomly from one external link to another.

       from urllib.request import urlopen

       from bs4 import BeautifulSoup

       import re

       import datetime

       import random

       pages = set()

       random.seed(datetime.datetime.now().timestamp())  # seed with a number; newer Python rejects datetime objects

       # Get a list of all internal links on the page

       def getInternalLinks(bsObj, includeUrl):

              internalLinks = []

              # Find all links that begin with "/" or contain the site's URL

              for link in bsObj.findAll("a", href=re.compile("^(/|.*"+includeUrl+")")):

                     if link.attrs['href'] is not None:

                            if link.attrs['href'] not in internalLinks:

                                   internalLinks.append(link.attrs['href'])

              return internalLinks

       # Get a list of all external links on the page

       def getExternalLinks(bsObj, excludeUrl):

              externalLinks = []

              # Find all links that start with "http" or "www" and do not contain the current URL

              for link in bsObj.findAll("a",href=re.compile("^(http|www)((?!"+excludeUrl+").)*$")):

                     if link.attrs['href'] is not None:

                            if link.attrs['href'] not in externalLinks:

                                   externalLinks.append(link.attrs['href'])

              return externalLinks

       def splitAddress(address):

              addressParts = address.replace("http://", "").split("/")

              return addressParts

       def getRandomExternalLink(startingPage):

              html = urlopen(startingPage)

              bsObj = BeautifulSoup(html,'lxml')

              externalLinks = getExternalLinks(bsObj, splitAddress(startingPage)[0])

              if len(externalLinks) == 0:

                      # No external links on this page: pick a random internal link and search again from there

                      internalLinks = getInternalLinks(bsObj, splitAddress(startingPage)[0])

                      return getRandomExternalLink(internalLinks[random.randint(0, len(internalLinks)-1)])

              else:

                     return externalLinks[random.randint(0, len(externalLinks)-1)]

       def followExternalOnly(startingSite):

               externalLink = getRandomExternalLink(startingSite)

               print("Random external link is: " + externalLink)

              followExternalOnly(externalLink)

       followExternalOnly("http://oreilly.com")

 

5-1.JSON

       import json

       jsonString='{\

              "arrayOfNums":[{"number":0},{"number":1},{"number":2}],\

              "arrayOfFruits":[{"fruit":"apple"},{"fruit":"banana"},{"fruit":"pear"}]\

              }'

       jsonObj=json.loads(jsonString)

       print(jsonObj.get("arrayOfFruits")[2].get("fruit"))

 

6-1. Download all the images from http://pythonscraping.com

       from urllib.request import urlretrieve

       from urllib.request import urlopen

       from bs4 import BeautifulSoup

       import re

       def pageSrc(url):

              html=urlopen(url)

              bsObj=BeautifulSoup(html,'lxml')

              srcList=bsObj.findAll("img",src=True)

              urlList=[]

              for i in srcList:

                     urlList.append(i['src'])

              return urlList

       def getInternalLinks(bsObj,includeUrl):

                     internalLinks = []

                     # Find all links that begin with "/" or contain the site's URL

                     for link in bsObj.findAll("a", href=re.compile("^(/|.*"+includeUrl+")")):

                            if link.attrs['href'] is not None:

                                   if link.attrs['href'] not in internalLinks:

                                          internalLinks.append(link.attrs['href'])

                     return internalLinks

       def allimgs(url):

              # Find all img src values on this page

              srcset=set()

              for i in pageSrc(url):

                     if i not in srcset:

                            print(i)

                            srcset.add(i)

                            name=i.split('/').pop()

                            urlretrieve(i,name)

              # Find all internal links on this page

              html=urlopen(url)

              bsObj=BeautifulSoup(html,'lxml')

              for i in getInternalLinks(bsObj,url):

                     newUrl=url+i

                     for j in pageSrc(newUrl):

                            if j not in srcset:

                                    srcset.add(j)

                                   print(j)

                                   name=j.split('/').pop()

                                   urlretrieve(j,name)

       url="http://pythonscraping.com"

       allimgs(url)

 

 

6-2. Storing to CSV

       # Grab an HTML table and write it to a CSV file

       import csv

       from urllib.request import urlopen

       from bs4 import BeautifulSoup

       html = urlopen("http://en.wikipedia.org/wiki/Comparison_of_text_editors")

       bsObj = BeautifulSoup(html,'lxml')

       # The main comparison table is the first table on the current page

       table = bsObj.findAll("table",{"class":"wikitable"})[0]

       rows = table.findAll("tr")

       csvFile = open("editors.csv", 'wt', newline='',encoding='utf-8')

       writer = csv.writer(csvFile)

       try:

              for row in rows:

                     csvRow = []

                     for cell in row.findAll(['td', 'th']):

                            csvRow.append(cell.get_text()[:-1])

                     print(csvRow)

                     writer.writerow(csvRow)

       finally:

              csvFile.close()

6-3. Storing to MySQL

       # Store Wikipedia article data

       from urllib.request import urlopen

       from bs4 import BeautifulSoup

       import re

       import datetime

       import random

       import pymysql

 

       conn = pymysql.connect(host='127.0.0.1',user='root', passwd=None, charset='utf8')

       cur = conn.cursor()

       cur.execute("USE ssm01")

       cur.execute("CREATE TABLE pages(title varchar(200),content varchar(3000))")

       random.seed(datetime.datetime.now().timestamp())  # seed with a number; newer Python rejects datetime objects

       # Store into the database

       def store(title, content):

              cur.execute("INSERT INTO pages (title, content) VALUES (\"%s\",\"%s\")", (title, content))

              cur.connection.commit()

       # Find the data and store it in the database

       def getLinks(articleUrl):

              html = urlopen("http://en.wikipedia.org"+articleUrl)

              bsObj = BeautifulSoup(html,'lxml')

              title = bsObj.find("h1").get_text()

              content = bsObj.find("div", {"id":"mw-content-text"}).find("p").get_text()

              store(title, content)

              return bsObj.find("div", {"id":"bodyContent"}).findAll("a",href=re.compile("^(/wiki/)((?!:).)*$"))

       links = getLinks("/wiki/Kevin_Bacon")

       try:

              while len(links) > 0:

                     newArticle = links[random.randint(0, len(links)-1)].attrs["href"]

                     print(newArticle)

                     links = getLinks(newArticle)

       finally:

              cur.close()

              conn.close()
