"Web Scraping with Python" Notes (1)

Part I: Building Scrapers

1. urllib

       1) urllib.request

       request.urlopen(url)

       request.urlretrieve can download a file given the file's URL

       2) urllib.parse

       3) urllib.error
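       A minimal sketch of the three submodules above, using the book's demo site as the target (the exact URLs are only illustrative):

       from urllib.request import urlopen, urlretrieve   # fetching pages and downloading files
       from urllib.parse import urljoin                  # building absolute URLs
       from urllib.error import HTTPError, URLError      # errors raised by urlopen

       try:
           html = urlopen("http://www.pythonscraping.com")   # returns a file-like response object
           print(html.read()[:200])                          # first 200 bytes of the page
       except (HTTPError, URLError) as e:
           print("Could not fetch the page:", e)

       # urljoin resolves a relative link against a base URL
       print(urljoin("http://www.pythonscraping.com", "/pages/page1.html"))

       # urlretrieve downloads a resource straight into a local file
       urlretrieve("http://www.pythonscraping.com/pages/page1.html", "page1.html")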

      

             

2. Exceptions:

       try...except...else...

       Common exceptions: HTTPError, AttributeError
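       A sketch along the lines of the book's getTitle() example: HTTPError covers a failed request, AttributeError covers a tag that is missing from the page (BeautifulSoup is introduced in section 3 below):

       from urllib.request import urlopen
       from urllib.error import HTTPError
       from bs4 import BeautifulSoup

       def getTitle(url):
           try:
               html = urlopen(url)
           except HTTPError:
               return None            # the server returned an error
           try:
               bsObj = BeautifulSoup(html.read(), 'lxml')
               title = bsObj.body.h1
           except AttributeError:
               return None            # the tag we wanted does not exist
           return title

       title = getTitle("http://www.pythonscraping.com/pages/page1.html")
       if title is None:
           print("Title could not be found")
       else:
           print(title)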

      

3.BeautifulSoup

       from bs4 import BeautifulSoup

       bsObj=BeautifulSoup(html,'lxml')

       1) Accessing tags as attributes:

       print(bsObj.text)

       print(bsObj.html)

       print(bsObj.p.a)

       ...

       2) find() and findAll():

       findAll(tag, attributes, recursive, text, limit, keywords)     # returns a ResultSet

       find(tag, attributes, recursive, text, keywords)   # returns a Tag

       With these you can filter an HTML page by the various attributes of its tags and pull out the group of tags, or the single tag, that you need.

       Examples:

       .findAll({"h1","h2","h3","h4","h5","h6"})

       .findAll("span", {"class":{"green", "red"}})

       .findAll(id="text")   same as .findAll("", {"id":"text"})

       .findAll(src=True)    all tags that have a src attribute

       3) Extracting text:

       get_text() strips every tag out of the HTML document you are working with and returns a str containing only the text

       4) Navigating the tree (returns Tag / NavigableString objects)

       .children (all direct children)

       .next_sibling (the next sibling tag) / .next_siblings (all following siblings)

       .previous_sibling (the previous sibling tag) / .previous_siblings (all preceding siblings)

       .parent (the direct parent tag) / .parents (all ancestors)
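       Sketches of these navigation attributes, using the book's gift-list demo page (the table id and image path below come from that page; adjust them for other sites):

       from urllib.request import urlopen
       from bs4 import BeautifulSoup

       html = urlopen("http://www.pythonscraping.com/pages/page3.html")
       bsObj = BeautifulSoup(html, 'lxml')

       # .children: only the direct children of the table (its rows, not every nested tag)
       for child in bsObj.find("table", {"id": "giftList"}).children:
           print(child)

       # .next_siblings: every row that follows the header row
       for sibling in bsObj.find("table", {"id": "giftList"}).tr.next_siblings:
           print(sibling)

       # .parent / .previous_sibling: from an image, go up to its cell, then across to the price cell
       print(bsObj.find("img", {"src": "../img/gifts/img1.jpg"}).parent.previous_sibling.get_text())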

       5) Tag attributes:

       .attrs      all attributes of the tag (as a dict)

       .attrs['src']      the value of the src attribute
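       A quick sketch of .attrs on the same demo page (src=True just grabs the first tag that actually has a src attribute):

       from urllib.request import urlopen
       from bs4 import BeautifulSoup

       bsObj = BeautifulSoup(urlopen("http://www.pythonscraping.com/pages/page3.html"), 'lxml')
       img = bsObj.find("img", src=True)   # first tag with a src attribute
       print(img.attrs)                    # dict of every attribute on the tag
       print(img.attrs['src'])             # just the value of src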

       6) Regular expressions (a compiled regex can be passed into findAll; see the sketch below)
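       A sketch of passing a compiled regular expression into findAll, again on the gift-list demo page (the src pattern follows the book's example):

       import re
       from urllib.request import urlopen
       from bs4 import BeautifulSoup

       bsObj = BeautifulSoup(urlopen("http://www.pythonscraping.com/pages/page3.html"), 'lxml')
       # Match only the product images by their src path
       images = bsObj.findAll("img", {"src": re.compile(r"\.\./img/gifts/img.*\.jpg")})
       for image in images:
           print(image["src"])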

       7) Lambda expressions

       # Get tags that have exactly two attributes:

       bsObj.findAll(lambda tag: len(tag.attrs) == 2)

      

 

4.Scrapy

       //TODO

 

5.JSON

       JSON objects convert to dicts,

       JSON arrays convert to lists,

       JSON strings convert to Python strings.

       Commonly used functions: loads, get (see example 5-1 below)

 

6. Storing Data

       1) Downloading files

              from urllib.request import urlretrieve

              urlretrieve(resourceLocation,fileName)

       2)CSV(Comma-Separated Values)

              import csv

              csvFile = open("test.csv", "w+", newline='', encoding='utf-8')  # newline='' keeps csv from writing blank rows on Windows

              try:

                     writer=csv.writer(csvFile)

                     writer.writerow(('青山隱隱水迢迢 秋盡江南草未凋','24橋明月夜'))

                     for i in range(1,5):

                            writer.writerow((i,i+2,i*2))

              finally:

                     csvFile.close()

       3)MySQL

              import pymysql

              # Get a connection and a cursor

              conn=pymysql.connect(host='localhost',user='root',passwd=None)

              cur=conn.cursor()

              # Execute SQL statements

              cur.execute('use ssm01')

              cur.execute('select * from user')

              print(cur.fetchone())  # fetch one row

              # Close resources

              cur.close()

              conn.close()

       4)Email

       //TODO

 

7. Reading Documents

       1) Reading txt files

       from urllib.request import urlopen    

       txt=urlopen('http://www.pythonscraping.com/pages/warandpeace/chapter1.txt')

       print(txt.read())

      

       2) Reading CSV files

       # Read the file from the web straight into a string, then wrap it in a StringIO object so it behaves like a file.

       from urllib.request import urlopen

       from io import StringIO

       import csv

       data = urlopen('http://pythonscraping.com/files/MontyPythonAlbums.csv').read().decode('utf-8')

       dataFile=StringIO(data)

       csvFile=csv.reader(dataFile)

       for row in csvFile:

              print(row)

      

       3) Reading PDF files

       # PDFMiner3K

       # Read any PDF into a string, then use StringIO to turn it into a file object

       from urllib.request import urlopen

       from pdfminer.pdfinterp import PDFResourceManager, process_pdf

       from pdfminer.converter import TextConverter

       from pdfminer.layout import LAParams

       from io import StringIO

       def readPDF(pdfFile):

              rsrcmgr = PDFResourceManager()

              retstr = StringIO()

              laparams = LAParams()

              device = TextConverter(rsrcmgr, retstr, laparams=laparams)

              process_pdf(rsrcmgr, device, pdfFile)

              device.close()

              content = retstr.getvalue()

              retstr.close()

              return content

       pdfFile = urlopen("http://pythonscraping.com/pages/warandpeace/chapter1.pdf")

       outputString = readPDF(pdfFile)

       print(outputString)

       pdfFile.close()

 

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 

 

3-1. Crawling across the internet

       # Start at http://oreilly.com and hop randomly from one external link to another.

       from urllib.request import urlopen

       from bs4 import BeautifulSoup

       import re

       import datetime

       import random

       pages = set()

       random.seed(datetime.datetime.now().timestamp())  # seed with a number; newer Python rejects datetime objects

       # Get a list of all internal links on the page

       def getInternalLinks(bsObj, includeUrl):

              internalLinks = []

              # Find all links that begin with "/" or contain the site's URL

              for link in bsObj.findAll("a", href=re.compile("^(/|.*"+includeUrl+")")):

                     if link.attrs['href'] is not None:

                            if link.attrs['href'] not in internalLinks:

                                   internalLinks.append(link.attrs['href'])

              return internalLinks

       # Get a list of all external links on the page

       def getExternalLinks(bsObj, excludeUrl):

              externalLinks = []

              # Find all links that start with "http" or "www" and do not contain the current URL

              for link in bsObj.findAll("a",href=re.compile("^(http|www)((?!"+excludeUrl+").)*$")):

                     if link.attrs['href'] is not None:

                            if link.attrs['href'] not in externalLinks:

                                   externalLinks.append(link.attrs['href'])

              return externalLinks

       def splitAddress(address):

              addressParts = address.replace("http://", "").split("/")

              return addressParts

       def getRandomExternalLink(startingPage):

              html = urlopen(startingPage)

              bsObj = BeautifulSoup(html,'lxml')

              externalLinks = getExternalLinks(bsObj, splitAddress(startingPage)[0])

              if len(externalLinks) == 0:

                      # No external links on this page: pick a random internal link and search again from there

                      internalLinks = getInternalLinks(bsObj, splitAddress(startingPage)[0])

                      return getRandomExternalLink(internalLinks[random.randint(0, len(internalLinks)-1)])

              else:

                     return externalLinks[random.randint(0, len(externalLinks)-1)]

       def followExternalOnly(startingSite):

               externalLink = getRandomExternalLink(startingSite)

               print("Random external link is: " + externalLink)

              followExternalOnly(externalLink)

       followExternalOnly("http://oreilly.com")

 

5-1.JSON

       import json

       jsonString='{\

              "arrayOfNums":[{"number":0},{"number":1},{"number":2}],\

              "arrayOfFruits":[{"fruit":"apple"},{"fruit":"banana"},{"fruit":"pear"}]\

              }'

       jsonObj=json.loads(jsonString)

       print(jsonObj.get("arrayOfFruits")[2].get("fruit"))

 

6-1. Download all the images from http://pythonscraping.com

       from urllib.request import urlretrieve

       from urllib.request import urlopen

       from bs4 import BeautifulSoup

       import re

       def pageSrc(url):

              html=urlopen(url)

              bsObj=BeautifulSoup(html,'lxml')

              srcList=bsObj.findAll("img",src=True)

              urlList=[]

              for i in srcList:

                     urlList.append(i['src'])

              return urlList

       def getInternalLinks(bsObj,includeUrl):

                     internalLinks = []

                     # Find all links that begin with "/" or contain the site's URL

                     for link in bsObj.findAll("a", href=re.compile("^(/|.*"+includeUrl+")")):

                            if link.attrs['href'] is not None:

                                   if link.attrs['href'] not in internalLinks:

                                          internalLinks.append(link.attrs['href'])

                     return internalLinks

       def allimgs(url):

              # Find all img src values on this page

              srcset=set()

              for i in pageSrc(url):

                     if i not in srcset:

                            print(i)

                            srcset.add(i)

                            name=i.split('/').pop()

                            urlretrieve(i,name)

              # Find all internal links on this page

              html=urlopen(url)

              bsObj=BeautifulSoup(html,'lxml')

              for i in getInternalLinks(bsObj,url):

                     newUrl=url+i

                     for j in pageSrc(newUrl):

                            if j not in srcset:

                                    srcset.add(j)

                                   print(j)

                                   name=j.split('/').pop()

                                   urlretrieve(j,name)

       url="http://pythonscraping.com"

       allimgs(url)

 

 

6-2. Storing to CSV

       # Grab an HTML table and write it to a CSV file

       import csv

       from urllib.request import urlopen

       from bs4 import BeautifulSoup

       html = urlopen("http://en.wikipedia.org/wiki/Comparison_of_text_editors")

       bsObj = BeautifulSoup(html,'lxml')

       # The main comparison table is the first table on the current page

       table = bsObj.findAll("table",{"class":"wikitable"})[0]

       rows = table.findAll("tr")

       csvFile = open("editors.csv", 'wt', newline='',encoding='utf-8')

       writer = csv.writer(csvFile)

       try:

              for row in rows:

                     csvRow = []

                     for cell in row.findAll(['td', 'th']):

                            csvRow.append(cell.get_text()[:-1])

                     print(csvRow)

                     writer.writerow(csvRow)

       finally:

              csvFile.close()

6-3. Storing to MySQL

       # Store Wikipedia article data

       from urllib.request import urlopen

       from bs4 import BeautifulSoup

       import re

       import datetime

       import random

       import pymysql

 

       conn = pymysql.connect(host='127.0.0.1',user='root', passwd=None, charset='utf8')

       cur = conn.cursor()

       cur.execute("USE ssm01")

       cur.execute("CREATE TABLE pages(title varchar(200),content varchar(3000))")

       random.seed(datetime.datetime.now().timestamp())  # seed with a number; newer Python rejects datetime objects

       # Store into the database

       def store(title, content):

              cur.execute("INSERT INTO pages (title, content) VALUES (\"%s\",\"%s\")", (title, content))

              cur.connection.commit()

       # Find the data and store it in the database

       def getLinks(articleUrl):

              html = urlopen("http://en.wikipedia.org"+articleUrl)

              bsObj = BeautifulSoup(html,'lxml')

              title = bsObj.find("h1").get_text()

              content = bsObj.find("div", {"id":"mw-content-text"}).find("p").get_text()

              store(title, content)

              return bsObj.find("div", {"id":"bodyContent"}).findAll("a",href=re.compile("^(/wiki/)((?!:).)*$"))

       links = getLinks("/wiki/Kevin_Bacon")

       try:

              while len(links) > 0:

                     newArticle = links[random.randint(0, len(links)-1)].attrs["href"]

                     print(newArticle)

                     links = getLinks(newArticle)

       finally:

              cur.close()

              conn.close()
