Notes on Extracting Website Data with Python

URL parsing examples

Parse a URL with the urllib.parse module:

from urllib.parse import urlparse
uc = urlparse('http://blog.sina.com.cn/s/blog_4e345ca90102wm9v.html?tj=fina')
print(uc)
print(uc.netloc)
# result
# ParseResult(scheme='http', netloc='blog.sina.com.cn', path='/s/blog_4e345ca90102wm9v.html', params='', query='tj=fina', fragment='')
# blog.sina.com.cn
from urllib.parse import urlparse

url = 'http://xa.58.com/hezu/33234645063725x.shtml?psid=157273217199223692452914449&ClickID=2&cookie=||https://www.google.com/|c5/njVqYsK5HO3MUB9jaAg==&PGTID=0d3090a7-001e-3db6-cece-a8d9e668e348&apptype=0&entinfo=33234645063725_0&fzbref=0&iuType=gz_2&key=&pubid=28056322&from=1-list-0&params=busitime^desc&local=483&trackkey=33234645063725_512fc4d7-75f8-4285-a8d2-25cb5d61d4d1_20180302100235_1519956155555&fcinfotype=gz'

uc = urlparse(url)

print("NetLoc: ", uc.netloc)
print("Path: ", uc.path)

q_cmds = uc.query.split('&')
print("Query Commands: ")
for q_cmd in q_cmds:
    print(q_cmd)
# result
# NetLoc: xa.58.com
# Path: /hezu/33234645063725x.shtml
# Query Commands: 
#psid=157273217199223692452914449
#ClickID=2
#.......

As the example above shows, changing the value after an equals sign in the query string takes you to a different page.
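For instance, the query string can be taken apart with parse_qs and rebuilt with urlencode. A minimal sketch (using a shortened version of the 58.com URL above, keeping only two of its parameters) that changes the ClickID value and reassembles the URL:

from urllib.parse import urlparse, parse_qs, urlencode, urlunparse

url = 'http://xa.58.com/hezu/33234645063725x.shtml?ClickID=2&apptype=0'  # shortened for readability
uc = urlparse(url)
params = parse_qs(uc.query)        # {'ClickID': ['2'], 'apptype': ['0']}
params['ClickID'] = ['3']          # change one query value
new_query = urlencode(params, doseq=True)  # doseq expands the list values
print(urlunparse(uc._replace(query=new_query)))
# http://xa.58.com/hezu/33234645063725x.shtml?ClickID=3&apptype=0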

Extracting page data

This exercise uses the requests module. If it is not installed, running pip install requests at the cmd command line is enough.

import requests

url = 'http://xa.58.com/hezu/33234645063725x.shtml?psid=157273217199223692452914449&ClickID=2&cookie=||https://www.google.com/|c5/njVqYsK5HO3MUB9jaAg==&PGTID=0d3090a7-001e-3db6-cece-a8d9e668e348&apptype=0&entinfo=33234645063725_0&fzbref=0&iuType=gz_2&key=&pubid=28056322&from=1-list-0&params=busitime^desc&local=483&trackkey=33234645063725_512fc4d7-75f8-4285-a8d2-25cb5d61d4d1_20180302100235_1519956155555&fcinfotype=gz'

lines = requests.get(url).text.splitlines()  # fetch the page with requests.get and split the text into lines
for i in range(15):
    print(lines[i])

# result
# <!DOCTYPE html>
# <html>
# <head>
# <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
# <title>
# 【7圖】(單間出租)土門 任家口 開遠門地鐵口 西電醫院 大慶路
# 豐禾路太奧廣場,灃惠北路42號-西安58同城 </title>
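One thing this snippet glosses over: requests guesses the page encoding from the HTTP headers, which can garble Chinese pages, and a failed request still returns a body. A small defensive sketch (reusing the Sina URL from the first example):

import requests

url = 'http://blog.sina.com.cn/s/blog_4e345ca90102wm9v.html'
r = requests.get(url)
r.raise_for_status()              # raise an exception on 4xx/5xx responses
r.encoding = r.apparent_encoding  # re-guess the charset from the body itself
print(r.status_code)              # 200 on success
print(r.text[:200])               # first 200 characters of the decoded page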

Extracting the content you want from a page with regular expressions

import requests
import re

regex = r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)"  # regular expression for email addresses
url = "http://blog.sina.com.cn/s/blog_147f99d5d0102vmyx.html"  # for testing only


html = requests.get(url).text
emails = re.findall(regex, html)

for email in emails:
    print(email)

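re.findall returns every occurrence, so the same address can show up many times. A tiny follow-up (assuming the emails list from the script above) that removes duplicates before printing:

for email in sorted(set(emails)):  # set() drops duplicates, sorted() gives stable output
    print(email)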

Page parsing and applications

This exercise uses the beautifulsoup4 module (install it with pip install beautifulsoup4).

The general workflow: fetch the page with the requests module and store the response as text in html; parse html with BeautifulSoup and keep the result in sp; from then on, the parsed data in sp can be accessed through the functions BeautifulSoup provides, which mostly operate on tags.

import requests
from bs4 import BeautifulSoup as bs

url = "http://blog.sina.com.cn/s/blog_147f99d5d0102vmyx.html"

html = requests.get(url).text
sp = bs(html, 'lxml')

links = sp.find_all('a')  # every <a> tag on the page
a = links[1]  # print the second link
print(a)

Extracting information with BeautifulSoup

  1. title: returns the page's title - sp.title
  2. text: strips out all HTML tags and returns the page as a single string - sp.text
  3. find: returns the first piece of content that matches - sp.find('img')
  4. find_all: returns all content that matches - sp.find_all('a')
  5. select: returns everything matched by a CSS selector, mainly used with ids and classes - sp.select('#Showtd')
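A short sketch exercising all five accessors on the test page used earlier ('#Showtd' is just the selector quoted in the list; it may well match nothing on this particular page):

import requests
from bs4 import BeautifulSoup as bs

url = "http://blog.sina.com.cn/s/blog_147f99d5d0102vmyx.html"
sp = bs(requests.get(url).text, 'lxml')

print(sp.title)               # the <title> tag of the page
print(sp.text[:100])          # the page as plain text, tags stripped
print(sp.find('img'))         # the first <img> tag, or None
print(len(sp.find_all('a')))  # how many <a> tags the page contains
print(sp.select('#Showtd'))   # elements matching the CSS id selector (possibly an empty list)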
import requests
import sys
from bs4 import BeautifulSoup as bs

if len(sys.argv) < 2:
    print("Usage: python **.py <target url>")  # exit with an error if no URL was supplied
    exit(1)

url = sys.argv[1]

html = requests.get(url).text
sp = bs(html, 'lxml')
all_links = sp.find_all('a')


for link in all_links:
    href = link.get('href')
    if href is not None and href.startswith('http://'):
        print(href)
# run from the command line, e.g.: python **.py http://www.baidu.com
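Note that the startswith('http://') filter above silently drops relative links as well as https:// ones. A more tolerant variant, sketched with urllib.parse.urljoin, resolves every href against the page URL instead:

import sys
import requests
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin

url = sys.argv[1]
sp = bs(requests.get(url).text, 'lxml')
for link in sp.find_all('a'):
    href = link.get('href')
    if href is not None:
        print(urljoin(url, href))  # relative paths are resolved against the page URL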

The same approach can be used to extract all of the image file links in a page and download the files.

import os
import sys
import requests
from bs4 import BeautifulSoup as bs
from urllib.parse import urlparse
from urllib.request import urlopen

if len(sys.argv) < 2:
    print("Usage: python **.py <target url>")  # exit with an error if no URL was supplied
    exit(1)

url = sys.argv[1]
domain = "{}://{}".format(urlparse(url).scheme, urlparse(url).hostname)
html = requests.get(url).text
sp = bs(html, 'lxml')
all_links = sp.find_all(['a', 'img'])  # collect both <a> and <img> tags

img_dir = "images"  # download directory (the original never defined it; the name is a choice made here)
if not os.path.exists(img_dir):
    os.mkdir(img_dir)

for link in all_links:
    # an <img> carries the file in src; an <a> may point to one via href
    for t in [link.get('src'), link.get('href')]:
        if t is None:
            continue  # the tag does not have this attribute
        if t.startswith('http'):
            full_path = t
        else:
            full_path = domain + t  # resolve a relative link against the site root
        print(full_path)
        filename = full_path.split('/')[-1]
        if '.' not in filename:
            continue  # no filename/extension to work with, skip
        ext = filename.split('.')[-1]
        filename = filename.split('.')[-2]
        # save jpg files as .jpg and everything else as .png
        if 'jpg' in ext:
            filename = filename + '.jpg'
        else:
            filename = filename + '.png'
        image = urlopen(full_path)  # download the file
        with open(os.path.join(img_dir, filename), 'wb') as fp:
            fp.write(image.read())
# run from the command line, e.g.: python **.py http://www.baidu.com