使用 urllib.parse 模塊來解析網址
from urllib.parse import urlparse

# Split a URL into its six components
# (scheme, netloc, path, params, query, fragment).
url = 'http://blog.sina.com.cn/s/blog_4e345ca90102wm9v.html?tj=fina'
uc = urlparse(url)
# Show the whole ParseResult first, then just the host part.
for value in (uc, uc.netloc):
    print(value)
# result
# ParseResult(scheme='http', netloc='blog.sina.com.cn', path='/s/blog_4e345ca90102wm9v.html', params='', query='tj=fina', fragment='')
# blog.sina.com.cn
複製代碼
from urllib.parse import urlparse

# Parse a long 58.com listing URL and print each query parameter on
# its own line.
# NOTE(review): the blog's copy of this URL contains "¶ms=" — the source
# page's "&params=" was mangled into the &para; HTML entity. Reproduced
# verbatim here to preserve the original behavior; fix before real use.
ur1 = 'http://xa.58.com/hezu/33234645063725x.shtml?psid=157273217199223692452914449&ClickID=2&cookie=||https://www.google.com/|c5/njVqYsK5HO3MUB9jaAg==&PGTID=0d3090a7-001e-3db6-cece-a8d9e668e348&apptype=0&entinfo=33234645063725_0&fzbref=0&iuType=gz_2&key=&pubid=28056322&from=1-list-0¶ms=busitime^desc&local=483&trackkey=33234645063725_512fc4d7-75f8-4285-a8d2-25cb5d61d4d1_20180302100235_1519956155555&fcinfotype=gz'
uc = urlparse(ur1)
print("NetLoc: ", uc.netloc)
print("Path: ", uc.path)
# The query string is a series of key=value pairs separated by '&'.
q_cmds = uc.query.split('&')
print("Query Commands: ")
for q_cmd in q_cmds:
    print(q_cmd)
# result
# NetLoc: xa.58.com
# Path: /hezu/33234645063725x.shtml
# Query Commands:
#psid=157273217199223692452914449
#ClickID=2
#.......
複製代碼
從上面的例子能夠看出來,改變請求(query)等號後面的數字,就會到不一樣的頁面。
在練習中,使用 requests 模塊,若是沒有安裝這個模塊,在 cmd 命令行直接 pip install requests 就能夠了。
from urllib.parse import urlparse
import requests

# Fetch a listing page with requests and print the first 15 lines of
# its HTML source.
# NOTE(review): the blog's copy of this URL contains "¶ms=" — the source
# page's "&params=" was mangled into the &para; HTML entity. Reproduced
# verbatim here; fix before real use.
ur1 = 'http://xa.58.com/hezu/33234645063725x.shtml?psid=157273217199223692452914449&ClickID=2&cookie=||https://www.google.com/|c5/njVqYsK5HO3MUB9jaAg==&PGTID=0d3090a7-001e-3db6-cece-a8d9e668e348&apptype=0&entinfo=33234645063725_0&fzbref=0&iuType=gz_2&key=&pubid=28056322&from=1-list-0¶ms=busitime^desc&local=483&trackkey=33234645063725_512fc4d7-75f8-4285-a8d2-25cb5d61d4d1_20180302100235_1519956155555&fcinfotype=gz'
# Fetch the page body as text and split it into individual lines.
# (The original also computed urlparse(ur1) here but never used it.)
htm1 = requests.get(ur1).text.splitlines()
# Print the first 15 lines instead of indexing with range(0, 15);
# slicing also avoids an IndexError on very short responses.
for line in htm1[:15]:
    print(line)
# result
# <!DOCTYPE html>
# <html>
# <head>
# <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
# <title>
# 【7圖】(單間出租)土門 任家口 開遠門地鐵口 西電醫院 大慶路
# 豐禾路太奧廣場,灃惠北路42號-西安58同城 </title>
複製代碼
from urllib.parse import urlparse
import requests
import re
# Regular expression matching e-mail addresses (user@domain.tld).
regax = r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)"
ur1 = "http://blog.sina.com.cn/s/blog_147f99d5d0102vmyx.html"  # test page only
# Download the page and collect every e-mail address found in it.
htm1 = requests.get(ur1).text
emails = re.findall(regax, htm1)
for address in emails:
    print(address)
複製代碼
在此次的練習中使用 beautifulsoup4 模塊,若是沒有安裝,執行 pip install beautifulsoup4 便可。
通常的操做是:用 requests 模塊提取網頁數據,將提取的數據轉換成文本以後存放在 htm1 中,而後把 htm1 使用 beautifulsoup 加以解析,解析以後放在 sp 中,以後就能夠用 beautifulsoup 所提供的函數存取 sp 中解析好的數據了,這些函數主要是經過標籤來操做的。
from urllib.parse import urlparse
import requests
import re
from bs4 import BeautifulSoup as bs

# Download a page, parse it with BeautifulSoup (lxml backend), and
# print the second <a> link found in the document.
ur1 = "http://blog.sina.com.cn/s/blog_147f99d5d0102vmyx.html"
htm1 = requests.get(ur1).text
sp = bs(htm1, 'lxml')
links = sp.find_all('a')  # every <a> tag in the document
a = links[1]              # the second link (index 1)
print(a)
複製代碼
使用 beautifulsoup 提取信息
import requests
import sys
from bs4 import BeautifulSoup as bs

# Usage: python **.py <target url>
# Fetches the page given on the command line and prints every absolute
# http(s) link found in its <a href="..."> tags.
if len(sys.argv) < 2:
    print("用法:python **.py <target url>")  # no target URL supplied
    exit(1)
ur1 = sys.argv[1]
htm1 = requests.get(ur1).text
sp = bs(htm1, 'lxml')
all_links = sp.find_all('a')
for link in all_links:
    href = link.get('href')
    # Fix: "is not None" instead of "!= None" (PEP 8), and also accept
    # https:// URLs — the original's 'http://' prefix silently skipped
    # every secure link.
    if href is not None and href.startswith(('http://', 'https://')):
        print(href)
# Run from cmd, e.g.:  python **.py http://www.baidu.com
複製代碼
用一樣的方法提取網頁中全部的圖像文件連接
import os
import requests
import sys
from bs4 import BeautifulSoup as bs
from urllib.parse import urlparse, urljoin

# Usage: python **.py <target url>
# Extracts every link/image reference from the page and downloads each
# target into a local "images" directory as .jpg or .png.
if len(sys.argv) < 2:
    print("用法:python **.py <target url>")  # no target URL supplied
    exit(1)
url = sys.argv[1]
# Fix: the original called the misspelled name "ur1parse" (NameError).
domain = "{}://{}".format(urlparse(url).scheme, urlparse(url).hostname)
htm1 = requests.get(url).text
sp = bs(htm1, 'lxml')
# Fix: find_all('a', 'img') means "<a> tags with CSS class img"; a list
# of tag names matches both <a> and <img> as intended.
all_links = sp.find_all(['a', 'img'])
img_dir = 'images'  # fix: original referenced undefined img_dir/image_dir
for link in all_links:
    # <img> carries src, <a> carries href; either attribute may be absent.
    targets = [link.get('src'), link.get('href')]
    for t in targets:
        if t is None:  # fix: original crashed on missing attributes
            continue
        # Fix: relative links must be joined onto the domain — the
        # original replaced them with the bare domain, losing the path.
        full_path = t if t.startswith('http') else urljoin(domain, t)
        print(full_path)
        if not os.path.exists(img_dir):
            os.mkdir(img_dir)
        filename = full_path.split('/')[-1]
        if '.' not in filename:
            continue  # no extension to inspect; skip (original crashed here)
        ext = filename.split('.')[-1]
        filename = filename.split('.')[-2]
        filename = filename + ('.jpg' if 'jpg' in ext else '.png')
        # Fix: download via requests (urlopen was never imported); the
        # original also had os.patn (typo) and an unquoted wb mode.
        image = requests.get(full_path)
        with open(os.path.join(img_dir, filename), 'wb') as fp:
            fp.write(image.content)
複製代碼