Beautiful Soup 是一個能夠從HTML或XML文件中提取數據的Python庫。bs4 模塊的 BeautifulSoup 配合 requests 庫能夠寫簡單的爬蟲。
安裝
解析器
安裝命令:
requests
pandas
數據結構:
使用工具
Beautiful Soup 將複雜HTML文檔轉換成一個複雜的樹形結構,每一個節點都是Python對象,全部對象能夠概括爲4種:
Tag:Tag 對象與XML或HTML原生文檔中的tag相同,tag中最重要的屬性:name 和 attributes
從網頁中獲取指定標籤、屬性值,取值方式:
功能標籤
查找元素:
demo
import sys
import io
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import numpy as np
from py_teldcore import sqlserver_db as db
# Re-wrap stdout with the gb18030 codec so Chinese text prints correctly
# on consoles whose default encoding cannot represent it.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')
# Historical weather page for Hefei, December 2018 (tianqihoubao.com).
url = "http://www.tianqihoubao.com/lishi/hefei/month/201812.html"
def get_soap():
    """Fetch the weather page and parse it with BeautifulSoup.

    Returns:
        BeautifulSoup: the parsed document on success, or None when the
        request fails.  (The original returned the string "Request Error",
        which made callers crash later with AttributeError on .find; None
        is an explicit, testable failure sentinel.)
    """
    try:
        # Timeout keeps the script from hanging forever on a dead host.
        r = requests.get(url, timeout=10)
        # Treat HTTP 4xx/5xx responses as failures, not as parseable HTML.
        r.raise_for_status()
        return bs(r.text, "lxml")
    except requests.RequestException as e:
        # Narrow catch: only network/HTTP errors are expected here.
        print(e)
        return None
def save2cvs(data, path):
    """Write scraped weather rows to a CSV file.

    Args:
        data: sequence of rows, each (date, tq, temp, wind).
        path: destination file path.
    """
    result_weather = pd.DataFrame(data, columns=['date', 'tq', 'temp', 'wind'])
    # gbk encoding so the file opens correctly in Chinese-locale Excel.
    result_weather.to_csv(path, encoding='gbk')
    print('save weather success')  # fixed typo: "sucess" -> "success"
def save2mssql(data):
    """Bulk-insert weather rows into the SQL Server `Weather` table.

    Args:
        data: numpy 2-D array with columns (date, tq, temp, wind).
    """
    # Parameterized query: the driver binds the values, so row contents
    # are never formatted into the SQL string (no injection risk).
    sql = "Insert into Weather(date, tq, temp, wind) values(%s, %s, %s, %s)"
    # exec_sqlmany expects a list of tuples, one per row; data.tolist()
    # replaces the unidiomatic np.ndarray.tolist(data) spelling.
    sqlvalues = [tuple(row) for row in data.tolist()]
    try:
        db.exec_sqlmany(sql, sqlvalues)
    except Exception as e:
        # Best-effort: report the DB error without aborting the script.
        print(e)
def get_data():
    """Scrape the monthly weather table and return it as an (n, 4) array.

    Returns:
        np.ndarray of shape (n, 4): rows of (date, tq, temp, wind).

    Raises:
        RuntimeError: when the page could not be fetched (get_soap failed).
    """
    soap = get_soap()
    # Guard: get_soap returns a non-soup sentinel on request failure;
    # fail loudly instead of raising a confusing AttributeError below.
    # (Also removed the leftover debug `print(soap)` that dumped the
    # whole page to stdout.)
    if soap is None or isinstance(soap, str):
        raise RuntimeError("failed to fetch weather page")
    rows = soap.find("div", class_="wdetail").find("table").find_all("tr")
    # Skip the header row; collapse all whitespace inside each cell.
    cells = [
        "".join(td.get_text().split())
        for tr in rows[1:]
        for td in tr.find_all("td")
    ]
    return np.array(cells).reshape(-1, 4)
# Script entry point: scrape the monthly table, then persist the rows
# to SQL Server.
if __name__ == "__main__":
    weather_rows = get_data()
    save2mssql(weather_rows)
    print("save2 Sqlserver ok!")
參考資料