本文以抽取證券之星基金頁面中的一個小表格爲例(XPath 的具體用法請自行參考菜鳥驛站)。
import requests
from lxml import etree
import csv
def gethtml(url, headers):
    """Download a page and return its body decoded as GB18030 text.

    Args:
        url: Address of the page to fetch.
        headers: Mapping of HTTP request headers to send with the request
            (for example a browser-like ``User-Agent``).

    Returns:
        The response body as ``str``, or ``None`` when the request fails
        (connection problem, timeout, or an HTTP error status). In the
        failure case ``Error`` is printed to standard output.
    """
    try:
        # Forward the caller's headers; the timeout keeps a stalled server
        # from hanging the script forever.
        resp = requests.get(url, headers=headers, timeout=30)
        resp.raise_for_status()
    except requests.RequestException:
        # Best-effort contract: report the failure and hand back None
        # instead of propagating the exception.
        print('Error')
        return None
    # Decode as gb18030 regardless of the charset reported by the response.
    resp.encoding = 'gb18030'
    return resp.text
def getcontent(html):
    """Extract the text of the ``datalist`` table, grouped into rows.

    Every text node found beneath the ``tr`` children of the element whose
    id is ``datalist`` is collected in document order and split into
    consecutive groups of eight values, one group per CSV row. According
    to the original author's note, the table's title row is not part of
    the result.

    Args:
        html: Page markup as a string. ``None`` or an empty string (for
            example after a failed download) is accepted.

    Returns:
        A list of lists of strings. Each inner list holds up to eight
        values; only the final one can be shorter. An empty list is
        returned when there is no markup to parse.
    """
    if not html:
        # Nothing was downloaded, so there is nothing to extract.
        return []
    tree = etree.HTML(html)
    if tree is None:
        # Defensive: the parser produced no document tree.
        return []
    cells = tree.xpath('//*[@id="datalist"]/tr//text()')
    # Slice into rows of eight so each chunk maps onto one CSV line.
    return [cells[start:start + 8] for start in range(0, len(cells), 8)]
def save_data(fname, td1):
    """Write the extracted rows to a GB18030-encoded CSV file.

    The file is created (or overwritten) with a fixed eight-column header
    row followed by one line per entry of ``td1``.

    Args:
        fname: Path of the CSV file to write.
        td1: Iterable of rows, each row being an iterable of cell values.
    """
    header = ('基金代碼', '基金名稱', '單位淨值', '累計淨值', '日增加額', '日增加率', '申購', '贖回')
    # The context manager guarantees the file is flushed and closed, even
    # if writing a row fails. newline="" lets the csv module control line
    # endings itself.
    with open(fname, 'w', encoding='gb18030', newline="") as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(td1)
def main():
    """Download the fund quote page, extract its table and save it as CSV."""
    user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
    # Step 1: fetch the page with a browser-like User-Agent header.
    page = gethtml(
        'http://quote.stockstar.com/fund/mixed.shtml',
        headers={'User-Agent': user_agent},
    )
    # Step 2: turn the markup into rows of table cells.
    rows = getcontent(page)
    # Step 3: persist the rows to the output CSV file.
    save_data('E:/shuju/t.csv', rows)
# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()