Crawler practice: a small example

The script below crawls the book index on shicimingju.com and saves every chapter of each book into a text file under ./books.
import os

import requests
from lxml import etree

# Create the output directory for the downloaded books if it does not exist yet.
dirName = './books'
if not os.path.exists(dirName):
    os.mkdir(dirName)

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
}

# Fetch the book index page and parse it with lxml.
url = 'http://www.shicimingju.com/book'
page_text = requests.get(url, headers=headers).text
# print(page_text)
tree = etree.HTML(page_text)

# Each <a> inside the bookmark list links to one book's table of contents.
a_list = tree.xpath('//div[@class="bookmark-list"]//a')
for a in a_list:
    # xpath() returns a list, e.g. ['三國演義'] and ['/book/sanguoyanyi.html'],
    # so take the first element of each.
    bookname = a.xpath('./text()')[0]
    book_path = 'http://www.shicimingju.com' + a.xpath('./@href')[0]
    # print(bookname, book_path)

    # Fetch the book's table of contents and collect its chapter links.
    book_page = requests.get(book_path, headers=headers).text
    tree = etree.HTML(book_page)
    book_a_list = tree.xpath('//div[@class="book-mulu"]//a')

    # Write every chapter of this book into one file named after the book.
    path = dirName + '/' + bookname
    with open(path, 'w', encoding='utf-8') as f:
        for a in book_a_list:
            title = a.xpath('./text()')[0]
            detail_path = 'http://www.shicimingju.com' + a.xpath('./@href')[0]
            detail_page = requests.get(detail_path, headers=headers).text
            # The chapter text is spread over several text nodes; join them into one string.
            content = etree.HTML(detail_page).xpath('//div[@class="chapter_content"]//text()')
            content = ''.join(content)
            f.write(title + ':' + content + '\n')
            print(title, 'downloaded successfully')
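
A possible refinement, shown as a minimal sketch rather than part of the original script: a small fetch helper that adds a request timeout, raises on HTTP errors, and pauses between requests. The one-second delay and the explicit UTF-8 decoding are assumptions about the site, not something the original code does.

import time

import requests

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
}

def fetch(url, delay=1.0, timeout=10):
    """Fetch one page politely: fail loudly on HTTP errors and pause before returning."""
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    response.raise_for_status()   # surface 4xx/5xx responses instead of parsing an error page
    response.encoding = 'utf-8'   # assumption: the site serves UTF-8, so .text decodes correctly
    time.sleep(delay)             # assumption: a short delay keeps the crawl polite
    return response.text

With this helper in place, each requests.get(..., headers=headers).text call in the script above could be replaced by fetch(...), leaving the XPath parsing unchanged.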