from lxml import etree
import time
import json
import urllib.request

# Collected {title, text} dicts from every scraped page; serialized to
# duanzi.txt when main() finishes.
item_list = []


def handler_request(url, page):
    """Build an HTTP request for page number *page* of the base *url*.

    Args:
        url: Base listing URL ending in the page-path prefix.
        page: Page number (int) appended to *url*.

    Returns:
        A urllib.request.Request carrying a browser-style User-Agent header.
    """
    headers = {
        # Pretend to be desktop Chrome so the site serves normal HTML.
        # FIX: the original string contained "Apple\ WebKit" — residue of a
        # backslash line-continuation that spliced stray whitespace into the
        # UA; restored the canonical "AppleWebKit" token.
        "user-agent": (
            "Mozilla/5.0 (Windows NT 10.0; WOW64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/75.0.3770.100 Safari/537.36"
        )
    }
    get_url = url + str(page)
    return urllib.request.Request(url=get_url, headers=headers)


def parse_content(content):
    """Extract article titles and bodies from one page into item_list.

    Args:
        content: Decoded HTML text of one listing page.

    Side effects:
        Appends one {'標題': ..., '內容': ...} dict per article to the
        module-level item_list.
    """
    tree = etree.HTML(content)
    article_list = tree.xpath('//main[@class="col-md-8 main-content"]/article')
    for article in article_list:
        # FIX: guard the [0] lookup — one malformed article used to raise
        # IndexError and (via the old bare except) silently discard the
        # entire page; now only the bad article is skipped.
        titles = article.xpath('.//div[@class="post-head"]/h1/a/text()')
        if not titles:
            continue
        # Join the body paragraphs into a single newline-separated string.
        text = '\n'.join(
            article.xpath('.//div[@class="post-content"]/p/text()')
        )
        item_list.append({
            '標題': titles[0],
            '內容': text,
        })


def main():
    """Scrape a user-selected page range and save results to duanzi.txt."""
    start_page = int(input("請輸入查詢起始頁面:"))
    end_page = int(input("查詢結束頁面:"))
    url = "http://duanziwang.com/page/"
    for page in range(start_page, end_page + 1):
        request = handler_request(url, page)
        try:
            # FIX: close the response promptly instead of leaking the socket.
            with urllib.request.urlopen(request) as response:
                content = response.read().decode()
            parse_content(content)
        # FIX: was a bare `except:` (also swallowed KeyboardInterrupt/
        # SystemExit); keep the deliberate best-effort per-page skip but
        # only for ordinary errors.
        except Exception:
            print("第%d頁面爬取失敗" % page)
    string = json.dumps(item_list, ensure_ascii=False)
    with open('duanzi.txt', "w", encoding='utf-8') as f:
        f.write(string)


if __name__ == '__main__':
    main()