# Scrape the joke site (duanziwang.com) using XPath.

from lxml import etree
import time
import json
import urllib.request
item_list = []  # Accumulates scraped items ({'標題': ..., '內容': ...} dicts) across all pages.


# 構造request對象
# Build the request object for one listing page.
def handler_request(url, page):
    """Return a urllib Request for listing page *page*.

    Args:
        url: Base URL ending with "/page/".
        page: Page number; appended to the base URL.

    Returns:
        urllib.request.Request carrying a browser User-Agent header.
    """
    # NOTE: the original used a backslash continuation *inside* the string
    # literal, which injected ~30 spaces into the UA ("Apple        WebKit").
    # Implicit string concatenation keeps it one clean token.
    headers = {
        "user-agent": (
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
        ),
    }
    get_url = url + str(page)
    return urllib.request.Request(url=get_url, headers=headers)


# 解析獲取的html文件
# Parse one fetched HTML page and collect its articles.
def parse_content(content):
    """Extract title/body pairs from one listing page into item_list.

    Args:
        content: Decoded HTML text of a listing page.

    Side effects:
        Appends {'標題': title, '內容': text} dicts to the module-level
        item_list.
    """
    tree = etree.HTML(content)
    article_list = tree.xpath('//main[@class="col-md-8 main-content"]/article')
    for article in article_list:
        # Guard against articles without a title link: the original
        # unconditional [0] raised IndexError and aborted the whole page.
        titles = article.xpath('.//div[@class="post-head"]/h1/a/text()')
        if not titles:
            continue
        # Join the body paragraphs into a single newline-separated string.
        text = '\n'.join(article.xpath('.//div[@class="post-content"]/p/text()'))
        item_list.append({
            '標題': titles[0],
            '內容': text,
        })


def main():
    """Prompt for a page range, scrape each page, and dump results to duanzi.txt."""
    start_page = int(input("請輸入查詢起始頁面:"))
    end_page = int(input("查詢結束頁面:"))
    url = "http://duanziwang.com/page/"
    for page in range(start_page, end_page + 1):
        request = handler_request(url, page)
        try:
            content = urllib.request.urlopen(request).read().decode()
            parse_content(content)
        # Narrowed from a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit and made the loop unstoppable.
        except Exception:
            print("第%d頁面爬取失敗" % page)
    # ensure_ascii=False keeps the Chinese text human-readable in the file.
    string = json.dumps(item_list, ensure_ascii=False)
    with open('duanzi.txt', "w", encoding='utf-8') as f:
        f.write(string)


# Run the scraper only when executed as a script, not on import.
if __name__ == '__main__':
    main()
# (Page-footer residue from the original article, kept as comments so the
# file stays importable: 相關文章 / 相關標籤/搜索)