爬取糗事百科段子

時間 2020-07-18
標籤 糗事百科 段子 简体版
原文 原文鏈接
# Import requests and BeautifulSoup.
import requests
from bs4 import BeautifulSoup


def download_page(url, proxies=None, timeout=10):
    """Download a page and return its body as text.

    Args:
        url: Address of the page to fetch.
        proxies: Optional requests-style proxy mapping, for example
            ``{"http": "http://host:port"}``. ``None`` (the default) sends
            the request directly.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the crawl forever.

    Returns:
        The response body decoded to ``str`` (``Response.text``).
    """
    # Send a browser-like User-Agent so the request looks like it comes
    # from a regular browser rather than a script.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'}
    # NOTE: for large crawls the original author went through a proxy: a
    # program running locally at http://localhost:5555/random returned a
    # random proxy IP address. To do the same, build the mapping from that
    # service's answer and pass it in as ``proxies``.
    response = requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
    return response.text


def get_content(html, page):
    """Extract every joke from a listing page and save it via save_text().

    For each ``div.article`` found under the element with id ``content``,
    the author name, joke text, like count, comment count, gender and age
    are pulled out, formatted with the output template and handed to
    ``save_text``.

    Args:
        html: HTML text of one listing page.
        page: Page number; only used in the formatted output.
    """
    # Final output format of one record.
    output = """第{}頁 做者：{} 性別：{} 年齡：{} 點贊：{} 評論：{}\n{}\n------------\n"""
    soup = BeautifulSoup(html, 'lxml')

    # All jokes of a page live under the element with id "content".
    container = soup.find(id='content')
    if container is None:
        # The page does not have the expected layout (error page, blocked
        # request, ...): there is nothing to extract.
        return

    for article in container.find_all('div', class_='article'):
        try:
            # Author nickname.
            author = article.find('h2').string
            # Joke text.
            text = article.find('div', class_='content').find('span').get_text()
            # Container holding the like and comment counters.
            stats = article.find('div', class_='stats')
            likes = stats.find('span', class_='stats-vote').find('i').string
            comments = stats.find('span', class_='stats-comments').find('a').find('i').string
        except AttributeError:
            # One of the find() calls returned None, i.e. this entry lacks
            # the expected markup. Skip it instead of aborting the page.
            continue

        # Gender and age are only present for non-anonymous authors.
        gender = ''
        age = ''
        author_info = article.find('div', class_='articleGender')
        if author_info is not None:
            # The gender is encoded as a CSS class on the tag.
            css_classes = author_info['class']
            if 'womenIcon' in css_classes:
                gender = '女'
            elif 'manIcon' in css_classes:
                gender = '男'
            age = author_info.string

        # Write the formatted record to the output file.
        save_text(output.format(page, author, gender, age, likes, comments, text))
# Default output file, used when save_text() is called without ``path``.
DEFAULT_OUTPUT_PATH = r"D:\python\qiushibaike.txt"


def save_text(*args, path=DEFAULT_OUTPUT_PATH):
    """Append every given string to the output file.

    Args:
        *args: Text chunks, written in the order given.
        path: Keyword-only destination file, opened in append mode with
            UTF-8 encoding. Defaults to ``DEFAULT_OUTPUT_PATH``.
    """
    if not args:
        # Nothing to write: do not touch (or create) the file.
        return
    # Open the file once for the whole batch rather than once per chunk.
    with open(path, "a", encoding="utf-8") as f:
        for chunk in args:
            f.write(chunk)


def main(first_page=1, last_page=1):
    """Download and save the jokes of a range of listing pages.

    Args:
        first_page: Number of the first page to fetch.
        last_page: Number of the last page to fetch (inclusive). The
            defaults fetch page 1 only.
    """
    for page in range(first_page, last_page + 1):
        # Build the address of the listing page.
        url = "https://qiushibaike.com/text/page/{}".format(page)
        # Download the page ...
        html = download_page(url)
        # ... then extract and store the data we need.
        get_content(html, page)


if __name__ == "__main__":
    main()
相關標籤/搜索
每日一句
每一个你不满意的现在，都有一个你没有努力的曾经。