python爬蟲學習04-爬取貼吧

時間 2020-07-17

原文鏈接

百度貼吧網頁爬取

如下是代碼：

from urllib.request import urlopen
from urllib.request import Request
from urllib.parse import urlencode
from fake_useragent import UserAgent

def get_html(url):
    """Download ``url`` and return the raw response body.

    A random User-Agent string obtained from ``fake_useragent`` is sent
    with every request.

    Args:
        url: Absolute URL to fetch.

    Returns:
        The response body as undecoded ``bytes``. Callers that need text
        must call ``.decode()`` on the result themselves.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the request
            fails (not caught here).
    """
    # Pick a fresh random User-Agent for each request.
    headers = {
        "User-Agent": UserAgent().random,
    }
    request = Request(url, headers=headers)
    # Use the response as a context manager so the underlying connection is
    # closed as soon as the body has been read, even if read() raises.
    with urlopen(request) as response:
        return response.read()

def save_html(filename, html_bytes):
    """Write ``html_bytes`` to ``filename`` in binary mode.

    The file is opened with mode ``"wb"``, so an existing file at that path
    is truncated and replaced.

    Args:
        filename: Path of the file to write.
        html_bytes: Bytes to store, written unchanged.
    """
    with open(filename, mode="wb") as out_file:
        out_file.write(html_bytes)

def main():
    """Interactively download list pages of a Baidu Tieba forum.

    Reads a forum keyword and a page count from standard input, then for
    each page index builds the query string, fetches the page with
    ``get_html`` and stores the bytes with ``save_html`` under the relative
    file name ``第<page number>頁.html``.

    Raises:
        ValueError: If the page count entered is not a valid integer.
    """
    # Pass the prompt text straight to input(); wrapping it in print() would
    # hand input() the value None and show "None" as the prompt.
    content = input("請輸入你要下載的內容：")
    num = input("請輸入你要下載多少頁：")
    # The "&" keeps the fixed "ie" parameter separate from the encoded
    # parameters; without it "pn" is absorbed into the value of "ie" and the
    # page offset never reaches the server.
    base_url = "https://tieba.baidu.com/f?ie=utf-8&{}"
    for pn in range(int(num)):
        # The "pn" offset advances by 50 for each successive page.
        query = urlencode({
            "pn": pn * 50,
            "kw": content,
        })
        filename = "第{}頁.html".format(pn + 1)
        print("正在下載" + filename)
        html_bytes = get_html(base_url.format(query))
        save_html(filename, html_bytes)

# Run the downloader only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()

相關標籤/搜索