爬取百度貼吧數據（練習Python爬蟲）

時間 2020-10-01

原文鏈接

爬取百度貼吧數據（Python）

1.總代碼：

from urllib.request import Request, urlopen
from urllib.parse import quote
def get_html(html):
    """Fetch a URL and return the response body as text.

    Args:
        html: The URL to request. Despite its name, this is the address to
            fetch, not markup; the name is kept so existing callers work.

    Returns:
        The response body decoded to ``str``, using the charset declared in
        the response's Content-Type header, or UTF-8 when none is declared.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the request
            fails.
        UnicodeDecodeError: If the body is not valid in the chosen charset.
    """
    # Send a Firefox User-Agent string instead of urllib's default one.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0"
    }
    request = Request(html, headers=headers)
    # The context manager closes the response (and its underlying
    # connection) even when reading or decoding raises.
    with urlopen(request) as response:
        # Prefer the charset the server declares; fall back to UTF-8, which
        # is what a bare ``bytes.decode()`` would use.
        charset = response.headers.get_content_charset() or "utf-8"
        return response.read().decode(charset)


def save_html(html, filename):
    """Write the text ``html`` to ``filename`` encoded as UTF-8.

    The file is opened in write mode, so an existing file at that path is
    overwritten.
    """
    with open(filename, mode="w", encoding="utf-8") as out_file:
        out_file.write(html)

def main():
    """Interactively download listing pages of a Baidu Tieba forum.

    Prompts for a forum name and a page count, then fetches each page with
    ``get_html`` and stores it with ``save_html`` as ``第<N>頁.html``
    (N starting at 1), a relative path resolved against the current
    working directory.

    Raises:
        ValueError: If the page count entered is not an integer.
    """
    content = input("請輸入想要獲取哪一個貼吧:")
    num = int(input("請輸入想要獲取多少頁:"))
    # The forum name is the same for every page, so percent-encode it once.
    keyword = quote(content)
    for i in range(num):
        # NOTE(review): Tieba pages its thread listing through the ``pn``
        # offset (0, 50, 100, ...). The offset used to be sent as ``tpl``,
        # which is not the paging parameter, so every request would fetch
        # the same first listing. Confirm against the live site.
        url = 'https://tieba.baidu.com/f?fr=ala0&kw=' + keyword + '&pn={}'.format(i * 50)
        html = get_html(url)
        filename = '第{}頁.html'.format(i + 1)
        save_html(html, filename)

# Run the interactive downloader only when executed as a script, not when
# this module is imported.
if __name__ == "__main__":
    main()