"""Scrape text jokes from qiushibaike.com and append them to a local file."""

import requests
from bs4 import BeautifulSoup

# Sent with every request so the scraper presents itself as a desktop browser.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
}

# Listing URL; the placeholder is the 1-based page number.
PAGE_URL = "https://qiushibaike.com/text/page/{}"

# Default file that scraped entries are appended to.
OUTPUT_PATH = r"D:\python\qiushibaike.txt"

# One record per joke: page, author, gender, age, votes, comments, joke text.
ENTRY_TEMPLATE = "第{}頁 做者:{} 性別:{} 年齡:{} 點贊:{} 評論:{}\n{}\n------------\n"


def download_page(url, proxies=None, timeout=10):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address of the page to download.
        proxies: Optional ``requests``-style proxy mapping, e.g.
            ``{"http": "http://host:port"}``. Useful when crawling many
            pages through a rotating proxy pool. Note the scheme key has
            no trailing colon.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the scraper forever.

    Returns:
        The decoded HTML of the page.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            4xx/5xx status code.
    """
    response = requests.get(url, headers=HEADERS, proxies=proxies, timeout=timeout)
    # Fail loudly instead of parsing an error page as if it were content.
    response.raise_for_status()
    return response.text


def _find_path(tag, *path):
    """Apply ``find`` once per ``(name, css_class)`` step.

    Returns the final tag, or None as soon as any step finds nothing.
    """
    for name, css_class in path:
        if tag is None:
            return None
        tag = tag.find(name, class_=css_class) if css_class else tag.find(name)
    return tag


def _string_of(tag):
    """Return ``tag.string`` as text, or '' when the tag or its string is missing."""
    if tag is None or tag.string is None:
        return ''
    return str(tag.string)


def _gender_and_age(author_info):
    """Return ``(gender, age)`` from the ``articleGender`` div.

    *author_info* is None for anonymous authors, in which case both values
    are empty strings.
    """
    if author_info is None:
        return '', ''
    class_list = author_info.get('class', [])
    # The gender is encoded as an icon class on the div.
    if 'womenIcon' in class_list:
        gender = '女'
    elif 'manIcon' in class_list:
        gender = '男'
    else:
        gender = ''
    return gender, _string_of(author_info)


def _parse_article(article):
    """Extract ``(author, gender, age, votes, comments, text)`` from one joke div.

    Any field whose markup is missing is returned as an empty string.
    """
    author = _string_of(article.find('h2'))

    body = _find_path(article, ('div', 'content'), ('span', None))
    text = body.get_text() if body is not None else ''

    # Vote and comment counters both live under the "stats" div.
    stats = article.find('div', class_='stats')
    votes = _string_of(_find_path(stats, ('span', 'stats-vote'), ('i', None)))
    comments = _string_of(
        _find_path(stats, ('span', 'stats-comments'), ('a', None), ('i', None))
    )

    gender, age = _gender_and_age(article.find('div', class_='articleGender'))
    return author, gender, age, votes, comments, text


def get_content(html, page):
    """Parse one listing page and append every joke on it to the output file.

    Args:
        html: HTML text of the listing page.
        page: Page number, written at the start of each saved record.

    Returns:
        None. If the page has no ``id="content"`` container (for example
        because the site layout changed), nothing is written.
    """
    soup = BeautifulSoup(html, 'lxml')
    container = soup.find(id='content')
    if container is None:
        return

    entries = [
        ENTRY_TEMPLATE.format(page, *_parse_article(article))
        for article in container.find_all('div', class_='article')
    ]
    # Hand the whole page over at once so the file is opened a single time.
    save_text(*entries)


def save_text(*args, path=OUTPUT_PATH):
    """Append each string in *args* to the output file, in order.

    Args:
        *args: Text chunks to write.
        path: Destination file; defaults to ``OUTPUT_PATH``.

    Does nothing (and does not create the file) when called with no text.
    """
    if not args:
        return
    with open(path, "a", encoding="utf-8") as f:
        f.writelines(args)


def main(start_page=1, end_page=1):
    """Download listing pages ``start_page``..``end_page`` (inclusive) and save them.

    With the defaults only the first page is fetched.
    """
    for page in range(start_page, end_page + 1):
        html = download_page(PAGE_URL.format(page))
        get_content(html, page)


if __name__ == "__main__":
    main()