import time

import pandas as pd
import requests
from bs4 import BeautifulSoup


def spider(url, headers):
    """Scrape one yearly box-office table page and append its rows to the CSV.

    Args:
        url: Page URL of the form http://www.cbooo.cn/year?year=YYYY.
        headers: HTTP header dict (Cookie/Host/User-Agent) sent with every
            request — the site rejects unadorned clients.
    """
    print("正在抓取url: " + url)
    datas = requests.get(url=url, headers=headers).text
    soup = BeautifulSoup(datas, 'lxml')
    # The page holds a single <table id="tbContent">; find_all returns a
    # list, so take the first (and only) match.
    moives_tables = soup.find_all('table', {'id': 'tbContent'})[0]
    # One <tr> per movie; row 0 is the header row, skipped via [1:] below.
    moives = moives_tables.findAll('tr')
    # Column layout per row: 0 = title link (title attr + detail href),
    # 1 = type, 2 = box office, 3 = average ticket price, 6 = release date.
    # The director is only available on the per-movie detail page (href).
    names = [tr.find_all('td')[0].a.get('title') for tr in moives[1:]]
    hrefs = [tr.find_all('td')[0].a.get('href') for tr in moives[1:]]
    types = [tr.find_all('td')[1].string for tr in moives[1:]]
    box_offices = [int(tr.find_all('td')[2].string) for tr in moives[1:]]
    Average_fare = [tr.find_all('td')[3].string for tr in moives[1:]]
    show_time = [tr.find_all('td')[6].string for tr in moives[1:]]

    daoyans = []
    for href in hrefs:
        try:
            # BUG FIX: pass the same headers on detail-page requests too;
            # the original omitted them, so these requests could be
            # rejected while the main-page request succeeded.
            daoyan_datas = requests.get(href, headers=headers)
            # BeautifulSoup needs the response body (.content), not the
            # Response object itself.
            soup = BeautifulSoup(daoyan_datas.content, 'lxml')
            # Director is the first <dd> under <dl class="dltext">; the
            # text contains embedded newlines, which we strip out.
            daoyan = soup.select('dl.dltext dd')[0].get_text().replace("\n", "")
            daoyans.append(daoyan)
            time.sleep(0.5)  # throttle: be polite to the server
        except (requests.RequestException, IndexError, AttributeError):
            # BUG FIX: was a bare `except:` that silently swallowed every
            # exception (even KeyboardInterrupt).  Catch only the expected
            # request/parse failures and record a placeholder so the column
            # stays aligned with the other per-movie lists.
            daoyans.append("獲取失敗")

    # One row per movie; DataFrame keeps the parallel lists aligned.
    df = pd.DataFrame({
        'name': names,
        'href': hrefs,
        'type': types,
        'box_office': box_offices,
        'Average_fare': Average_fare,
        'show_time': show_time,
        'directors': daoyans
    })
    download(df)


def download(df):
    """Append *df* to the output CSV.

    mode='a' lets successive yearly runs accumulate in one file instead of
    overwriting it; header=False avoids repeating the header on each append
    (NOTE(review): this also means the file never gets a header row at all).
    """
    df.to_csv('D://box_office.csv', mode='a', index=False, header=False)
    print("done")


if __name__ == "__main__":
    start_time = time.time()
    headers = {
        'Cookie': 'Hm_lvt_daabace29afa1e8193c0e3000d391562=1570691612; '
                  'Hm_lpvt_daabace29afa1e8193c0e3000d391562=1570691612',
        'Host': 'www.cbooo.cn',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/77.0.3865.90 Safari/537.36'
    }
    base_url = "http://www.cbooo.cn/year?year="
    # One request per year, 2008-2019 inclusive, with a pause between pages.
    for i in range(2008, 2020):
        url = base_url + str(i)
        spider(url, headers)
        time.sleep(2)
    print(round((time.time() - start_time), 3))