實戰項目 — 爬取中國票房網年度電影信息並保存在csv

 

 

 

 

import pandas as pd
import requests
from bs4 import BeautifulSoup
import time


def spider(url, headers):
    """Scrape one yearly box-office page and persist the movies found on it.

    Parameters
    ----------
    url : str
        Yearly listing page, e.g. http://www.cbooo.cn/year?year=2019
    headers : dict
        HTTP headers (Cookie / Host / User-Agent) required by the site.

    Side effects: issues one HTTP GET per movie detail page (for the
    director) and appends the resulting rows to the CSV via download().
    """
    print("正在抓取url:  " + url)
    html = requests.get(url=url, headers=headers, timeout=10).text
    soup = BeautifulSoup(html, 'lxml')
    # find_all returns a list; the page has a single table with id=tbContent.
    movies_table = soup.find_all('table', {'id': 'tbContent'})[0]
    # Skip the first <tr>: it is the header row, not a movie.
    rows = movies_table.findAll('tr')[1:]

    names, hrefs, types, box_offices, average_fares, show_times = [], [], [], [], [], []
    for tr in rows:
        # Parse the <td> cells once per row instead of once per column.
        tds = tr.find_all('td')
        names.append(tds[0].a.get('title'))
        # Detail-page URL; needed below because the director is not listed
        # on the yearly overview page.
        hrefs.append(tds[0].a.get('href'))
        types.append(tds[1].string)
        box_offices.append(int(tds[2].string))
        average_fares.append(tds[3].string)
        show_times.append(tds[6].string)

    daoyans = []
    for href in hrefs:
        try:
            # .content (bytes) is what BeautifulSoup expects here, not the
            # Response object itself. timeout prevents a hung request from
            # stalling the whole crawl.
            detail = requests.get(href, timeout=10)
            detail_soup = BeautifulSoup(detail.content, 'lxml')
            # The director field contains embedded newlines; strip them.
            daoyan = detail_soup.select('dl.dltext dd')[0].get_text().replace("\n", "")
            daoyans.append(daoyan)
            time.sleep(0.5)  # be polite: throttle detail-page requests
        except (requests.RequestException, AttributeError, IndexError):
            # Narrow except: network failures or an unexpected page layout.
            # Record a placeholder so the columns stay the same length.
            daoyans.append("獲取失敗")

    # Assemble the columns into a DataFrame and append it to the CSV.
    df = pd.DataFrame({
        'name': names,
        'href': hrefs,
        'type': types,
        'box_office': box_offices,
        'Average_fare': average_fares,
        'show_time': show_times,
        'directors': daoyans
    })
    download(df)


def download(df, path='D://box_office.csv'):
    """Append *df* to a CSV file without index or header.

    mode='a' is used so successive spider() calls (one per year) accumulate
    in a single file instead of each run recreating it.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to persist.
    path : str, optional
        Destination CSV file. Defaults to the original hard-coded
        Windows path for backward compatibility.
    """
    df.to_csv(path, mode='a', index=False, header=False)
    print("done")


if __name__ == "__main__":
    start_time = time.time()
    # Cookie/Host/User-Agent the site expects; without them it may refuse
    # or throttle requests.
    headers = {
        'Cookie': 'Hm_lvt_daabace29afa1e8193c0e3000d391562=1570691612; Hm_lpvt_daabace29afa1e8193c0e3000d391562=1570691612',
        'Host': 'www.cbooo.cn',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
    }
    base_url = "http://www.cbooo.cn/year?year="
    # Crawl the yearly pages 2008..2019 inclusive.
    for year in range(2008, 2020):
        spider(base_url + str(year), headers)
        time.sleep(2)  # throttle between yearly pages
    # Moved inside the guard: at module level this raised NameError on
    # import because start_time is only defined when run as a script.
    print(round((time.time() - start_time), 3))
相關文章
相關標籤/搜索