# Python 爬取豆瓣電影Top250排行榜,爬蟲初試

from bs4 import BeautifulSoup
import openpyxl
import re
import urllib.request
import urllib.error

# 訪問url
def ask_url(url):
    """Fetch *url* and return its body decoded as UTF-8.

    The request is sent with a desktop-browser ``User-Agent`` header and a
    3-second timeout.

    Args:
        url: Address to request.

    Returns:
        The decoded response text, or ``None`` when the request fails. On
        failure the HTTP status code or the failure reason is printed.
    """
    # Adjacent literals are concatenated at compile time. A backslash
    # continuation inside a single literal would embed the next line's
    # indentation in the header value.
    head = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/80.0.3987.163 Safari/537.36'
        )
    }
    req = urllib.request.Request(url, headers=head)
    try:
        # The context manager closes the connection even if read/decode fails.
        with urllib.request.urlopen(req, timeout=3) as response:
            return response.read().decode('utf-8')
    except urllib.error.HTTPError as e:
        # HTTPError is a subclass of URLError, so it must be handled first.
        print(e.code)
    except urllib.error.URLError as e:
        print(e.reason)
    except OSError as e:
        # A timeout while reading the body is raised after urlopen() has
        # returned, so it is not wrapped in URLError.
        print(e)
    return None


# 爬取網頁
# Placeholder stored when a field is absent, so every row keeps the same
# eight-column layout.
_FIELD_MISSING = '暫無'

# Movie titles (first match is the main title, second the alternate title).
_RE_MOVIE_NAME = re.compile(r'<span class="title">(.*?)</span>')
# Link to the movie's detail page.
_RE_MOVIE_LINK = re.compile(r'<a class="" href="(.*?)">')
# Poster image URL; kept inside a single <img> tag so it cannot run on to a
# later src attribute.
_RE_MOVIE_IMG = re.compile(r'<img[^>]*?src="(.*?)"', re.S)
# Introduction paragraph (may span several lines).
_RE_MOVIE_INTRODUCTION = re.compile(r'<p class="">(.*?)</p>', re.S)
# Average rating.
_RE_MOVIE_SCORE = re.compile(
    r'<span class="rating_num" property="v:average">(.*?)</span>')
# Number of raters. The live page is presumably Simplified Chinese (人评价),
# so both the Simplified and Traditional spellings are accepted.
_RE_MOVIE_JUDGE = re.compile(r'<span>(.*?)人(?:评价|評價)</span>')
# One-line quote, without its trailing full stop(s).
_RE_MOVIE_INQ = re.compile(r'<span class="inq">(.*?)。*</span>')
# A single <br>, <br/> or <br /> tag.
_RE_BR_TAG = re.compile(r'<br\s*/?>')


def _first_match(pattern, text, default=_FIELD_MISSING):
    """Return group 1 of the first match of *pattern* in *text*, else *default*."""
    match = pattern.search(text)
    return match.group(1) if match else default


def _parse_item(item_html):
    """Build one eight-field row from the HTML of a single list entry.

    Returns ``None`` when the entry contains no title at all.
    """
    names = _RE_MOVIE_NAME.findall(item_html)
    if not names:
        return None
    if len(names) > 1:
        # The alternate title is written as " / Name"; drop the separator
        # and the whitespace around it.
        other_name = names[1].replace('/', '').strip()
    else:
        # No alternate title: keep the column so later storage stays aligned.
        other_name = _FIELD_MISSING

    # The introduction contains <br> tags that must not be stored.
    introduction = _first_match(_RE_MOVIE_INTRODUCTION, item_html)
    introduction = _RE_BR_TAG.sub(' ', introduction).strip()

    return [
        names[0],
        other_name,
        _first_match(_RE_MOVIE_LINK, item_html),
        _first_match(_RE_MOVIE_IMG, item_html),
        introduction,
        _first_match(_RE_MOVIE_SCORE, item_html),
        _first_match(_RE_MOVIE_JUDGE, item_html),
        _first_match(_RE_MOVIE_INQ, item_html),
    ]


def crawl_web(base_url, pages=10, page_size=25):
    """Crawl the paginated list and return one row per movie.

    Args:
        base_url: URL prefix to which the start offset is appended
            (e.g. ``'https://movie.douban.com/top250?start='``).
        pages: Number of pages to request.
        page_size: Number of entries per page; page ``i`` is requested
            with offset ``i * page_size``.

    Returns:
        A list of rows. Each row is a list of eight strings in this order:
        title, alternate title, detail link, poster URL, introduction,
        rating, number of raters, one-line quote. Fields missing from the
        page are filled with ``'暫無'``.
    """
    data_list = []
    for page in range(pages):
        html = ask_url(base_url + str(page * page_size))
        if not html:
            # The request failed (ask_url reports the reason); skip this
            # page instead of handing None to the parser.
            continue
        soup = BeautifulSoup(html, 'html.parser')
        for item in soup.find_all(class_='item'):
            # The page contains non-breaking spaces; normalise them to
            # plain spaces before matching.
            item_html = str(item).replace('\xa0', ' ')
            row = _parse_item(item_html)
            if row is not None:
                data_list.append(row)
    return data_list


def save_data(save_path, data_list):
    """Write the crawled rows to an Excel workbook.

    A header row is written first, followed by one worksheet row per entry
    of *data_list*. The workbook is saved as ``save_path + '豆瓣TOP250.xlsx'``,
    so *save_path* is used as a plain string prefix.

    Args:
        save_path: Prefix (typically a directory ending in a path
            separator) prepended to the output file name.
        data_list: Iterable of rows; each row is an iterable of cell values.
    """
    headers = ("電影名", "其餘名", "影片詳情鏈接", "影片海報圖片",
               "電影簡介", "評分", "評分人數", "一句話評價")

    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.title = '豆瓣TOP250'

    # Header cells occupy row 1.
    for col_idx, heading in enumerate(headers, start=1):
        sheet.cell(1, col_idx, heading)

    # Data rows start directly below the header, at row 2.
    for row_idx, record in enumerate(data_list, start=2):
        for col_idx, value in enumerate(record, start=1):
            sheet.cell(row_idx, col_idx, value)

    workbook.save(save_path + '豆瓣TOP250.xlsx')


def main():
    """Crawl the Top250 list, save it as a workbook and report completion."""
    movies = crawl_web('https://movie.douban.com/top250?start=')
    save_data('E:\\School\\Study\\Python\\爬蟲\\', movies)
    print('Crawl over')


if __name__ == "__main__":
    main()
# 相關文章
# 相關標籤/搜索
# 本站公眾號
# 歡迎關注本站公眾號,獲取更多信息