Python3爬取豆瓣電影保存到MySQL數據庫

時間 2019-11-21

標籤 python3 python 豆瓣保存 mysql 數據庫欄目 Python 简体版

原文鏈接

48行代碼實現Python3爬取豆瓣電影排行榜

代碼基於python3，用到的類庫有:

requests:通過僞造請求頭或設置代理等方式獲取頁面內容，參考文檔
BeautifulSoup:對頁面進行解析，提取數據，參考文檔
PyMySQL:python3版本中用於操作MySQL數據庫，python2中則使用mysqldb，Github

pip安裝用到的幾個類庫:

pip install requests
pip install bs4
pip install pymysql

分析豆瓣電影頁面

頁面分析:
爬取數據之前，我們都需要對頁面進行分析，看我們可以從中提取到哪些數據，從下圖我們看到豆瓣電影top250的頁面結構，我們可以從中提取出排行榜(rank)、電影名字(name)、電影詳情頁連接(link)、電影海報(poster)、電影評分(score)、電影評論(quote)等，我在圖中進行了標註

URL分析:
通過點擊分頁我們可以發現URL的格式爲:https://movie.douban.com/top250?start=num&filter=
其中num表示25的倍數的數字，最小是0也就是第一頁，最大爲225也就是最後一頁，這可以作爲我們爬取頁面的限制條件，filter爲過濾條件這裏可不用管

代碼

引入類庫:

import pymysql
import requests
from bs4 import BeautifulSoup

定義爬取連接，%d用作數字佔位:

# URL template of a Top 250 page; %d is replaced with the offset of the first movie.
baseUrl = "https://movie.douban.com/top250?start=%d&filter="

定義爬取數據方法:

def get_movies(start):
    """Scrape one page of the Douban Top 250 list.

    Args:
        start: Offset of the first movie on the page, substituted into
            ``baseUrl`` (a multiple of 25; 0 is the first page).

    Returns:
        A list of dicts, one per movie, with the keys ``rank``, ``link``,
        ``poster``, ``name``, ``score`` and ``quote``. ``quote`` is ``""``
        when the movie has no one-line review.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        ValueError: If the page does not contain the movie list.
    """
    url = baseUrl % start   # build the URL of the requested page
    # requests applies no timeout by default, so bound the wait explicitly.
    # NOTE(review): a browser-like User-Agent is sent because the site may
    # reject the default python-requests agent — confirm against the live site.
    headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36"}
    html = requests.get(url, headers=headers, timeout=10)
    html.raise_for_status()     # fail loudly instead of parsing an error page
    soup = BeautifulSoup(html.content, "html.parser")   # parse the page content
    grid = soup.find("ol", "grid_view")     # the <ol> that holds every movie entry
    if grid is None:
        raise ValueError("movie list not found in page: " + url)
    lists = []              # movies found on this page
    for i in grid.find_all("li"):
        pic_link = i.find("div", "pic").find("a")   # anchor wrapping the poster image
        inq = i.find("span", "inq")                 # one-line review; missing for some movies
        lists.append({
            "rank": i.find("em").text,                      # position in the ranking
            "link": pic_link.get("href"),                   # detail page URL
            "poster": pic_link.find("img").get("src"),      # poster image URL
            "name": i.find("span", "title").text,           # movie title
            "score": i.find("span", "rating_num").text,     # rating
            "quote": inq.text if inq else "",               # empty when there is no review
        })
    return lists

連接數據庫並創建數據表:

# Connect to the database; the charset is set explicitly, otherwise errors may occur
db = pymysql.connect(host="localhost",user="root",password="root",db="test",charset="utf8mb4")
cursor = db.cursor()    # create a cursor object
cursor.execute("DROP TABLE IF EXISTS movies")   # drop the table if it already exists
# SQL statement that creates the table. Column names are backtick-quoted
# because RANK is a reserved word in MySQL 8.0.2 and later; unquoted it makes
# the CREATE TABLE statement fail with a syntax error.
createTab = """CREATE TABLE movies(
    `id` INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
    `name` VARCHAR(20) NOT NULL,
    `rank` VARCHAR(4) NOT NULL,
    `link` VARCHAR(50) NOT NULL,
    `poster` VARCHAR(100) NOT NULL,
    `score` VARCHAR(4) NOT NULL,
    `quote` VARCHAR(50)
)"""
cursor.execute(createTab)     # run the CREATE TABLE statement
# ...... (scrape the pages and insert the rows here)
db.close()            # close the database connection

將提取到的數據存儲到數據表中:

lists = get_movies(start)   # movies scraped from the current page
# INSERT statement; the %s placeholders are filled in by the driver
sql = "INSERT INTO `movies`(`name`,`rank`,`link`,`poster`,`score`,`quote`) VALUES(%s,%s,%s,%s,%s,%s)"
for i in lists:
    try:
        cursor.execute(sql, (i["name"], i["rank"], i["link"], i["poster"], i["score"], i["quote"]))
        db.commit()
    except pymysql.MySQLError as err:
        # Only database errors are treated as a failed row; report them
        # instead of discarding them silently.
        db.rollback()
        print(i["name"] + " failed: " + str(err))
    else:
        # Each movie is a dict, so it is looked up by key (i[0] raises KeyError).
        print(i["name"]+" is success")
start += 25     # advance to the next page

完整代碼:

import pymysql
import requests
from bs4 import BeautifulSoup
# URL template of a Top 250 page; %d is replaced with the offset of the first movie.
baseUrl = "https://movie.douban.com/top250?start=%d&filter="
def get_movies(start):
    """Scrape one page of the Douban Top 250 list.

    Args:
        start: Offset of the first movie on the page, substituted into
            ``baseUrl`` (a multiple of 25; 0 is the first page).

    Returns:
        A list of dicts, one per movie, with the keys ``rank``, ``link``,
        ``poster``, ``name``, ``score`` and ``quote``. ``quote`` is ``""``
        when the movie has no one-line review.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        ValueError: If the page does not contain the movie list.
    """
    url = baseUrl % start   # build the URL of the requested page
    # requests applies no timeout by default, so bound the wait explicitly.
    # NOTE(review): a browser-like User-Agent is sent because the site may
    # reject the default python-requests agent — confirm against the live site.
    headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36"}
    html = requests.get(url, headers=headers, timeout=10)
    html.raise_for_status()     # fail loudly instead of parsing an error page
    soup = BeautifulSoup(html.content, "html.parser")   # parse the page content
    grid = soup.find("ol", "grid_view")     # the <ol> that holds every movie entry
    if grid is None:
        raise ValueError("movie list not found in page: " + url)
    lists = []              # movies found on this page
    for i in grid.find_all("li"):
        pic_link = i.find("div", "pic").find("a")   # anchor wrapping the poster image
        inq = i.find("span", "inq")                 # one-line review; missing for some movies
        lists.append({
            "rank": i.find("em").text,                      # position in the ranking
            "link": pic_link.get("href"),                   # detail page URL
            "poster": pic_link.find("img").get("src"),      # poster image URL
            "name": i.find("span", "title").text,           # movie title
            "score": i.find("span", "rating_num").text,     # rating
            "quote": inq.text if inq else "",               # empty when there is no review
        })
    return lists

if __name__ == "__main__":
    # Script entry point: recreate the `movies` table, then scrape the ten
    # Top 250 pages and insert every movie as one row.
    db = pymysql.connect(host="localhost",user="root",password="root",db="test",charset="utf8mb4")
    try:
        cursor = db.cursor()
        cursor.execute("DROP TABLE IF EXISTS movies")
        # Column names are backtick-quoted because RANK is a reserved word in
        # MySQL 8.0.2 and later; unquoted it makes CREATE TABLE fail.
        createTab = """CREATE TABLE movies(
            `id` INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
            `name` VARCHAR(20) NOT NULL,
            `rank` VARCHAR(4) NOT NULL,
            `link` VARCHAR(50) NOT NULL,
            `poster` VARCHAR(100) NOT NULL,
            `score` VARCHAR(4) NOT NULL,
            `quote` VARCHAR(50)
        )"""
        cursor.execute(createTab)
        # The statement is the same for every row, so build it once.
        sql = "INSERT INTO `movies`(`name`,`rank`,`link`,`poster`,`score`,`quote`) VALUES(%s,%s,%s,%s,%s,%s)"
        # Page offsets 0, 25, ..., 225 cover the whole list.
        for start in range(0, 250, 25):
            lists = get_movies(start)
            for i in lists:
                try:
                    cursor.execute(sql, (i["name"], i["rank"], i["link"], i["poster"], i["score"], i["quote"]))
                    db.commit()
                except pymysql.MySQLError as err:
                    # Skip the failing row but report why, instead of
                    # swallowing every exception silently.
                    db.rollback()
                    print(i["name"] + " failed: " + str(err))
                else:
                    print(i["name"]+" is success")
    finally:
        # Close the connection even when scraping or table creation raises.
        db.close()