基礎頁面:https://movie.douban.com/top250
代碼:python
from time import sleep
from requests import get
from bs4 import BeautifulSoup
import re
import pymysql

# Single shared connection; DictCursor makes fetched rows plain dicts.
db = pymysql.connect(
    host='localhost',
    user='root',
    password='123456',
    db='douban',
    charset='utf8mb4',
    cursorclass=pymysql.cursors.DictCursor,
)

try:
    with db.cursor() as cursor:
        # FIX 1: use underscores instead of hyphens in identifiers.
        # `page-code` is parsed by MySQL as the expression `page - code`
        # unless backtick-quoted in EVERY query, which is what broke the
        # ad-hoc SELECTs (ERROR 1054 "Unknown column 'page'").
        # Also use utf8mb4 to match the connection charset (plain utf8
        # cannot store 4-byte characters).
        sql = (
            "CREATE TABLE IF NOT EXISTS `top250` ("
            "`id` int(6) NOT NULL AUTO_INCREMENT,"
            "`top` int(6) NOT NULL,"
            "`page_code` int(6) NOT NULL,"
            "`title` varchar(255) NOT NULL,"
            "`origin_title` varchar(255),"
            "`score` float NOT NULL,"
            "`theme` varchar(255) NOT NULL,"
            "PRIMARY KEY(`id`)"
            ") ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 AUTO_INCREMENT=1;"
        )
        cursor.execute(sql)
finally:
    db.commit()

base_url = 'https://movie.douban.com/top250'
header = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Cookie': 'xxx',
    'Host': 'movie.douban.com',
    'Referer': 'https://movie.douban.com/chart',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'xxx',
}


def crawler(url=None, headers=None, delay=1):
    """Scrape one Top-250 list page, insert its movies, then follow the
    ``rel="next"`` link recursively until the last page.

    :param url: page to fetch.
    :param headers: request headers; the Referer is updated per page.
    :param delay: seconds to sleep before following the next page.
    """
    r = get(url=url, headers=headers, timeout=3)
    soup = BeautifulSoup(r.text, 'html.parser')

    page_tag = soup.find('span', attrs={'class': 'thispage'})
    # First page may lack the marker; default to '1'.
    page_code = page_tag.get_text() if page_tag else '1'

    # FIX 2: iterate per movie item instead of zip()-ing four parallel
    # find_all() lists.  Some movies have no quote (`span.inq`), so the
    # themes list was shorter than the others and zip() silently dropped /
    # misaligned entries — this is why rank 175 never made it into the DB.
    for item in soup.find_all('div', attrs={'class': 'item'}):
        rank = item.find('em').get_text()

        title_spans = item.find_all('span', attrs={'class': 'title'})
        title = title_spans[0].get_text()

        # Second title span (when present) looks like ' / Original Name'.
        origin_title = None
        if len(title_spans) > 1:
            matches = re.compile(r'./.(.+)').findall(title_spans[1].get_text())
            if matches:
                origin_title = matches[0]

        score = item.find('span', attrs={'class': 'rating_num'}).get_text()

        theme_tag = item.find('span', attrs={'class': 'inq'})
        # `theme` column is NOT NULL — store an empty string when missing.
        theme = theme_tag.get_text() if theme_tag else ''

        try:
            with db.cursor() as cursor:
                sql = (
                    "INSERT INTO `top250` "
                    "(`top`, `page_code`, `title`, `origin_title`, `score`, `theme`) "
                    "VALUES (%s, %s, %s, %s, %s, %s)"
                )
                # Parameterized query: PyMySQL escapes the values.
                cursor.execute(sql, (rank, page_code, title, origin_title, score, theme))
        finally:
            db.commit()

    next_page = soup.find('link', attrs={'rel': 'next'})
    if next_page is not None and next_page.get('href'):
        headers['Referer'] = url
        next_url = base_url + next_page['href']
        sleep(delay)
        crawler(url=next_url, headers=headers, delay=3)


crawler(base_url, header, 0)
db.close()
結果:mysql
mysql> select top,title,score from top250 where id = 175; +-----+--------+-------+ | top | title | score | +-----+--------+-------+ | 176 | 羅生門 | 8.7 | +-----+--------+-------+ 1 row in set (0.00 sec) mysql> select top,title,page-code,score from top250 where id = 175; ERROR 1054 (42S22): Unknown column 'page' in 'field list' mysql> select top,page-code,title,score from top250 where id = 175; ERROR 1054 (42S22): Unknown column 'page' in 'field list' mysql> select page-code from top250 where id = 175; ERROR 1054 (42S22): Unknown column 'page' in 'field list' mysql> describe top250 -> ; +--------------+--------------+------+-----+---------+----------------+ | Field | Type | Null | Key | Default | Extra | +--------------+--------------+------+-----+---------+----------------+ | id | int(6) | NO | PRI | NULL | auto_increment | | top | int(6) | NO | | NULL | | | page-code | int(6) | NO | | NULL | | | title | varchar(255) | NO | | NULL | | | origin-title | varchar(255) | YES | | NULL | | | score | float | NO | | NULL | | | theme | varchar(255) | NO | | NULL | | +--------------+--------------+------+-----+---------+----------------+ 7 rows in set (0.32 sec) mysql> select page-code from top250 where id = 175; ERROR 1054 (42S22): Unknown column 'page' in 'field list' mysql> select origin-title from top250 where id = 175; ERROR 1054 (42S22): Unknown column 'origin' in 'field list' mysql> select origin_title from top250 where id = 175; ERROR 1054 (42S22): Unknown column 'origin_title' in 'field list' mysql> select * from top250 where id = 175; +-----+-----+-----------+--------+--------------+-------+-------------------+ | id | top | page-code | title | origin-title | score | theme | +-----+-----+-----------+--------+--------------+-------+-------------------+ | 175 | 176 | 8 | 羅生門 | 羅生門 | 8.7 | 人生的N種可能性。 | +-----+-----+-----------+--------+--------------+-------+-------------------+ 1 row in set (0.00 sec) mysql> select * from top250 where title = 未麻的部屋; ERROR 1054 (42S22): 
Unknown column '未麻的部屋' in 'where clause' mysql> select * from top250 where top=175; Empty set (0.00 sec) mysql>
兩個小問題:
1.沒想到數據庫字段不能用'-'...,因而page-code字段與origin-title字段不能獨立進行查找。。。
2.不知道爲啥top175的電影《未麻的部屋》沒爬到。。。
建議使用scrapy。
用scrapy的一些好處是配置爬蟲很方便,還有其內部自帶的html解析器、對不完整的url的組建等十分便利。
最後,吐槽一下,以前的電腦配置太差,跑深度學習程序的過程耗盡內存,出現莫名的bug後,藍屏死機就再也無法啓動了。。。因此,暫時不能更新博客了。。。