基礎頁面:https://movie.douban.com/top250
代碼:python
from time import sleep
from requests import get
from bs4 import BeautifulSoup
import re
import pymysql

# Single shared connection; DictCursor makes fetched rows plain dicts.
db = pymysql.connect(
    host='localhost',
    user='root',
    password='123456',
    db='douban',
    charset='utf8mb4',
    cursorclass=pymysql.cursors.DictCursor,
)

try:
    with db.cursor() as cursor:
        # FIX 1: use underscores instead of hyphens in identifiers.
        # `page-code` is parsed by MySQL as the expression `page - code`
        # unless backtick-quoted in EVERY query, which is what broke the
        # ad-hoc SELECTs (ERROR 1054 "Unknown column 'page'").
        # Also use utf8mb4 to match the connection charset (plain utf8
        # cannot store 4-byte characters).
        sql = (
            "CREATE TABLE IF NOT EXISTS `top250` ("
            "`id` int(6) NOT NULL AUTO_INCREMENT,"
            "`top` int(6) NOT NULL,"
            "`page_code` int(6) NOT NULL,"
            "`title` varchar(255) NOT NULL,"
            "`origin_title` varchar(255),"
            "`score` float NOT NULL,"
            "`theme` varchar(255) NOT NULL,"
            "PRIMARY KEY(`id`)"
            ") ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 AUTO_INCREMENT=1;"
        )
        cursor.execute(sql)
finally:
    db.commit()

base_url = 'https://movie.douban.com/top250'
header = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Cookie': 'xxx',
    'Host': 'movie.douban.com',
    'Referer': 'https://movie.douban.com/chart',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'xxx',
}


def crawler(url=None, headers=None, delay=1):
    """Scrape one Top-250 list page, insert its movies, then follow the
    ``rel="next"`` link recursively until the last page.

    :param url: page to fetch.
    :param headers: request headers; the Referer is updated per page.
    :param delay: seconds to sleep before following the next page.
    """
    r = get(url=url, headers=headers, timeout=3)
    soup = BeautifulSoup(r.text, 'html.parser')

    page_tag = soup.find('span', attrs={'class': 'thispage'})
    # First page may lack the marker; default to '1'.
    page_code = page_tag.get_text() if page_tag else '1'

    # FIX 2: iterate per movie item instead of zip()-ing four parallel
    # find_all() lists.  Some movies have no quote (`span.inq`), so the
    # themes list was shorter than the others and zip() silently dropped /
    # misaligned entries — this is why rank 175 never made it into the DB.
    for item in soup.find_all('div', attrs={'class': 'item'}):
        rank = item.find('em').get_text()

        title_spans = item.find_all('span', attrs={'class': 'title'})
        title = title_spans[0].get_text()

        # Second title span (when present) looks like ' / Original Name'.
        origin_title = None
        if len(title_spans) > 1:
            matches = re.compile(r'./.(.+)').findall(title_spans[1].get_text())
            if matches:
                origin_title = matches[0]

        score = item.find('span', attrs={'class': 'rating_num'}).get_text()

        theme_tag = item.find('span', attrs={'class': 'inq'})
        # `theme` column is NOT NULL — store an empty string when missing.
        theme = theme_tag.get_text() if theme_tag else ''

        try:
            with db.cursor() as cursor:
                sql = (
                    "INSERT INTO `top250` "
                    "(`top`, `page_code`, `title`, `origin_title`, `score`, `theme`) "
                    "VALUES (%s, %s, %s, %s, %s, %s)"
                )
                # Parameterized query: PyMySQL escapes the values.
                cursor.execute(sql, (rank, page_code, title, origin_title, score, theme))
        finally:
            db.commit()

    next_page = soup.find('link', attrs={'rel': 'next'})
    if next_page is not None and next_page.get('href'):
        headers['Referer'] = url
        next_url = base_url + next_page['href']
        sleep(delay)
        crawler(url=next_url, headers=headers, delay=3)


crawler(base_url, header, 0)
db.close()
結果:mysql
mysql> select top,title,score from top250 where id = 175; +-----+--------+-------+ | top | title | score | +-----+--------+-------+ | 176 | 羅生門 | 8.7 | +-----+--------+-------+ 1 row in set (0.00 sec) mysql> select top,title,page-code,score from top250 where id = 175; ERROR 1054 (42S22): Unknown column 'page' in 'field list' mysql> select top,page-code,title,score from top250 where id = 175; ERROR 1054 (42S22): Unknown column 'page' in 'field list' mysql> select page-code from top250 where id = 175; ERROR 1054 (42S22): Unknown column 'page' in 'field list' mysql> describe top250 -> ; +--------------+--------------+------+-----+---------+----------------+ | Field | Type | Null | Key | Default | Extra | +--------------+--------------+------+-----+---------+----------------+ | id | int(6) | NO | PRI | NULL | auto_increment | | top | int(6) | NO | | NULL | | | page-code | int(6) | NO | | NULL | | | title | varchar(255) | NO | | NULL | | | origin-title | varchar(255) | YES | | NULL | | | score | float | NO | | NULL | | | theme | varchar(255) | NO | | NULL | | +--------------+--------------+------+-----+---------+----------------+ 7 rows in set (0.32 sec) mysql> select page-code from top250 where id = 175; ERROR 1054 (42S22): Unknown column 'page' in 'field list' mysql> select origin-title from top250 where id = 175; ERROR 1054 (42S22): Unknown column 'origin' in 'field list' mysql> select origin_title from top250 where id = 175; ERROR 1054 (42S22): Unknown column 'origin_title' in 'field list' mysql> select * from top250 where id = 175; +-----+-----+-----------+--------+--------------+-------+-------------------+ | id | top | page-code | title | origin-title | score | theme | +-----+-----+-----------+--------+--------------+-------+-------------------+ | 175 | 176 | 8 | 羅生門 | 羅生門 | 8.7 | 人生的N種可能性。 | +-----+-----+-----------+--------+--------------+-------+-------------------+ 1 row in set (0.00 sec) mysql> select * from top250 where title = 未麻的部屋; ERROR 1054 (42S22): 
Unknown column '未麻的部屋' in 'where clause' mysql> select * from top250 where top=175; Empty set (0.00 sec) mysql>
兩個小問題:
1.沒想到數據庫字段不能用'-'...,因而page-code字段與origin-title字段不能獨立進行查找。。。
2.不知道爲啥top175的電影《未麻的部屋》沒爬到。。。
建議使用scrapy。
用scrapy的一些好處是配置爬蟲很方便,還有其內部自帶的html解析器、對不完整的url的組建等十分便利。
最後,吐槽一下,以前的電腦配置太差,跑深度學習程序的過程耗盡內存,出現莫名的bug後,藍屏死機就再也無法啓動了。。。因此,暫時不能更新博客了。。。