Scraping Douban Movie Top 250 Data with Python
1. Douban URL: https://movie.douban.com/top250?start=25&filter=
2. The overall flow: scrape the Top 250 data from that URL (the start parameter pages through the list 25 movies at a time), save it to a local txt file, and then persist the data to a MySQL database. A quick single-page fetch test is sketched below.
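Before writing the full scraper, it is worth checking that a single page can be fetched at all; Douban tends to reject requests that do not carry a browser-like User-Agent header, which is why the full script below sets one. A minimal sketch (the User-Agent string is just an example):

# Minimal fetch test (sketch): request the first page of the Top 250 list
# with a browser-like User-Agent and confirm that HTML comes back.
from urllib import request

url = 'https://movie.douban.com/top250?start=0&filter='
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/65.0.3325.146 Safari/537.36'}
req = request.Request(url, headers=headers)
html = request.urlopen(req).read().decode('utf-8')
print(len(html))    # a non-trivial length means the page was returned
print(html[:200])   # peek at the beginning of the markup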
Environment setup:
1. Install a local MySQL server. For download and installation details, see: https://blog.csdn.net/chic_data/article/details/72286329
2. Once MySQL is installed, create the database and the table with its columns. The CREATE TABLE statement is shown below, followed by a pymysql sketch that does the same setup from Python.
For reference, the version I installed is MySQL 8.0.
CREATE TABLE doubanTop250(
    ID int PRIMARY KEY AUTO_INCREMENT,
    rankey int,
    name varchar(50),
    alias varchar(100),
    director varchar(50),
    showYear varchar(50),
    makeCountry varchar(50),
    movieType varchar(50),
    movieScore float,
    scoreNum int,
    shortFilm varchar(255)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
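If you prefer to do this setup from Python rather than the mysql client, the same statements can be executed through pymysql. This is only a sketch; it assumes a local server with user root, password root, and database name PythonTest, which are the connection settings the scraper below uses.

# Sketch: create the database and table programmatically (assumes root/root
# on localhost, matching the connection settings used by the scraper below).
import pymysql

conn = pymysql.connect(host='localhost', user='root', password='root', charset='utf8')
try:
    with conn.cursor() as cur:
        cur.execute("CREATE DATABASE IF NOT EXISTS PythonTest DEFAULT CHARACTER SET utf8")
        cur.execute("USE PythonTest")
        cur.execute("""
            CREATE TABLE IF NOT EXISTS doubanTop250(
                ID int PRIMARY KEY AUTO_INCREMENT,
                rankey int,
                name varchar(50),
                alias varchar(100),
                director varchar(50),
                showYear varchar(50),
                makeCountry varchar(50),
                movieType varchar(50),
                movieScore float,
                scoreNum int,
                shortFilm varchar(255)
            ) ENGINE=InnoDB DEFAULT CHARSET=utf8
        """)
finally:
    conn.close()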
Finally, let's get straight to the code:
from urllib import request
import re
import pymysql


class MovieTop(object):
    def __init__(self):
        self.start = 0
        self.param = '&filter'
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) "
                                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                                      "Chrome/65.0.3325.146 Safari/537.36"}
        self.movieList = []
        self.filePath = './DoubanTop250.txt'

    def get_page(self):
        # Fetch one page of the list (25 movies per page).
        try:
            url = 'https://movie.douban.com/top250?start=' + str(self.start) + '&filter='
            myRequest = request.Request(url, headers=self.headers)
            response = request.urlopen(myRequest)
            page = response.read().decode('utf-8')
            print('Fetching page ' + str((self.start + 25) // 25) + '...')
            self.start += 25
            return page
        except request.URLError as e:
            if hasattr(e, 'reason'):
                print('Fetch failed, reason:', e.reason)

    def get_page_info(self):
        # The page is served in Simplified Chinese, and in the raw (un-decoded) HTML
        # the year/country/genre fields are separated by the literal entity text
        # "&nbsp;", so the pattern keeps those Chinese literals and entities as-is.
        patern = re.compile(u'<div.*?class="item">.*?'
                            + u'<div.*?class="pic">.*?'
                            + u'<em.*?class="">(.*?)</em>.*?'
                            + u'<div.*?class="info">.*?'
                            + u'<span.*?class="title">(.*?)</span>.*?'
                            + u'<span.*?class="other">(.*?)</span>.*?'
                            + u'<div.*?class="bd">.*?'
                            + u'<p.*?class="">.*?'
                            + u'导演:\s(.*?)\s.*?<br>'
                            + u'(.*?)&nbsp;/&nbsp;'
                            + u'(.*?)&nbsp;/&nbsp;(.*?)</p>.*?'
                            + u'<div.*?class="star">.*?'
                            + u'<span.*?class="rating_num".*?property="v:average">'
                            + u'(.*?)</span>.*?'
                            + u'<span>(.*?)人评价</span>.*?'
                            + u'<span.*?class="inq">(.*?)</span>'
                            , re.S)

        while self.start <= 225:  # 10 pages x 25 movies
            page = self.get_page()
            if not page:
                break  # stop instead of crashing if a request failed
            movies = re.findall(patern, page)
            for movie in movies:
                self.movieList.append([movie[0],                                     # rank
                                       movie[1],                                     # title
                                       movie[2].replace('&nbsp;', ' ').strip(' /'),  # alias
                                       movie[3],                                     # director
                                       movie[4].lstrip(),                            # year
                                       movie[5],                                     # country/region
                                       movie[6].rstrip(),                            # genre
                                       movie[7],                                     # score
                                       movie[8],                                     # number of ratings
                                       movie[9]])                                    # short quote

    def write_page(self):
        print('Writing to file...')
        file = open(self.filePath, 'w', encoding='utf-8')
        try:
            for movie in self.movieList:
                file.write('Rank: ' + movie[0] + '\n')
                file.write('Title: ' + movie[1] + '\n')
                file.write('Alias: ' + movie[2] + '\n')
                file.write('Director: ' + movie[3] + '\n')
                file.write('Year: ' + movie[4] + '\n')
                file.write('Country/region: ' + movie[5] + '\n')
                file.write('Genre: ' + movie[6] + '\n')
                file.write('Score: ' + movie[7] + '\n')
                file.write('Number of ratings: ' + movie[8] + '\n')
                file.write('Short quote: ' + movie[9] + '\n')
                file.write('\n')
            print('File written successfully.')
        except Exception as e:
            print(e)
        finally:
            file.close()

    def upload(self):
        # pymysql 1.0+ requires keyword arguments for connect().
        db = pymysql.connect(host='localhost', user='root', password='root',
                             database='PythonTest', charset='utf8')
        cursor = db.cursor()

        # Parameterized query: quotes inside titles or quips cannot break the SQL.
        insertStr = "INSERT INTO doubanTop250(rankey, name, alias, director," \
                    " showYear, makeCountry, movieType, movieScore, scoreNum, shortFilm)" \
                    " VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"

        try:
            for movie in self.movieList:
                cursor.execute(insertStr, (int(movie[0]), movie[1], movie[2], movie[3],
                                           movie[4], movie[5], movie[6], float(movie[7]),
                                           int(movie[8]), movie[9]))
            db.commit()
            print('Uploaded to the database.')
        except Exception as e:
            print(e)
            db.rollback()
        finally:
            db.close()


if __name__ == '__main__':
    mt = MovieTop()
    mt.get_page_info()
    mt.write_page()
    mt.upload()
Execution result:
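A quick way to confirm what ended up in MySQL is to query the table directly. A minimal sketch, assuming the same connection settings as upload():

# Sketch: check the upload by counting rows and printing the first few entries.
import pymysql

db = pymysql.connect(host='localhost', user='root', password='root',
                     database='PythonTest', charset='utf8')
try:
    with db.cursor() as cur:
        cur.execute("SELECT COUNT(*) FROM doubanTop250")
        print('rows:', cur.fetchone()[0])   # expect up to 250, depending on how many entries the pattern matched
        cur.execute("SELECT rankey, name, movieScore FROM doubanTop250 ORDER BY rankey LIMIT 5")
        for row in cur.fetchall():
            print(row)
finally:
    db.close()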
Original article: https://www.cnblogs.com/AlvinZH/p/8576841.html#_label0