# -*- coding:utf-8 -*-
"""
A simple Python spider that scrapes movie titles from Douban's Top 250 list.

Language: Python 3.6
"""

import re
import ssl
import time
import urllib.error
import urllib.request

# Skip certificate verification so the script also works on machines
# without an up-to-date CA bundle.
ssl._create_default_https_context = ssl._create_unverified_context


class DouBanSpider(object):
    """Scrape movie titles from Douban's Top 250 list.

    start_spider() limits the crawl to the first 3 pages (25 movies per page).

    Attributes:
        page: number of the page currently being scraped
        cur_url: URL template of the page currently being scraped
        datas: processed movie titles collected so far
        _top_num: running Top-N counter
    """

    def __init__(self):
        self.page = 1
        self.cur_url = "http://movie.douban.com/top250?start={page}&filter=&type="
        self.datas = []
        self._top_num = 1
        print("Douban movie spider is ready, starting to scrape...")

    def get_page(self, cur_page):
        """Fetch the page HTML for the given page number.

        Args:
            cur_page: number of the page to scrape

        Returns:
            The full HTML of the page as a Unicode string, or an empty
            string if the request failed (URLError is caught and reported).
        """
        url = self.cur_url
        my_page = ""
        time.sleep(3)  # be polite: pause between requests
        try:
            # Douban paginates with a "start" offset of 25 movies per page.
            page = (cur_page - 1) * 25
            url = url.format(page=page)
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                              'AppleWebKit/537.36 (KHTML, like Gecko) '
                              'Chrome/75.0.3770.100 Safari/537.36'
            }
            request = urllib.request.Request(url, headers=headers)
            my_page = urllib.request.urlopen(request).read().decode('utf-8')
            print("Requesting page {}, URL: {}".format(cur_page, url))
        except urllib.error.URLError as e:
            if hasattr(e, "code"):
                print("The server couldn't fulfill the request.")
                print("Error code: %s" % e.code)
            elif hasattr(e, "reason"):
                print("We failed to reach a server. Please check your url and read the Reason")
                print("Reason: %s" % e.reason)
        return my_page

    def find_title(self, my_page):
        """Extract movie titles from the page HTML with a regular expression.

        Args:
            my_page: HTML text of the page to match against
        """
        temp_data = []
        # Each movie has one <span class="title"> with the main title and,
        # usually, a second one holding "&nbsp;/&nbsp;<original title>".
        movie_items = re.findall(r'<span.*?class="title">(.*?)</span>', my_page, re.S)
        for item in movie_items:
            # Skip the alternate-title spans, which contain "&nbsp;".
            if item.find("&nbsp;") == -1:
                temp_data.append("Top" + str(self._top_num) + " " + item)
                self._top_num += 1
        self.datas.extend(temp_data)

    def start_spider(self):
        """Entry point of the spider; controls the range of pages scraped."""
        while self.page <= 3:
            my_page = self.get_page(self.page)
            self.find_title(my_page)
            self.page += 1


def main():
    print(
        """
        ######################################
            A simple Douban movie spider
            Author: Agoly
            Version: Python3.6
            Date: 2019-09-06
        ######################################
        """)
    my_spider = DouBanSpider()
    my_spider.start_spider()
    for item in my_spider.datas:
        print(item)
    print("Douban spider finished...")


if __name__ == '__main__':
    main()
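

# ----------------------------------------------------------------------
# Illustrative sketch (defined but never called): a minimal, self-contained
# check of the title regex used in DouBanSpider.find_title(). The HTML
# fragment below is hypothetical sample data shaped like Douban's list
# markup, not real scraped output; it only demonstrates why spans that
# contain "&nbsp;" (the "/ original title" variants) are filtered out.
def _demo_title_regex():
    sample = (
        '<span class="title">Movie A</span>'
        '<span class="title">&nbsp;/&nbsp;Alternate Title A</span>'
        '<span class="title">Movie B</span>'
    )
    titles = re.findall(r'<span.*?class="title">(.*?)</span>', sample, re.S)
    # Same filter as find_title(): keep only the primary title spans.
    kept = [t for t in titles if t.find("&nbsp;") == -1]
    print(kept)  # expected: ['Movie A', 'Movie B']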