""" 抓取貓眼電影TOP100 """ import re import time import requests from bs4 import BeautifulSoup class SpiderMaoyan(object): def __init__(self): # 經過分析URL能夠發現, 貓眼電影TOP100頁面是經過 offset + 10 來分頁的 self.url = "http://maoyan.com/board/4?offset={0}" # 設置一下UA, 不然有可能提示你訪問被禁止了 self.headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) " "Chrome/66.0.3359.139 Safari/537.36" } # 定義須要提取的內容字段 self.fields = ("id", "name", "movieUrl", "imgUrl", "star", "releaseTime", "score") def handler(self, offset=0): while offset < 100: response = requests.get(self.url.format(offset), headers=self.headers) if response.status_code == 200: print("INFO -> Current URL: <%s>" % response.url) # 編碼處理一下, 否則有可能中文顯示亂碼 r_html = response.text.encode(response.encoding).decode("utf-8") # 構建一個 BeautifulSoup 對象, 用於後續的標籤、內容提取 soup = BeautifulSoup(r_html, "html5lib") # 繼續分析網頁源代碼, 發現每部影片都存在 <dd></dd> 標籤中 tags = soup.select("dd") # 提取內容 for tag in tags: # id、name、movieUrl obj = tag.find("p", class_="name").select_one("a") _id = re.search(r"movieId:(\d+)", obj.get("data-val")).group(1) _name = obj.string _movieUrl = "http://maoyan.com" + obj.get("href") # img # Tips: 能夠將圖片地址後面的分辨率去掉, 保存高清大圖地址 .split("@")[0] _imgUrl = tag.find("img", class_="board-img").get("data-src") # star # Tips: 中文標點 _star = tag.find("p", class_="star").string.strip().split(":")[-1] # releaseTime # Tips: 中文標點 _releaseTime = tag.find("p", class_="releasetime").string.split(":")[-1] # score _score = tag.find("p", class_="score").get_text() # 接下來就能夠將數據寫入存儲了 # Tips: 這種 SQL 生成方式有必要驗證 key/val 是否成對出現 print( "INSERT INTO TABLE_NAME (%s) VALUE %s;" % ( ", ".join(self.fields), tuple([_id, _name, _movieUrl, _imgUrl, _star, _releaseTime, _score]) ) ) # 偏移量自增 offset += 10 # 有必要停頓一下 time.sleep(.9) else: print(response.reason) exit(999) if __name__ == "__main__": spider = SpiderMaoyan() spider.handler()