"""Scrape the Douban Top 250 movie list and save it to an Excel workbook."""
import re
import urllib.error
import urllib.request

import openpyxl
from bs4 import BeautifulSoup

# Value stored when a movie has no alternative title or no one-line review,
# so every row keeps the same number of columns.
_PLACEHOLDER = '暫無'

# Patterns are compiled once at import time instead of on every crawl.
# Movie title(s): the first match is the main title, the second (if any)
# is the alternative title.
_RE_MOVIE_NAME = re.compile(r'<span class="title">(.*?)</span>')
# Link to the movie's detail page.
_RE_MOVIE_LINK = re.compile(r'<a class="" href="(.*?)">')
# Poster image URL. ``[^>]*?`` keeps the match inside a single <img> tag;
# the previous greedy ``.*`` (with re.S) could run on to a later ``src=``.
_RE_MOVIE_IMG = re.compile(r'<img[^>]*?src="(.*?)"', re.S)
# Introduction paragraph (director, cast, year, genre ...).
_RE_MOVIE_INTRODUCTION = re.compile(r'<p class="">(.*?)</p>', re.S)
# Average rating.
_RE_MOVIE_SCORE = re.compile(
    r'<span class="rating_num" property="v:average">(.*?)</span>')
# Number of ratings. Both the traditional and the simplified spelling of the
# suffix are accepted so the pattern also matches simplified-Chinese markup.
_RE_MOVIE_JUDGE = re.compile(r'<span>(.*?)人(?:評價|评价)</span>')
# One-line review, without its trailing full stop(s).
_RE_MOVIE_INQ = re.compile(r'<span class="inq">(.*?)。*</span>')
# A single <br>, <br/> or <br /> tag.
_RE_BR_TAG = re.compile(r'<br\s*/?>')


def ask_url(url):
    """Fetch *url* and return its body decoded as UTF-8.

    The request carries a browser-like User-Agent header and times out
    after 3 seconds.

    Returns:
        The page source as ``str``, or ``None`` when the request fails.
        Failures are reported on stdout (HTTP status code or error reason)
        instead of being raised.
    """
    # Pretend to be a browser. Adjacent string literals are used instead of a
    # backslash continuation inside the literal, which would embed the next
    # line's indentation in the header value.
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/80.0.3987.163 Safari/537.36'
    }
    req = urllib.request.Request(url, headers=head)
    try:
        # The context manager closes the connection even if read() fails.
        with urllib.request.urlopen(req, timeout=3) as response:
            return response.read().decode('utf-8')
    except urllib.error.HTTPError as e:
        # HTTPError is a subclass of URLError, so it must be handled first.
        print(e.code)
    except urllib.error.URLError as e:
        print(e.reason)
    except OSError as e:
        # For example a socket timeout raised while reading the body.
        print(e)
    return None


def _first_match(pattern, text, default=''):
    """Return group 1 of the first match of *pattern* in *text*, else *default*."""
    match = pattern.search(text)
    return match.group(1) if match else default


def _parse_item(item_html):
    """Extract one movie record from the HTML of a single list entry.

    Returns:
        A list ``[title, other title, detail link, poster URL, introduction,
        score, number of ratings, one-line review]``, or ``None`` when the
        fragment contains no title. A missing link, poster, introduction,
        score or rating count is stored as an empty string; a missing
        alternative title or one-line review is stored as the placeholder.
    """
    # The page contains non-breaking spaces; turn them into plain spaces.
    item_html = item_html.replace(u'\xa0', ' ')

    titles = _RE_MOVIE_NAME.findall(item_html)
    if not titles:
        return None
    if len(titles) > 1:
        # The alternative title is rendered as " / Name": drop the slash and
        # the whitespace left around it.
        other_name = titles[1].replace('/', '').strip()
    else:
        # Keep the column even when there is no alternative title.
        other_name = _PLACEHOLDER

    # Remove the <br> tags embedded in the introduction before storing it.
    introduction = _first_match(_RE_MOVIE_INTRODUCTION, item_html)
    introduction = _RE_BR_TAG.sub(' ', introduction).strip()

    return [
        titles[0],
        other_name,
        _first_match(_RE_MOVIE_LINK, item_html),
        _first_match(_RE_MOVIE_IMG, item_html),
        introduction,
        _first_match(_RE_MOVIE_SCORE, item_html),
        _first_match(_RE_MOVIE_JUDGE, item_html),
        # Some movies have no one-line review.
        _first_match(_RE_MOVIE_INQ, item_html, _PLACEHOLDER),
    ]


def crawl_web(base_url):
    """Crawl the ten result pages under *base_url* and collect movie records.

    The page offset (0, 25, ..., 225) is appended to *base_url*. Pages that
    cannot be downloaded are skipped rather than aborting the whole crawl.

    Returns:
        A list with one 8-element list per movie, in page order; see
        ``_parse_item`` for the column layout.
    """
    data_list = []
    for page in range(10):
        html = ask_url(base_url + str(page * 25))
        if not html:
            # ask_url has already reported the failure.
            continue
        soup = BeautifulSoup(html, 'html.parser')
        for item in soup.find_all(class_='item'):
            data = _parse_item(str(item))
            if data is not None:
                data_list.append(data)
    return data_list


def save_data(save_path, data_list):
    """Write *data_list* to ``<save_path>豆瓣TOP250.xlsx``.

    *save_path* is used as a plain string prefix of the file name, so a
    directory must be passed with its trailing path separator. The first row
    of the sheet is the header; each entry of *data_list* becomes one row.
    """
    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = '豆瓣TOP250'
    # Header row.
    ws.append(("電影名", "其餘名", "影片詳情鏈接", "影片海報圖片",
               "電影簡介", "評分", "評分人數", "一句話評價"))
    # Data rows, one per movie.
    for record in data_list:
        ws.append(list(record))
    wb.save(save_path+'豆瓣TOP250.xlsx')
    return None


if __name__ == "__main__":
    base_url = 'https://movie.douban.com/top250?start='
    save_path = 'E:\\School\\Study\\Python\\爬蟲\\'
    data_list = crawl_web(base_url)
    save_data(save_path, data_list)
    print('Crawl over')