須要說明的是,我這裏把電影信息提取以後,緩存了電影封面和演職人員的圖片,並對圖片信息進行了獲取入庫。
先貼出我兩種表結構:
1.電影表:
其中data是存儲電影信息的json數據,以下: {"mActorRole": [{"name": "奧克塔維亞·斯賓瑟 ", "id": 1154263, "role": "暫無角色信息"},{"name": "約翰·浩克斯 ", "id": 1100849, "role": "暫無角色信息"},{"name": "凱蒂·洛茨 ", "id": 1232918, "role": "暫無角色信息"},{"name": "詹姆斯·拉夫爾提 ", "id": 1022640, "role": "暫無角色信息"},{"name": "小克利夫頓·克林斯 ", "id": 1019033, "role": "暫無角色信息"}], "mCoverId": 2499355591, "mDirector": [{"name": "Eshom Nelms ", "id": 1387484}], "mId": 26802500, "mLength": "92分鐘", "mName": "小城犯罪 Small Town Crime", "mShowDate": "2017-03-11(美國)", "mSynopsis": " 奧克塔維亞·斯賓瑟將攜約翰·浩克斯出演獨立驚悚片《小城犯罪》。電影講述由浩克斯扮演的酗酒前警察偶然發現一具女屍,並不慎將他的家庭至於危險之中,他不得不一邊尋找兇手,一邊與惡勢力做鬥爭。該片由內爾姆斯兄弟執導,目前正在拍攝中。 ", "mType": "驚悚"} 2.圖片表:
兩個表,結構一致名字不一樣,id爲電影表內演職人員的id,data爲圖片屬性的json字符串,以下:
{"format": "jpg", "height": 383, "size": 20016, "width": 270}
腳本由如下方法調用執行:
須要注意的是,下圖中的local_url是「豆瓣最新電影」這個頁面緩存到本地後的html文件的路徑,也能夠不傳。
# coding=utf-8
"""Douban movie crawler.

Scrapes movie detail pages, stores each movie as a JSON blob in the movie
table, downloads cover / celebrity-avatar images to local folders, then scans
those folders and registers every image's metadata in the image tables.
"""
import os
import sys
import re
import json
import time

import cv2
import pymysql
import requests
from bs4 import BeautifulSoup


def get_all_url(local_url, extra_url_list):
    """Collect the list of single-movie page URLs to crawl.

    :param local_url: path to a locally saved copy of the Douban "new movies"
        page (the list is rendered dynamically, so the page must be saved
        *after* the JS ran).  May be None.
    :param extra_url_list: manually supplied URLs (new movies that do not fill
        a whole page yet); scraped URLs are appended to this list in place.
    :return: the combined URL list, or None / process exit on failure.
    """
    if local_url is not None:
        try:
            # Read-only open; mind the encoding of the saved page.
            with open(local_url, 'r', encoding='UTF-8') as html_file:
                html_page = html_file.read()
        except IOError:
            print("--------------------本地html讀取失敗! ------------------------")
            return
        soup = BeautifulSoup(html_page, 'html.parser')
        new_list = soup.findAll('a', {'class': 'item'})
        print('本次預計進行更新的電影數據個數爲: %d' % (len(new_list) + len(extra_url_list)))
        for movie in new_list:
            extra_url_list.append(movie.get('href'))
    if not extra_url_list:
        print("沒有收到任何url , 請檢查參數 !")
        sys.exit(0)
    command = int(input('本次即將進行更新的電影數據個數爲 : %d 個 , 按<1>確認更新,按<0>取消更新.....\n'
                        % len(extra_url_list)))
    # BUGFIX: the original `while i < 2` loop exited before evaluating the
    # answer given on the last retry.  Every attempt is now checked.
    for remaining in (2, 1, 0):
        if command == 1:
            return extra_url_list
        if command == 0:
            print("本次更新已取消 ! ")
            sys.exit(0)
        if remaining == 0:
            break
        command = int(input("您的輸入有誤 , 還有%d次機會 ! 請再次輸入..." % remaining))
    print("檢查一下你的輸入法 ! 再見 ! \n")
    sys.exit(0)


def frisk_image(folder, table):
    """Walk *folder* and insert one row per image file into *table*.

    The row id is the numeric part of the file name ("p<digits>.<ext>" —
    i.e. the celebrity / cover id), and `data` is a JSON blob of the form
    {"format", "height", "size", "width"}.  Existing ids are skipped.
    """
    if not os.path.exists(folder):
        print("-------------------------- %s 不存在 , 請檢查 !-------------------------------" % folder)
        return
    start_time = time.time()
    m_image = pymysql.Connect(
        host="數據庫主機", port=3306, user="用戶名", passwd="密碼",
        db="圖片表名", charset='utf8')  # NOTE charset must be set explicitly
    image_cursor = m_image.cursor()
    count = 0
    try:
        for parent, dir_names, file_names in os.walk(folder):
            for file_name in file_names:
                full_name = os.path.join(parent, file_name)  # full path of one image
                img = cv2.imread(full_name)
                # BUGFIX: cv2.imread does not raise on unreadable files, it
                # returns None -- the original try/except could never trigger
                # and `img.shape` blew up outside of it instead.
                if img is None:
                    print("-------------------------------讀取<%s>時發生錯誤 !------------------------------ "
                          % full_name)
                    continue
                ext = re.findall(r"[^.]+$", file_name)[0]  # extension = text after the last dot
                image_data = {"format": ext,
                              "height": int(img.shape[0]),
                              "size": int(os.path.getsize(full_name)),
                              "width": int(img.shape[1])}
                # strip extension, dots and the leading "p" to get the numeric id
                img_id = int(file_name.replace(ext, "").replace(".", "").replace("p", ""))
                json_data = json.dumps(image_data, sort_keys=True,
                                       ensure_ascii=False).replace("'", '"')
                # Skip files whose id already exists (values parameterized;
                # only the table name is interpolated -- it comes from us).
                image_cursor.execute("select count(*) from %s where id = %%s" % table, (img_id,))
                if image_cursor.fetchall()[0][0] != 0:
                    print("----------------------圖片庫中已存在id爲<%d>的數據 ! 已跳過---------------------"
                          % img_id)
                    continue
                sql = "INSERT INTO %s (id,data, create_time) VALUES (%%s, %%s, %%s)" % table
                try:
                    image_cursor.execute(sql, (img_id, json_data, round(time.time() * 1000)))
                    m_image.commit()
                except Exception as err:
                    print(err)
                    print("-------------------<%s>在寫入數據庫過程當中出現問題 , 已回滾 !---------------------"
                          % full_name)
                    m_image.rollback()  # roll back the failed insert and move on
                    continue
                print("恭喜! %s 已成功入庫啦!" % file_name)
                count += 1
    finally:
        m_image.close()  # always release the connection, even on a crash
    print("恭喜您入庫完成 ! \n本次入庫%d條數據,用時%d秒" % (count, time.time() - start_time))


def _parse_celebrity(tag):
    """Extract one celebrity from a <li class="celebrity"> tag.

    :return: ({"name", "id"}, avatar_url) or None when the name anchor is
        missing.  The avatar url is the last url() in the style attribute --
        some people have several images and the last one is the head shot.
    """
    a = tag.find("a", {"class": "name"})
    if a is None:
        return None
    person = {"name": a.getText(),
              "id": int(re.findall(r"/celebrity/(.+?)/", a['href'])[0])}
    styles = re.findall(r"url\((.+?)\)", tag.find("div", {"class": "avatar"})["style"])
    return person, styles[-1]


def _save_celebrity_avatar(img_url, avatar_folder, person_id, label, url):
    """Download one avatar to <avatar_folder>/<person_id>.<ext> (best effort).

    Placeholder images (their path contains "default") are skipped.
    *label* is only used in log messages ("導演" / "演員").
    """
    if img_url.find("default") != -1:
        return
    if not os.path.exists(avatar_folder):
        os.makedirs(avatar_folder)
    try:
        pic = requests.get(img_url, timeout=15)
        # numeric celebrity id + the extension after the last dot of the url
        file_name = "%s/%s" % (avatar_folder,
                               str(person_id) + "." + re.findall(r"[^.]+$", img_url)[0])
        with open(file_name, 'wb') as fp:
            fp.write(pic.content)
        print("%s圖片 : %s 成功保存到本地 ! " % (label, file_name))
    except Exception as err:
        print(err)
        print(" %s>>>>>>>>>>>>>>>>>在處理%s圖片時出錯!" % (url, label))


def get_movie_info(info_table, cover_table, avatar_table, cover_folder,
                   avatar_folder, extra_url_list, local_url=None):
    """Crawl every movie URL and persist the data.

    For each page: parse id/name/synopsis/cast/genre/date/runtime/cover,
    store the JSON blob in *info_table* (update when the mId already exists,
    insert with max(id)+1 otherwise), download cover and avatar images, then
    scan the image folders into *cover_table* / *avatar_table*.
    """
    start_time = time.time()
    url_list = get_all_url(local_url, extra_url_list)
    mask_tmp = pymysql.Connect(
        host="數據庫主機", port=3306, user="用戶名", passwd="密碼",
        db="電影表名", charset='utf8')  # NOTE charset must be set explicitly
    movie_cursor = mask_tmp.cursor()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36'
    }
    count = 0
    for url in url_list:
        print(">>>>>>>>>>>>>>>>>>>>>>第" + str(count + 1) + "個開始<<<<<<<<<<<<<<<<<<<<<<<<<")
        data = requests.get(url, headers=headers).content
        row_id = 0
        soup = BeautifulSoup(data, 'html.parser')
        movie_data = {}
        try:
            m_id = re.findall(r"ect/(.+?)/", url)[0]
            if not m_id.isdigit():  # mId must be numeric (it is the Douban subject id)
                continue
            movie_data['mId'] = int(m_id)
        except Exception as err:
            print(err)
            # BUGFIX: the original `print( str(url))+">..."` added None + str.
            print(str(url) + ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>解析id出錯 !")
            continue
        # Duplicate check: the mId is embedded inside the stored JSON blob.
        has_movie = movie_cursor.execute(
            "select id from " + str(info_table) + " where data like %s",
            ('%mId":' + str(m_id) + ',%',))
        if has_movie != 0:
            row_id = int(movie_cursor.fetchall()[0][0])
            print("發現重複電影>>>>>>>>>>>>>>>>>>mId =<%d> row_id=<%d> , ! \n" % (int(m_id), row_id))
        try:
            # Required fields: when a tag is missing the movie is skipped
            # (optional fields fall back to a default value instead).
            name_tag = soup.find('span', {'property': 'v:itemreviewed'})
            if name_tag is None:
                continue
            movie_data['mName'] = name_tag.getText()

            summary = soup.find('span', {'property': 'v:summary'})
            if summary is not None:
                # strip layout control characters from the synopsis
                movie_data['mSynopsis'] = summary.getText().replace("\n", "").replace("\t", "") \
                    .replace("\v", "").replace("\r", "")
            else:
                movie_data['mSynopsis'] = "暫無電影簡介"

            director__actor = soup.findAll('li', {'class': 'celebrity'})  # cast and crew together
            if len(director__actor) == 0:
                continue

            directors = []
            for entry in director__actor:
                if entry.find("span", {"title": "導演"}) is None:
                    continue
                parsed = _parse_celebrity(entry)
                if parsed is None:
                    director = {}  # keep an empty record, as before
                else:
                    director, img_url = parsed
                    _save_celebrity_avatar(img_url, avatar_folder, director["id"], "導演", url)
                directors.append(director)
            # the json.dumps replaces below turn this into a JSON array
            movie_data["mDirector"] = [','.join(str(i) for i in directors)]

            actors = []
            for entry in director__actor:
                has_role = True
                # BUGFIX: the original passed the *set* {"class", "role"} as
                # the attrs argument; the intent is attribute class="role".
                role_span = entry.find("span", {"class": "role"})
                if role_span is None:
                    has_role = False  # actor without role information
                elif role_span.getText().find("飾") == -1:
                    continue  # span exists but is not an acting credit
                parsed = _parse_celebrity(entry)
                if parsed is None:
                    continue
                act, img_url = parsed
                act["role"] = role_span.getText().replace("飾 ", "") if has_role else "暫無角色信息"
                _save_celebrity_avatar(img_url, avatar_folder, act["id"], "演員", url)
                actors.append(act)
            movie_data["mActorRole"] = [','.join(str(i) for i in actors)]

            genres = soup.findAll('span', {'property': 'v:genre'})
            if len(genres) == 0:
                continue
            # result format: 愛情 / 動作 / ...
            movie_data["mType"] = ' / '.join(g.getText() for g in genres)

            dates = soup.findAll('span', {'property': 'v:initialReleaseDate'})
            if len(dates) == 0:
                continue
            movie_data["mShowDate"] = ' / '.join(d.getText() for d in dates)

            runtime = soup.find('span', {'property': 'v:runtime'})
            movie_data["mLength"] = runtime.getText() if runtime is not None else "暫無時長"

            img_url = soup.find("img", {"title": "點擊看更多海報"})["src"]
            if img_url.find("default") == -1:  # placeholder covers contain "default"
                img_name = re.findall(r"lic/.*", img_url)[0].replace("lic/", "")
                if not os.path.exists(cover_folder):
                    os.makedirs(cover_folder)
                movie_data["mCoverId"] = 0  # stays 0 when download/parse fails
                try:
                    cover = requests.get(img_url, timeout=15)
                    file_name = "%s/%s" % (cover_folder, img_name)
                    with open(file_name, 'wb') as fp:
                        fp.write(cover.content)
                    print("封面圖片 : %s 成功保存到本地 ! " % file_name)
                    # BUGFIX: the original pattern r"ic/p(.+?)\." could never
                    # match img_name (the "lic/" prefix was already stripped),
                    # which raised IndexError and silently skipped the movie.
                    movie_data["mCoverId"] = int(re.findall(r"p(.+?)\.", img_name)[0])
                except Exception as err:
                    print(err)
                    print("url爲----> %s 的電影在存儲封面時出錯!" % url)
            else:
                movie_data["mCoverId"] = 0  # movies without a cover point at id 0

            # the replace() calls turn the stringified director/actor lists
            # into the JSON array format expected by consumers of `data`
            json_data = json.dumps(movie_data, sort_keys=True, ensure_ascii=False) \
                .replace("'", '"').replace('["', '[').replace('"]', ']')
        except Exception as err:
            print(err)
            print("--------------------------------解析網頁失敗 %s失敗 ----------------------------------" % url)
            continue

        if row_id != 0:
            sql = "update %s set data = %%s where id = %%s " % info_table
            params = (json_data, row_id)
            count -= 1  # an update is not a new row; cancel the += 1 below
        else:
            movie_cursor.execute("select id from %s order by id desc limit 1 " % info_table)
            rows = movie_cursor.fetchall()
            # ROBUSTNESS: an empty table used to raise IndexError here
            last_id = rows[0][0] if rows else 0
            sql = "INSERT INTO %s (id,data, createTime) VALUES (%%s, %%s, %%s)" % info_table
            params = (last_id + 1, json_data, round(time.time() * 1000))
        try:
            movie_cursor.execute(sql, params)
            mask_tmp.commit()
        except Exception as err:
            print(err)
            print("---------------------------%s在寫入數據庫過程當中出現問題 , 已回滾 ! -------------------------------" % url)
            mask_tmp.rollback()  # roll back the failed statement and move on
            continue
        print("恭喜! 《%s》已成功入庫啦!" % url)
        count += 1
    mask_tmp.close()  # release the movie-table connection
    print("恭喜您入庫完成 ! \n本次新入庫%d條數據,用時%d秒" % (count, time.time() - start_time))
    print("------------------------華麗的分割線-----------------------------")
    for remaining in (5, 4, 3, 2, 1):
        print("即將掃描生成的圖片%s倒計時%d" % ("." * 93, remaining))
    print("開始掃描生成的圖片" + "!" * 98)
    frisk_image(cover_folder, cover_table)    # register downloaded covers
    frisk_image(avatar_folder, avatar_table)  # register downloaded avatars
經實測,效果顯著,百試不爽,幾乎完美適配全部豆瓣電影!圖書與音樂相似,在此再也不贅述!感謝閱讀!