#!/usr/bin/python3
# coding=utf8
"""Scraper for a video site that has no built-in search.

Crawls every listing page, extracts each video's title, play-page URL and
magnet/download link, and stores them in MySQL so the collection can be
searched by keyword later.

Author: xiaoxiaohui    Date: 2019-10-03

One-time MySQL setup:
    mysql -uroot -pxxh123
    create database 4hucom;
    use 4hucom;
    CREATE TABLE `4hu_shoujixiaoshipin` (
        `id` INT(11) not null auto_increment,
        `biaoti` VARCHAR(380),
        `fabutime` VARCHAR(380),
        `lianjie` VARCHAR(380),
        primary key(id)
    );

NOTE: function names such as get_house_info were carried over from an
earlier rental-housing scraper this script was adapted from; likewise the
dict key '播放地址' actually holds the play-page URL.
"""
import requests
from bs4 import BeautifulSoup
import pymysql
import time


def get_links(url):
    """Return absolute detail-page URLs for every video on one listing page.

    :param url: listing-page URL
    :return: list of absolute detail-page URLs
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    links_div = soup.find_all('li', class_="col-md-2 col-sm-3 col-xs-4")
    return ['http://www.網站名馬賽克.com' + div.a.get('href') for div in links_div]


def get_house_info(item_url):
    """Scrape one detail page.

    :param item_url: absolute URL of a video detail page
    :return: dict with keys '影片名字' (title), '播放地址' (play-page URL)
             and '下載連接' (magnet link resolved via get_cililianjie).
    """
    response = requests.get(item_url)
    response.encoding = 'utf-8'  # force correct decoding of the Chinese page
    soup = BeautifulSoup(response.text, 'html.parser')
    # The page contains two 'playul' lists: index 0 is the play link,
    # index 1 is the download link.
    links_div = soup.find_all('ul', class_="playul")
    download_page = 'http://www.網站名馬賽克.com' + links_div[1].li.a.get('href')
    lianjie = get_cililianjie(download_page)
    print(lianjie)
    links_div2 = soup.find_all('div', class_="detail-title fn-clear")
    biaoti = links_div2[0].text.strip()  # title; strip() drops surrounding whitespace
    fabutime = 'http://www.網站名馬賽克.com' + links_div[0].li.a.get('href')  # play-page URL
    # BUGFIX: the original also stored {'id': id}, which captured the Python
    # builtin id() function — a meaningless value never used by insert().
    info = {
        '影片名字': biaoti,
        '播放地址': fabutime,
        '下載連接': lianjie,
    }
    return info


def get_cililianjie(url):
    """Fetch the download page and return the magnet link found on it."""
    response = requests.get(url)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    links_div = soup.find_all('div', class_="download")
    return links_div[0].a.get('href')  # magnet link


def get_db(setting):
    """Open and return a pymysql connection from a settings dict."""
    return pymysql.connect(**setting)


def insert(db, house):
    """Insert one scraped record into 4hu_shoujixiaoshipin.

    BUGFIX: the original interpolated the values into the SQL string with
    str.format, so any title containing a quote broke the statement (and it
    was SQL-injectable). A parameterized query lets the driver escape them.
    """
    sql = ('insert into 4hu_shoujixiaoshipin (biaoti,fabutime,lianjie) '
           'values(%s,%s,%s)')
    cursor = db.cursor()
    cursor.execute(sql, (house['影片名字'], house['播放地址'], house['下載連接']))
    db.commit()


DATABASE = {
    'host': '127.0.0.1',
    'database': '4hucom',
    'user': 'root',
    'password': 'xxh123',
    # utf8mb4 rendered as mojibake in navicat.exe; plain utf8 displays
    # Chinese text correctly, so it is kept here.
    'charset': 'utf8',
}


def main():
    """Crawl listing pages 1..43 and store every video found."""
    db = get_db(DATABASE)  # connect to the database
    for yema in range(1, 44):
        # Page 1 has a different filename from pages 2..43.
        if yema == 1:
            url = 'https://www.網站名馬賽克.com/vod/html7/index.html'
        else:
            url = 'https://www.網站名馬賽克.com/vod/html7/index_' + str(yema) + '.html'
        for item_url in get_links(url):
            time.sleep(1.0)  # throttle: be polite to the server
            house = get_house_info(item_url)
            print('獲取一條成功:{}'.format(house['影片名字']))
            insert(db, house)  # persist the scraped record


# Guarded so importing this module no longer triggers a full crawl
# (the original ran everything at import time).
if __name__ == '__main__':
    main()