python爬取豆瓣視頻信息代碼

  這裏使用 pyquery 庫(類似 jQuery 的 Python 庫)爬取豆瓣上的視頻信息。


java

一:代碼

from urllib.request import quote
from pyquery import PyQuery as pq
import requests
import pandas as pd


def get_text_page(movie_name):
    '''
    Fetch the Douban search-result page for a movie.

    Args:
        movie_name: Search keyword; it is appended verbatim to the query
            string, so it should already be URL-quoted.

    Returns:
        The body of the search response as text.
    '''
    user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
    request_headers = {'Host': 'www.douban.com', 'User-Agent': user_agent}
    # join() keeps the "must be a str" requirement of plain concatenation.
    search_url = ''.join(('https://www.douban.com/search?q=', movie_name))
    response = requests.get(search_url, headers=request_headers, timeout=5)
    return response.text


def get_last_url(this_text):
    '''
    Extract the address of the first hit from a search-result page.

    Args:
        this_text: HTML source of the search-result page.

    Returns:
        The ``href`` of the first element matching ``.title a``, or an
        empty string when nothing matches.
    '''
    anchors = pq(this_text)('.title a').items()
    first_anchor = next(iter(anchors), None)
    if first_anchor is None:
        return ''
    return first_anchor.attr.href


def the_last_page(this_url):
    '''
    Download the page at the given address.

    Args:
        this_url: Address of the page to fetch.

    Returns:
        The body of the response as text.
    '''
    user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
    response = requests.get(
        this_url,
        headers={'User-Agent': user_agent},
        timeout=20,
    )
    return response.text


def the_last_text(this_text,movie_name):
    '''
    Save the poster, a text summary and the cast photos found on a page.

    All files are written relative to the current working directory:
    ``<movie_name>.jpg`` (poster), ``<movie_name>.txt`` (title, info block,
    rating block and related-info block) and one ``<name>.jpg`` per cast
    entry.

    Args:
        this_text: HTML source of the detail page.
        movie_name: Base name used for the poster and the summary file.

    Returns:
        None.
    '''
    download_timeout = 20  # seconds; an image download must not hang forever

    doc = pq(this_text)

    # Title
    title = doc('#content h1').text()

    # Poster: skip the download when the page carries no poster image
    # instead of handing None to requests.
    photo_url = doc('.nbgnbg img').attr.src
    if photo_url:
        r = requests.get(photo_url, timeout=download_timeout)
        with open('{m}.jpg'.format(m = movie_name),'wb') as f:
            f.write(r.content)

    # Info block, rating block and synopsis block
    message = doc('#info').text()
    grade = doc('#interest_sectl').text()
    things = doc('.related-info').text()

    # Explicit UTF-8: with the platform default encoding some text cannot be
    # encoded on every system, which previously needed a catch-all fallback
    # that silently dropped the synopsis.
    with open('{0}.txt'.format(movie_name),'w',encoding='utf-8') as f:
        f.writelines([title,'\n','\n\n',message,'\n\n',grade,'\n\n',things])

    # Cast: names come from the '.info' elements, photos from the '.avatar'
    # elements under '#celebrities'; they are paired by position.
    names = [item.text() for item in doc('.info').items()]
    avatars = doc('#celebrities').find('.avatar').items()
    # zip() keeps name and photo aligned even when one entry is skipped, and
    # stops once either side runs out so no photo is fetched without a name.
    for person, avatar in zip(names, avatars):
        style = avatar.attr('style') or ''
        # The photo address is whatever sits between the first '(' and the
        # following ')' of the inline style; surrounding quotes are dropped.
        person_download_url = style.partition('(')[2].partition(')')[0].strip('\'" ')
        if not person_download_url:
            continue
        r = requests.get(person_download_url, timeout=download_timeout)
        try:
            with open('{name}.jpg'.format(name = person),'wb') as f:
                f.write(r.content)
        except OSError:
            # Best effort: a name that cannot be used as a file name only
            # costs that one photo.
            continue


def lookUrl(this_text,my_str):
    '''
    Collect the links under ``.bs li a`` and write them to a CSV file.

    Args:
        this_text: HTML source of the detail page.
        my_str: Movie name, used as the prefix of the CSV file name.

    Returns:
        None. The CSV is written to the current working directory.
    '''
    links = list(pq(this_text)('.bs li a').items())
    table = pd.DataFrame({
        '觀看平臺': [link.text() for link in links],
        '觀看地址': [link.attr.href for link in links],
    })
    csv_name = "{movie_name}的觀看地址.csv".format(movie_name = my_str)
    table.to_csv(csv_name, index=False, encoding='utf_8_sig', sep=',')


def main():
    '''
    Read a movie name from standard input and scrape its Douban page.

    The name is searched on Douban, the first hit is downloaded, and its
    details, images and viewing links are saved by ``the_last_text`` and
    ``lookUrl``. When the search yields no usable address a message is
    printed and nothing is downloaded.

    Returns:
        None.
    '''
    my_str = input('')
    movie_name = quote(my_str)
    page_text = get_text_page(movie_name)   # source of the search-result page
    last_url = get_last_url(page_text)      # address of the first hit
    if not last_url:
        # Without this guard the empty address would be passed on to
        # requests and fail with an unhelpful invalid-URL error.
        print('No Douban search result found for: {0}'.format(my_str))
        return
    page_text2 = the_last_page(last_url)    # source of the detail page
    the_last_text(page_text2,my_str)        # save details and images
    lookUrl(page_text2,my_str)              # save the viewing links


# Run the scraper only when this file is executed as a script, so that
# importing it does not prompt for input or touch the network.
if __name__ == '__main__':
    main()

二:結果如下(部分例子)

1.輸入天氣之子

2.輸入百變小櫻魔法卡

必須是已經上映的電影纔有觀看地址。

3.獨立日

相關文章
相關標籤/搜索