User-agent: Baiduspider Allow: / User-agent: Baiduspider-image Allow: / User-agent: YoudaoBot Allow: / User-agent: Sogou web spider Allow: / User-agent: Sogou inst spider Allow: / User-agent: Sogou spider2 Allow: / User-agent: Sogou blog Allow: / User-agent: Sogou News Spider Allow: / User-agent: Sogou Orion spider Allow: / User-agent: Sosospider Allow: / User-agent: 360Spider Allow: / User-agent: Sogouspider Allow: / User-agent: * Disallow: /
對千千音樂首頁的歌單進行爬取,建立以歌單名命名的文件夾,並下載歌單內的全部歌曲保存至本地。
難點:千千音樂的音頻是由 JS 生成的,難點就是找到它的 JS 連接。
不要加入多進程與多線程,以免增加千千音樂的負擔;只做類人爬取,僅作爲技術練習。
爬取內容請不要用作商業用途
github
https://github.com/a568972484/spider_musicweb
具體請查看項目。
import json
import os
import re
from urllib.parse import urljoin

import requests
from lxml.html import etree

# Site root; relative hrefs scraped from the pages are resolved against it.
BASE_URL = 'http://music.taihe.com/'
# Everything is stored under this directory (one sub-folder per playlist).
MUSIC_DIR = 'D:\\music\\'
# Per-playlist index file, one "<song>-<singer>&<url>" entry per line.
SONG_LIST_FILE = '歌名和歌地址.txt'
# Seconds to wait on a connect/read before giving up, so a stalled
# connection cannot hang the whole run.
REQUEST_TIMEOUT = 10

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
}


def _fetch_html(url):
    """Download *url* and return it parsed as an lxml HTML tree."""
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.encoding = 'utf8'
    return etree.HTML(response.text)


def _extract_file_link(text):
    """Return the audio URL stored under ``"file_link"`` in *text*.

    The value is expected to have its slashes backslash-escaped
    (``http:\\/\\/host\\/path``). The steps below strip the real slashes and
    then turn the remaining backslashes back into slashes.

    Raises:
        IndexError: if *text* contains no ``"file_link"`` entry.
    """
    flattened = re.sub('/', '', text)
    link = re.findall('"file_link":(.*?),', flattened)[0]
    link = link.replace('\\\\', '\\')
    link = link.replace('"', '')
    link = link.replace('http:\\', 'http:\\\\')
    return link.replace('\\', '/')


def _append_log(path, song_name, song_id, requested_url):
    """Append one song record to the text log at *path*."""
    with open(path, 'a', encoding='utf8') as fa:
        fa.write(f'歌曲名:{song_name}\n歌曲ID:{song_id}\n請求url:{requested_url}\n')


def musics_name_urls_dict():
    """Scrape the home page and return ``{playlist title: playlist URL}``."""
    page = _fetch_html(BASE_URL)
    music_xpath = '//*[@id="app"]/div/div[1]/section/section/div/div/div/div/div/h3/a/@href'
    music_name_xpath = '//*[@id="app"]/div/div[1]/section/section/div/div/div/div/div/h3/a/@title'
    # urljoin avoids the "//" that plain concatenation yields when the href
    # already starts with a slash.
    playlist_urls = [urljoin(BASE_URL, href) for href in page.xpath(music_xpath)]
    playlist_names = page.xpath(music_name_xpath)
    return dict(zip(playlist_names, playlist_urls))


def music_name_urls(musics_name_url_sdict: dict):
    """Create one folder per playlist and write its song index file.

    Args:
        musics_name_url_sdict: mapping of playlist title to playlist URL, as
            returned by ``musics_name_urls_dict``.

    For every playlist a folder ``《<title>》`` is created under ``MUSIC_DIR``
    and ``<song>-<singer>&<song url>`` lines are appended to its index file.
    """
    os.makedirs(MUSIC_DIR, exist_ok=True)

    music_name_xpath = '// *[ @ id = "songList"]/div/ul/li/div[2]/span/a[1]/@title'  # song title
    music_xpath = '// *[ @ id = "songList"]/div/ul/li/div[2]/span/a[1]/@href'  # song link
    music_singers_xpath = '//*[@id="songList"]/div/ul/li/div[3]/span/a[1]/@title'  # singer name

    for playlist_name, playlist_url in musics_name_url_sdict.items():
        # Keep only word characters so the title is usable as a folder name.
        safe_name = re.sub(r'\W', '', playlist_name)
        file_path = os.path.join(MUSIC_DIR, f'《{safe_name}》')
        os.makedirs(file_path, exist_ok=True)

        page = _fetch_html(playlist_url)
        song_names = page.xpath(music_name_xpath)
        song_urls = [urljoin(BASE_URL, href) for href in page.xpath(music_xpath)]
        singers = page.xpath(music_singers_xpath)

        # Open the index once per playlist rather than once per song.
        index_path = os.path.join(file_path, SONG_LIST_FILE)
        with open(index_path, 'a', encoding='utf8') as fa:
            for song_name, song_url, singer in zip(song_names, song_urls, singers):
                fa.write(f'{song_name}-{singer}&{song_url}\n')
        print(f'{file_path} 歌單生成完畢')


def get_music_name_id():
    """Read every playlist index file back from disk.

    Returns:
        An iterator of ``(playlist folder name, {"<song>-<singer>": song id})``
        pairs. Entries of ``MUSIC_DIR`` that have no index file are skipped.
    """
    catalog = os.listdir(MUSIC_DIR)
    print(catalog)

    playlist_dirs = []
    song_maps = []
    for entry in catalog:
        music_txt_path = os.path.join(MUSIC_DIR, entry, SONG_LIST_FILE)
        if not os.path.isfile(music_txt_path):
            # Stray file, or a folder that has no index file.
            continue
        songs = {}
        with open(music_txt_path, 'r', encoding='utf8') as fr:
            for line in fr:
                # Split on the LAST '&' so titles containing '&' stay intact.
                music_name, sep, song_url = line.strip().rpartition('&')
                if not sep:
                    continue  # blank or malformed line
                # The song id is the final path segment of the song URL.
                songs[music_name] = song_url.split('/')[-1]
        playlist_dirs.append(entry)
        song_maps.append(songs)
    return zip(playlist_dirs, song_maps)


def dump_music(zip):
    """Download every song listed in *zip* into its playlist folder.

    Args:
        zip: iterable of ``(playlist folder name, {song name: song id})``
            pairs, as produced by ``get_music_name_id``. The parameter name
            shadows the builtin; it is kept for backward compatibility.

    Successes are appended to ``下載成功的歌曲.txt`` and failures to
    ``匹配失敗歌曲.txt`` inside the playlist folder. A failing song never
    aborts the remaining downloads.
    """
    for music_file, music_name_id_dic in zip:
        file_path = os.path.join(MUSIC_DIR, music_file)
        for name, song_id in music_name_id_dic.items():
            # NOTE: deliberate placeholder left by the original author. The
            # real song-info endpoint is published in the author's GitHub
            # project and must be filled in here. Until then every request
            # fails and is recorded in the failure log.
            url = f'{song_id}'
            music_file_path = os.path.join(file_path, f'{name}.mp3')
            try:
                # Look up the song metadata and pull out the audio URL.
                data = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
                music_url = _extract_file_link(data.text)
                # Fetch the audio. A non-2xx answer is treated as a failure
                # rather than being saved as a bogus .mp3.
                response = requests.get(music_url, headers=headers, timeout=REQUEST_TIMEOUT)
                response.raise_for_status()
                with open(music_file_path, 'wb') as fw:
                    fw.write(response.content)
            except (IndexError, OSError, requests.RequestException):
                # IndexError: no file_link found; RequestException: network or
                # URL error; OSError: the file could not be written.
                print(name, song_id, 'url匹配失敗')
                _append_log(os.path.join(file_path, '匹配失敗歌曲.txt'), name, song_id, url)
            else:
                _append_log(os.path.join(file_path, '下載成功的歌曲.txt'), name, song_id, music_url)
                print(name, '下載成功')


if __name__ == '__main__':
    print('開始')
    # Build the playlist folders and their index files.
    music_name_urls(musics_name_urls_dict())
    # Read the index files back and download the songs.
    dump_music(get_music_name_id())
    # Everything ends up under D:\music\.
作者名稱:a568972484
作者博客:小小鹹魚ywy
博客連接:https://www.cnblogs.com/pythonywy
ide