# 目錄 (table of contents) - html
# requests library: requests.get(url) fetches a page much as a browser would.
# re library: regular expressions pull the wanted fields out of the HTML text.
import re


def pair_titles_with_content(html, title_start=10, title_stop=60):
    """Map titles found in *html* to content blocks, pairing them by position.

    Args:
        html: Page source to scan.
        title_start: Index of the first ``/subject/`` link to keep.
        title_stop: Index one past the last ``/subject/`` link to keep.

    Returns:
        A dict ``{title: content}``. The n-th kept title is paired with the
        n-th ``<div class="content">`` block. If the two lists differ in
        length, the surplus entries of the longer one are dropped instead of
        raising ``IndexError``.
    """
    # "." matches any character (except a newline); "*?" repeats it as few
    # times as possible, so each match stops at the first closing tag.
    contents = re.findall(r'<div class="content">(.*?)</div>', html)
    titles = re.findall(r'<a href="/subject/.*?">(.*?)</a>', html)
    # NOTE(review): the default 10..60 window comes from the original script,
    # which located it by looking up the index of two known titles;
    # presumably the links outside it are not list entries -- confirm against
    # the live page.
    return dict(zip(titles[title_start:title_stop], contents))


def scrape_ishuo_titles(url='http://ishuo.cn/', timeout=10):
    """Fetch *url*, print a ``title | content`` table and return the mapping.

    Args:
        url: Page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The dict produced by :func:`pair_titles_with_content`.

    Raises:
        requests.RequestException: On connection problems, a timeout, or a
            4xx/5xx response.
    """
    # Imported here so the parsing helper above stays importable on machines
    # where the third-party ``requests`` package is not installed.
    import requests

    response = requests.get(url, timeout=timeout)
    # Status 200 means success; an error status such as 404 (page missing)
    # raises here instead of the error page being parsed as real data.
    response.raise_for_status()
    pairs = pair_titles_with_content(response.text)
    for title, content in pairs.items():
        print(f'{title:<40} | {content}')
    return pairs


if __name__ == '__main__':
    scrape_ishuo_titles()
import re


def parse_ishuo_entries(html):
    """Extract ``{title: (content, description)}`` from every list item.

    Each ``<li class="list_li">`` element is parsed on its own, so a title is
    always paired with the content and description of the same item.

    Args:
        html: Page source to scan.

    Returns:
        A dict mapping each title to a ``(content, description)`` tuple.
        List items missing any of the three fields are skipped rather than
        aborting the whole parse with ``IndexError``.
    """
    entries = {}
    for item in re.findall(r'<li class="list_li">(.*?)</li>', html):
        content = re.search(r'<div class="content">(.*?)</div>', item)
        title = re.search(r'<a href="/subject/.*?">(.*?)</a>', item)
        # The description is the text after the link and begins with a date.
        # The original pattern hard-coded "04月" (April) and therefore failed
        # on entries from any other month; accept any one- or two-digit month.
        desc = re.search(r'</a>(\d{1,2}月.*?)</div>', item)
        if not (content and title and desc):
            continue
        entries[title.group(1)] = (content.group(1), desc.group(1))
    return entries


def scrape_ishuo_entries(url='http://ishuo.cn/', timeout=10):
    """Fetch *url*, print one ``title | (content, description)`` row per entry.

    Args:
        url: Page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The dict produced by :func:`parse_ishuo_entries`.

    Raises:
        requests.RequestException: On connection problems, a timeout, or a
            4xx/5xx response.
    """
    # Imported here so the parsing helper above stays importable on machines
    # where the third-party ``requests`` package is not installed.
    import requests

    response = requests.get(url, timeout=timeout)  # fetch the page like a browser would
    response.raise_for_status()
    entries = parse_ishuo_entries(response.text)
    for title, fields in entries.items():
        print(f'{title:<40} | {fields}')
    return entries


if __name__ == '__main__':
    scrape_ishuo_entries()
import re
from urllib.parse import urljoin, urlsplit


def extract_image_urls(html, base_url):
    """Return the absolute URL of every non-empty ``data-src`` attribute.

    Args:
        html: Page source to scan.
        base_url: URL the page was loaded from; relative and
            protocol-relative (``//host/path``) values are resolved against it.

    Returns:
        A list of absolute image URLs in document order.
    """
    return [
        urljoin(base_url, src)
        for src in re.findall(r'data-src="(.*?)"', html)
        if src
    ]


def image_filename(url):
    """Return the last path segment of *url*, without query or fragment.

    Returns an empty string when the URL path ends with ``/``.
    """
    return urlsplit(url).path.rsplit('/', 1)[-1]


def download_images(
    page_url='http://www.nipic.com/design/acg/renwu/index.html?page=1&tdsourcetag=s_pcqq_aiomsg',
    timeout=10,
):
    """Download every ``data-src`` image on *page_url* into the working directory.

    Args:
        page_url: Listing page to scan for images.
        timeout: Seconds to wait on each request before giving up.

    Returns:
        The list of file names that were written.

    Raises:
        requests.RequestException: If the listing page itself cannot be
            fetched. A failure on an individual image is reported and skipped.
    """
    # Imported here so the pure helpers above stay importable on machines
    # where the third-party ``requests`` package is not installed.
    import requests

    page = requests.get(page_url, timeout=timeout)
    page.raise_for_status()

    saved = []
    for img_url in extract_image_urls(page.text, page_url):
        name = image_filename(img_url)
        if not name:
            # Nothing usable to name the file after.
            continue
        try:
            img_response = requests.get(img_url, timeout=timeout)
            img_response.raise_for_status()
        except requests.RequestException as exc:
            print(f'skipped {img_url}: {exc}')
            continue
        # The context manager closes (and thereby flushes) the file; the
        # original left every handle open.
        with open(name, 'wb') as f:
            f.write(img_response.content)
        saved.append(name)
    return saved


if __name__ == '__main__':
    download_images()
import re
from urllib.parse import urljoin, urlsplit


def extract_video_page_urls(html, base_url='http://www.mod.gov.cn/v/index.htm'):
    """Return absolute URLs of the ``.htm``/``.html`` pages linked from *html*.

    A typical anchor on the index page looks like::

        <a href="2019-07/20/content_4846213.htm" class="img"><img src="..."></a>

    Args:
        html: Index page source to scan.
        base_url: URL the index page was loaded from; relative hrefs are
            resolved against it.

    Returns:
        A list of absolute page URLs in document order, with duplicates
        removed so a page linked several times is visited once.
    """
    hrefs = re.findall(r'<a href="([^"]+?\.html?)"', html)
    # dict.fromkeys drops repeats while keeping first-seen order.
    return list(dict.fromkeys(urljoin(base_url, href) for href in hrefs))


def extract_video_url(html):
    """Return the ``.mp4`` URL that follows a ``//Video `` marker, or ``None``.

    Example of an expected value:
    http://vv.chinamil.com.cn/asset/category3/2019/06/27/asset_357593.mp4
    """
    # The dot before "mp4" is escaped; unescaped it matches any character and
    # can end the URL early at something like "_mp4".
    match = re.search(r'//Video (.*?\.mp4)', html)
    return match.group(1) if match else None


def video_filename(url):
    """Return the last path segment of *url*, or ``'test.mp4'`` if it is empty."""
    return urlsplit(url).path.rsplit('/', 1)[-1] or 'test.mp4'


def download_videos(index_url='http://www.mod.gov.cn/v/index.htm', timeout=30):
    """Download every video reachable from *index_url* into the working directory.

    Each video is saved under the file name taken from its own URL; the
    original wrote every video to ``test.mp4``, so each download overwrote the
    previous one.

    Args:
        index_url: Index page listing the video pages.
        timeout: Seconds to wait on each request before giving up.

    Returns:
        The list of file names that were written.

    Raises:
        requests.RequestException: If the index page itself cannot be
            fetched. A failure on an individual page or video is reported
            and skipped.
    """
    # Imported here so the pure helpers above stay importable on machines
    # where the third-party ``requests`` package is not installed.
    import requests

    index = requests.get(index_url, timeout=timeout)
    index.raise_for_status()

    saved = []
    for page_url in extract_video_page_urls(index.text, index_url):
        try:
            page = requests.get(page_url, timeout=timeout)
            page.raise_for_status()
            video_url = extract_video_url(page.text)
            if video_url is None:
                # Not a video page (e.g. a navigation link); nothing to save.
                continue
            video_url = urljoin(page_url, video_url)
            name = video_filename(video_url)
            # Stream to disk in chunks so a large video is never held in
            # memory in one piece; both context managers release their
            # resource even if the transfer fails part-way.
            with requests.get(video_url, stream=True, timeout=timeout) as video:
                video.raise_for_status()
                with open(name, 'wb') as f:
                    for chunk in video.iter_content(chunk_size=65536):
                        f.write(chunk)
        except requests.RequestException as exc:
            print(f'skipped {page_url}: {exc}')
            continue
        saved.append(name)
    return saved


if __name__ == '__main__':
    download_videos()