# Link scraper: download a page and list every href target found in it.
import re


def extract_links(html):
    """Return every ``href="..."`` attribute value in *html*, in document order.

    The pattern is lazy (``.*?``), so each match stops at the first closing
    double quote. ``.`` matches any character except a newline, so values
    that span lines are not returned.
    """
    return re.findall(r'href="(.*?)"', html)


def fetch_links(url='http://duanziwang.com/'):
    """Download *url* the way a browser would and return the hrefs it contains.

    Raises ``requests.HTTPError`` when the server answers with an error
    status, and ``requests.Timeout`` when it does not answer within 10 s.
    """
    # Imported here so extract_links() stays importable without the
    # third-party HTTP client installed.
    import requests

    response = requests.get(url, timeout=10)
    response.raise_for_status()
    return extract_links(response.text)


if __name__ == '__main__':
    print(fetch_links())
# Joke scraper: download the joke site's front page and pair each joke title
# with its text.
import re

# Marker titles whose positions the original author looked up to pick the
# slice of real joke titles (noted as indexes 9 and 60 at the time of writing).
FIRST_MARKER_TITLE = '活得糊塗的人,容易幸福'
LAST_MARKER_TITLE = '購買銀行理財產品虧損後如何起訴'


def extract_contents(html):
    """Return the inner text of every ``<div class="content">`` in *html*.

    The match is lazy and ``.`` does not cross newlines, so only divs whose
    content sits on a single line are returned.
    """
    return re.findall(r'<div class="content">(.*?)</div>', html)


def extract_titles(html):
    """Return the link text of every ``<a href="/subject/">`` anchor in *html*."""
    return re.findall(r'<a href="/subject/">(.*?)</a>', html)


def build_title_content_map(titles, contents):
    """Map each title to the content at the same position.

    Pairing stops at the shorter of the two sequences, so a length mismatch
    cannot raise ``IndexError``. When a title occurs more than once, the
    last occurrence wins.
    """
    return dict(zip(titles, contents))


def scrape_jokes(url='http://duanziwang.com/', start=10, stop=60):
    """Fetch *url*, print diagnostics, and return a ``{title: content}`` dict.

    ``start`` and ``stop`` select which of the extracted titles are treated
    as joke titles; the defaults are the slice the script was written with.
    """
    # Imported here so the parsing helpers stay importable without the
    # third-party HTTP client installed.
    import requests

    response = requests.get(url, timeout=10)
    print(response.status_code)
    print(response.encoding)
    html = response.text
    print(html)

    contents = extract_contents(html)
    print(contents)

    titles = extract_titles(html)
    # Show where the marker titles currently sit so the slice bounds can be
    # re-checked; the page changes over time, so a missing marker is reported
    # instead of aborting the run.
    for marker in (FIRST_MARKER_TITLE, LAST_MARKER_TITLE):
        try:
            print(titles.index(marker))
        except ValueError:
            print(f'{marker!r} not found in titles')

    titles = titles[start:stop]
    print(type(titles))

    # NOTE(review): contents are paired from their first element while titles
    # are sliced, exactly as the original script did - confirm the alignment
    # against the live page.
    title_content = build_title_content_map(titles, contents)
    for title, content in title_content.items():
        print(f'{title:<40} {content:<1000}')
    return title_content


if __name__ == '__main__':
    scrape_jokes()
# 3. 暱圖網爬蟲 (html)
# Image scraper: download every lazily-loaded image referenced by a gallery
# page into the current directory.
import posixpath
import re
from urllib.parse import urljoin, urlsplit


def extract_image_urls(html, base_url):
    """Return absolute URLs for every non-empty ``data-src="..."`` in *html*.

    Values are resolved against *base_url*, so protocol-relative
    (``//host/x.jpg``) and relative (``img/x.jpg``) references become URLs
    that can be fetched directly. Absolute URLs pass through unchanged.
    """
    sources = re.findall(r'data-src="(.*?)"', html)
    # An empty data-src would resolve to the page itself, so drop it.
    return [urljoin(base_url, src) for src in sources if src]


def filename_from_url(url):
    """Return the last path segment of *url*, without query or fragment.

    Returns ``''`` when the path ends in ``/`` or is empty.
    """
    return posixpath.basename(urlsplit(url).path)


def download_images(page_url='http://www.nipic.com/design/acg/renwu/index.html?page=1'):
    """Fetch *page_url* and save each referenced image under its own filename.

    Images that cannot be named or that the server refuses are skipped.
    Raises ``requests.HTTPError`` if the gallery page itself fails.
    """
    # Imported here so the parsing helpers stay importable without the
    # third-party HTTP client installed.
    import requests

    response = requests.get(page_url, timeout=10)
    response.raise_for_status()

    for image_url in extract_image_urls(response.text, page_url):
        print(image_url)
        name = filename_from_url(image_url)
        if not name:
            continue
        image_response = requests.get(image_url, timeout=10)
        if not image_response.ok:
            # Do not store an error page under an image filename.
            continue
        # The context manager flushes and closes the file even if the
        # write fails part-way.
        with open(name, 'wb') as image_file:
            image_file.write(image_response.content)


if __name__ == '__main__':
    download_images()
# 4. 視頻爬蟲 (python)
# Video scraper: walk the video index page, open each linked detail page,
# and download the .mp4 it references.
import posixpath
import re
from urllib.parse import urljoin, urlsplit


def extract_video_page_urls(html, base_url):
    """Return absolute URLs of the ``.htm`` pages linked from *html*.

    Each ``<a href="...">`` value is cut at its first ``htm`` (the lazy
    anchor pattern can drag trailing attributes such as ``" class="img``
    into the capture) and resolved against *base_url*. Hrefs without
    ``htm`` are ignored, and duplicates are dropped while keeping order.
    """
    pages = []
    for href in re.findall(r'<a href="(.*?)">', html):
        match = re.search(r'.*?htm', href)
        if match is None:
            continue
        pages.append(urljoin(base_url, match.group(0)))
    # dict.fromkeys keeps first-seen order while removing repeats.
    return list(dict.fromkeys(pages))


def extract_video_url(html):
    """Return the first ``.mp4`` URL that follows a ``//Video`` marker in *html*.

    Returns ``None`` when the page carries no such marker. Surrounding
    whitespace is stripped from the captured URL.
    """
    match = re.search(r'//Video(.*?\.mp4)', html)
    if match is None:
        return None
    return match.group(1).strip()


def video_filename(url, default='test.mp4'):
    """Return the last path segment of *url*, or *default* if there is none."""
    return posixpath.basename(urlsplit(url).path) or default


def download_videos(index_url='http://www.mod.gov.cn/v/index.htm'):
    """Download every video reachable from *index_url* into the current directory.

    Each video is saved under the filename taken from its own URL, so one
    download no longer overwrites the previous one. Pages without a video
    and downloads the server refuses are skipped. Raises
    ``requests.HTTPError`` if the index page itself fails.
    """
    # Imported here so the parsing helpers stay importable without the
    # third-party HTTP client installed.
    import requests

    index_response = requests.get(index_url, timeout=10)
    index_response.raise_for_status()

    for page_url in extract_video_page_urls(index_response.text, index_url):
        page_response = requests.get(page_url, timeout=10)
        if not page_response.ok:
            continue
        # Expected shape, per the original author's sample:
        # http://vv.chinamil.com.cn/asset/category3/2019/06/27/asset_357593.mp4
        video_url = extract_video_url(page_response.text)
        if video_url is None:
            continue
        video_response = requests.get(video_url, timeout=60)
        if not video_response.ok:
            continue
        with open(video_filename(video_url), 'wb') as video_file:
            video_file.write(video_response.content)


if __name__ == '__main__':
    download_videos()