2019.07.22(day07)學習筆記(爬蟲)

day07 StudyNote

1.爬蟲

import requests
import re

# Fetch a page the way a browser would. The Taobao response is kept only as a
# first example; it is immediately replaced by the duanziwang response below.
response = requests.get('http://www.taobao.com')
response = requests.get('http://duanziwang.com/')
data = response.text  # fixed typo: was `reponse`, which raised NameError

# `.` matches any character, `*` repeats the preceding token zero or more
# times, and the trailing `?` makes the match non-greedy so each href value
# stops at its own closing quote.
res = re.findall('href="(.*?)"', data)
print(res)

2.段子網

import requests
import re

# Download the duanziwang front page and show basic response details.
response = requests.get('http://duanziwang.com/')
print(response.status_code)
print(response.encoding)
data = response.text
print(data)

# `.` matches any character, `*` repeats the preceding token zero or more
# times, and the trailing `?` makes the match non-greedy.
content_res = re.findall('<div class="content">(.*?)</div>', data)
# The page text must be passed as the second argument (it was missing).
title_res = re.findall('<a href="/subject/">(.*?)</a>', data)

# Locate the two marker titles; the original note recorded indexes 9 and 60.
# list.index raises ValueError if the page no longer contains these titles.
print(title_res.index('活得糊塗的人,容易幸福'))
print(title_res.index('購買銀行理財產品虧損後如何起訴'))
title_res = title_res[10:60]  # keep only the titles between the two markers
print(type(title_res))

# Pair each title with a content entry. zip stops at the shorter list, so a
# length mismatch no longer raises IndexError.
# NOTE(review): content_res is not sliced like title_res, so pairs are matched
# purely by position -- confirm this alignment against the live page.
title_content_dic = dict(zip(title_res, content_res))

for i in title_content_dic.items():  # each i is a (title, content) tuple
    print(f'{i[0]:<40} {i[1]:<1000}')

3.暱圖網爬蟲

import requests
import re

# Fetch the nipic listing page the way a browser would.
response = requests.get('http://www.nipic.com/design/acg/renwu/index.html?page=1')
data = response.text

# Collect every data-src attribute value; `(.*?)` is a non-greedy capture
# that stops at the closing quote of each attribute.
res = re.findall('data-src="(.*?)"', data)

for i in res:  # each i is one captured data-src string
    print(i)
    res_response = requests.get(i)
    res_data = res_response.content  # raw bytes of the downloaded resource
    res_name = i.split('/')[-1]  # last path segment is used as the file name

    # The context manager closes (and flushes) the file after every download;
    # the original left one open handle per image.
    with open(res_name, 'wb') as f:
        f.write(res_data)

4.視頻爬蟲

import requests
import re

# Fetch the video index page and pull every anchor href out of it.
response = requests.get('http://www.mod.gov.cn/v/index.htm')
# response.encoding = 'utf8'
data = response.text

mp4_res2 = re.findall('<a href="(.*?)">', data)

for i in mp4_res2:  # type:str
    # Keep only hrefs that contain an "...htm" page path; the original indexed
    # [0] unconditionally and raised IndexError on the first non-matching link.
    page_match = re.findall('(.*?htm)', i)
    if not page_match:
        continue
    res = 'http://www.mod.gov.cn/v/' + page_match[0]

    page_data = requests.get(res).text
    # Example of the expected video address:
    # http://vv.chinamil.com.cn/asset/category3/2019/06/27/asset_357593.mp4
    video_match = re.findall('//Video(.*?.mp4)', page_data)
    if not video_match:
        continue  # this page has no "//Video ... .mp4" marker
    url_res = video_match[0]

    mp4_response = requests.get(url_res)
    mp4_data = mp4_response.content
    # NOTE(review): every iteration writes to the same 'test.mp4', so only the
    # last downloaded video survives; add a `break` here to stop after one.
    with open('test.mp4', 'wb') as f:  # closes the handle after each write
        f.write(mp4_data)
相關文章
相關標籤/搜索