# 1. Crawl the URLs of all posts on a cnblogs (博客園) blog, together with the post count and the corresponding titles
import functools
import json
import re

import requests
from lxml.html import etree
# Decorator that combines the collected post links with their titles
def func_1_deco(func_1):
    """Decorate a URL collector so that it returns a title -> URL mapping.

    ``func_1`` must return a sequence whose first item is a count and whose
    second item is an iterable of page URLs.  The wrapper downloads every
    URL, reads the post title from the element with id
    ``cb_post_title_url`` and builds a dict from the results.

    Returns:
        A wrapper with the same call signature as ``func_1``.  It returns a
        dict whose ``'count'`` key holds the count reported by ``func_1``;
        every other key is a post title mapped to that post's URL.  If two
        pages share a title, the later URL replaces the earlier one.

    Raises:
        requests.RequestException: if a page cannot be downloaded (including
            when the server does not answer within the timeout).
    """
    name_xpath = '//*[@id="cb_post_title_url"]/text()'
    request_timeout = 10  # seconds; without it a stalled server blocks forever

    @functools.wraps(func_1)  # keep the wrapped function's name and docstring
    def wrapper(*args, **kwargs):
        result = func_1(*args, **kwargs)
        count = result[0]
        url_lis = result[1]
        dic = {'count': count}
        for url in url_lis:
            response = requests.get(url, timeout=request_timeout)
            response_html = etree.HTML(response.text)
            titles = response_html.xpath(name_xpath)
            if not titles:
                # The page has no post-title element, so it is not a post
                # page; skip it rather than aborting the crawl with an
                # IndexError.
                continue
            name = titles[0]
            print(name)
            dic[name] = url
        return dic

    return wrapper
@func_1_deco
def func(url):
    """Collect the unique post URLs listed on a blog's paginated index.

    Pages ``default.html?page=1``, ``?page=2``, ... under ``url`` are
    downloaded one after another.  From each page every ``href`` value that
    starts with ``http``, ends with ``html`` and does not contain
    ``archive`` is kept.  Crawling stops as soon as a page contributes no
    URL that was not already seen.

    Args:
        url: Home page address of the blog.  A trailing ``/`` is added when
            it is missing.

    Returns:
        Before decoration, the tuple ``(number_of_urls, urls)`` with ``urls``
        in first-seen order.  Because of ``@func_1_deco``, callers receive
        whatever that decorator builds from this tuple.

    Raises:
        requests.RequestException: if an index page cannot be downloaded
            (including when the server does not answer within the timeout).
    """
    base = url if url.endswith('/') else url + '/'
    request_timeout = 10  # seconds; without it a stalled server blocks forever
    # A dict serves as an insertion-ordered set, so duplicates are dropped
    # while the returned order stays deterministic.
    found = {}
    page = 1
    while True:
        seen_before = len(found)
        response = requests.get(
            f'{base}default.html?page={page}', timeout=request_timeout
        )
        for link in re.findall(' href="(.*?)"', response.text, re.S):
            if (
                link.startswith('http')
                and link.endswith('html')
                and 'archive' not in link
            ):
                found[link] = None
        page += 1
        if len(found) == seen_before:
            # This page added nothing new: the listing is exhausted.
            return len(found), list(found)  # number of posts, post URLs
# Replace the placeholder string with your blog's home page address; the
# address is expected to end with "/".  The resulting dict has one 'count'
# entry holding the count, and every other entry maps a post title to the
# corresponding URL.
dic = func('你的博客的首頁地址')