最最簡單的python爬蟲教程--爬取百度百科案例

時間 2021-01-19

標籤 html 正則表達式 app dom ide url code xml htm blog 欄目 Python 简体版

原文鏈接

from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
import random
# Random-walk crawler over Baidu Baike: starting from a seed article, fetch a
# page, print its title and URL, then hop to a randomly chosen article link
# found on that page. Repeats for a fixed number of steps.

# Site root; the relative "/item/..." paths kept in `his` are appended to it.
base_url = "https://baike.baidu.com"

# History of visited relative paths; the last entry is the page to fetch next.
# Seeded with a single percent-encoded article path.
his = ["/item/%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB/5162711"]

# Matches hrefs that end in "/item/" followed only by percent-encoded bytes
# (each "%" plus two characters). Compiled once, outside the loop.
item_href_pattern = re.compile(r"/item/(%.{2})+$")

# Visit up to 20 pages.
for i in range(20):
    # Build the absolute URL of the most recent history entry.
    url = base_url + his[-1]

    # Download the page and decode it as UTF-8; the `with` block closes the
    # HTTP response once the body has been read.
    with urlopen(url) as response:
        html = response.read().decode('utf-8')

    # Parse the HTML with BeautifulSoup using the lxml parser.
    soup = BeautifulSoup(html, features='lxml')

    # Print the step index, the page's first <h1> text and its URL. A page
    # without an <h1> prints an empty title instead of raising AttributeError.
    heading = soup.find('h1')
    title = heading.get_text() if heading is not None else ''
    print(i, title, '    url: ', url)

    # Collect <a> tags that have target="_blank" and an href matching the
    # article-link pattern above.
    sub_urls = soup.find_all("a", {"target": "_blank", "href": item_href_pattern})

    if sub_urls:
        # Pick one candidate link at random and make it the next page.
        his.append(random.choice(sub_urls)['href'])
    elif len(his) > 1:
        # Dead end: drop this page so the next iteration retries from the
        # previous one and can choose a different link.
        his.pop()
    else:
        # Only the seed is left and it offers no links. Popping it would empty
        # the history and make `his[-1]` raise IndexError, so stop the walk.
        break
            ![](https://s4.51cto.com/images/blog/201803/27/2ec8773ff147c38305ae581297c51351.png?x-oss-process=image/watermark,size_16,text_QDUxQ1RP5Y2a5a6i,color_FFFFFF,t_100,g_se,x_10,y_10,shadow_90,type_ZmFuZ3poZW5naGVpdGk=)