爬蟲之BeautifulSoup (三)

簡單來講,Beautiful Soup是python的一個庫,最主要的功能是從網頁抓取數據。官方解釋以下:

'''
Beautiful Soup提供一些簡單的、python式的函數用來處理導航、搜索、修改分析樹等功能。
它是一個工具箱,經過解析文檔爲用戶提供須要抓取的數據,由於簡單,因此不須要多少代碼就能夠寫出一個完整的應用程序。
'''

安裝

#安裝 Beautiful Soup
pip install beautifulsoup4

#安裝解析器
Beautiful Soup支持Python標準庫中的HTML解析器,還支持一些第三方的解析器,其中一個是 lxml .根據操作系統不一樣,能夠選擇下列方法來安裝lxml:

$ apt-get install Python-lxml

$ easy_install lxml

$ pip install lxml

另外一個可供選擇的解析器是純Python實現的 html5lib , html5lib的解析方式與瀏覽器相同,能夠選擇下列方法來安裝html5lib:

$ apt-get install Python-html5lib

$ easy_install html5lib

$ pip install html5lib

   解析器

 

簡單使用

from bs4 import BeautifulSoup

# Sample HTML document used by all the navigation examples below.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="sister"><b>$37</b></p>

<p class="story" id="p">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" >Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""


# Way 1: parse an in-memory string.
# soup = BeautifulSoup(html_doc, "lxml")
# print(soup)

# Way 2: parse an open file object.
# soup = BeautifulSoup(open('a.html'), "lxml")
# print(soup)


soup = BeautifulSoup(html_doc, "lxml")
# print(soup)

# prettify() RETURNS the nicely indented markup; it does not modify the
# soup in place, so the result must be printed (the original discarded it).
print(soup.prettify())

# First <p> tag in the document.
print(soup.p)
# <b> tag nested inside that <p>.
print(soup.p.b)
# Text content of that <b> tag.
print(soup.p.b.text)

# All direct children of <body> (includes whitespace text nodes).
print(soup.body.contents)

# Attribute dict of the first <p> tag.
print(soup.p.attrs)

# Direct children of the <p> tag -- .children is an iterator, so list() it.
print(list(soup.p.children))

# Every descendant of the <p> tag, depth-first.
print(list(soup.p.descendants))

# Parent tag of the <p>.
print(soup.p.parent)

# Every ancestor of the <p>: parent, grandparent, ... up to the document.
print(list(soup.p.parents))

# href attribute of the first <a> tag.
print(soup.a.attrs['href'])

搜索文檔樹

from bs4 import BeautifulSoup
import re

# Same sample document, used by the search-the-tree examples below.
html_doc ="""

<html><head><title>The Dormouse's story</title></head>
<body>
<p class="sister"><b>$37</b></p>

<p class="story" id="p">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" >Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, "lxml")
# print(soup)

# prettify() returns the indented markup (it does not mutate the soup),
# so print the returned string instead of throwing it away.
print(soup.prettify())

# 1. Search by tag name / text / attributes.
# All <p> tags.
print(soup.find_all(name='p'))

# <p> tags whose text is exactly "$37".
print(soup.find_all(name='p', text='$37'))

# <a> tags whose id attribute is "link3".
print(soup.find_all(name='a', attrs={"id": "link3"}))

# 2. Search with regular expressions.
# Tags whose name starts with "p".
print(soup.find_all(name=re.compile("^p")))

# Tags whose name starts with "a".
print(soup.find_all(name=re.compile("^a")))

# <p> tags whose id starts with "link" -- none exist here, so this
# deliberately prints an empty list.
print(soup.find_all(name="p", attrs={"id": re.compile("^link")}))

# <a> tags whose id starts with "link".
print(soup.find_all(name="a", attrs={"id": re.compile("^link")}))

# <p> tags whose class matches "story".
print(soup.find_all(name="p", attrs={"class": re.compile("story")}))

# 3. Search with lists.
# All <p> and <a> tags.
print(soup.find_all(name=['p', 'a']))

# Every text node matching the regex "Elsie".
print(soup.find_all(text=[re.compile("Elsie")]))

# <a> tags whose text matches "Elsie".
print(soup.find_all(name=['a'], text=[re.compile("Elsie")]))


# 4. Search with True.
# Every tag in the document.
print(soup.find_all(name=True))

# <a> tags that have an id attribute.
print(soup.find_all(name="a", attrs={"id": True}))

# <a> tags that have a class attribute.
print(soup.find_all(name="a", attrs={"class": True}))


# 5. Search with a filter function.
def has_class_without_id(tag):
    # Match tags that carry a class attribute but no id attribute.
    # (The original name, have_id_not_class, said the opposite of what
    # the body checks -- renamed to match the actual behavior.)
    if tag.has_attr('class') and not tag.has_attr('id'):
        return tag


# All tags that have a class but no id.
print(soup.find_all(has_class_without_id))

 

給新熱抽屜網1-100頁全部用戶點贊

import requests
from bs4 import BeautifulSoup

# Step 1: fetch the home page to obtain the anonymous session cookies.
# chouti.com validates these cookies on the login and vote requests below.
index_res = requests.get('https://dig.chouti.com/', headers={
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
})
# print(index_res.cookies.get_dict())
index_cookies = index_res.cookies.get_dict()

# Login request recipe:
#   url:     https://dig.chouti.com/login
#   method:  POST
#   headers: user-agent, referer, cookie
#   form:    phone, password, oneMonth

# Step 2: log in, carrying the home-page cookies.
login_url = 'https://dig.chouti.com/login'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    'referer': 'https://dig.chouti.com/',
}

# SECURITY NOTE(review): real credentials are hard-coded here; they should
# be loaded from the environment or a config file, never committed.
form_data = {
    'phone': '8615622792660',
    'password': 'k46709394',
    'oneMonth': '1'
}

# POST the login form.
login_res = requests.post(login_url, cookies=index_cookies, headers=headers, data=form_data)
print(login_res)

# Response body reports whether the login succeeded.
print(login_res.text)

# Single-post vote example, kept for reference:
# vote_res = requests.post('https://dig.chouti.com/link/vote?linksId=25068561', headers=headers, cookies=index_cookies)
# print(vote_res.text)

# Step 3: walk the hot/recent listing pages and upvote every post.
# Listing url pattern: https://dig.chouti.com/all/hot/recent/<page>
# range(1, 101) covers pages 1-100 inclusive; the original range(1, 100)
# stopped at page 99 despite the stated goal of 100 pages.
for page in range(1, 101):
    hot_url = 'https://dig.chouti.com/all/hot/recent/%s' % page
    # Fetch the listing page with all posts for this page.
    hot_res = requests.get(hot_url, headers=headers)
    # print(hot_res.text)

    soup = BeautifulSoup(hot_res.text, 'lxml')

    # Each post is a div.item; its div.part2 carries the numeric post id
    # in the "share-linkid" attribute.
    for item in soup.find_all(name='div', attrs={"class": "item"}):
        part2 = item.find(name='div', attrs={"class": "part2"})
        if part2 is None:
            # Not a real post block -- skip instead of crashing on None.
            continue
        link_id = part2.get("share-linkid")
        if not link_id:
            # Missing id -- the original would have POSTed "None" here.
            continue
        # Upvote this post, reusing the authenticated session cookies.
        vote_res = requests.post('https://dig.chouti.com/link/vote?linksId=%s' % link_id,
                                 headers=headers, cookies=index_cookies)
        print(vote_res.text)
        # Example success payload (the original left this string literal
        # unterminated, which made the whole file a syntax error):
        # {"result":{"code":"9999", "message":"推薦成功", "data":{...}}}
相關文章
相關標籤/搜索