爬蟲基礎——示例：汽車之家

時間 2019-11-12

原文：原文鏈接

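# Scrape the Autohome (汽車之家) news page: print each article's link and headline, and download its thumbnail image.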
import uuid
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


# Per-request timeout in seconds, so a stalled connection cannot hang the script.
REQUEST_TIMEOUT = 10

response = requests.get(
    url="https://www.autohome.com.cn/news/",
    timeout=REQUEST_TIMEOUT,
)
# Fail early on an HTTP error instead of trying to parse an error page.
response.raise_for_status()

# Decode the body with the encoding requests detects from the content itself
# (apparent_encoding) rather than relying on the response headers.
response.encoding = response.apparent_encoding

# 'html.parser' ships with Python; 'lxml' can be used instead if it is installed.
soup = BeautifulSoup(response.text, features='html.parser')
target = soup.find(id='auto-channel-lazyload-article')

# find() returns None when the element is absent (e.g. the page layout
# changed); fall back to an empty list so the loop below simply does nothing.
li_list = target.find_all('li') if target is not None else []

for i in li_list:
    a = i.find("a")
    if a is None:
        # Some <li> entries carry no link at all; nothing to extract.
        continue

    print(a.attrs.get('href'))  # news article URL
    txt = a.find('h3')
    print("對象: ", txt)
    if txt is not None:
        print("文本：", txt.text)  # news headline

    # Download the image attached to this entry, if there is one.
    img = a.find('img')
    img_url = img.attrs.get('src') if img is not None else None
    if not img_url:
        continue
    print(img_url)

    # urljoin resolves protocol-relative ("//host/x.jpg"), relative and
    # already-absolute URLs alike, unlike blindly prefixing "https:".
    full_img_url = urljoin(response.url, img_url)
    try:
        img_response = requests.get(full_img_url, timeout=REQUEST_TIMEOUT)
        img_response.raise_for_status()
    except requests.RequestException as exc:
        # One broken image should not abort the whole crawl.
        print(f"skipping image {full_img_url}: {exc}")
        continue

    # Random file name avoids collisions; the ".img" suffix is kept from the
    # original script even though the payload is whatever the server returned.
    file_name = str(uuid.uuid4()) + ".img"

    with open(file_name, 'wb') as f:
        f.write(img_response.content)  # .content is the raw response bytes

相關標籤/搜索