抓取淘寶美食數據

時間 2019-11-10

標籤抓取淘寶美食數據简体版

原文原文鏈接

1. 進入淘寶，主頁：https://www.taobao.com/

2. 搜索「美食」，點擊搜索按鈕。

3. 獲得當前搜索結果中每個商品的：price（價格）、location（銷售地）、shop（商店名稱）、image（圖片）、title（商品名稱）、product_link（商品鏈接）。

4. 逐頁切換，直到最後一頁為止。

代碼：

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import re
from bs4 import BeautifulSoup

# Command-line switches handed to the browser driver service.
# NOTE(review): these look like PhantomJS switches; confirm that
# chromedriver actually honours them.
SERVICE_ARGS = [
    '--load-images=false',
    '--disk-cache=false',
]

# Module-level browser state used by the functions in this script.

# Run Chrome headless (no visible window).
options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(options=options, service_args=SERVICE_ARGS)

# Optionally fix the browser window size.
# driver.set_window_size(width=1360, height=768)

# Explicit-wait helper bound to ``driver``; gives up after 5 seconds.
wait = WebDriverWait(driver, timeout=5)


def next_page(page):
    """Jump the search results to ``page`` and parse that page.

    Types the page number into the pager's jump box, clicks the submit
    button, waits until the pager's active item shows the requested number,
    then hands off to ``get_product_info``.

    Args:
        page: Page number to switch to.

    Raises:
        selenium.common.exceptions.TimeoutException: If the pager controls
            do not appear, or the active pager item does not show ``page``,
            within the timeout of the module-level ``wait``.
    """
    print("正在切換===", page, "頁")

    # Wait for the jump box instead of looking it up immediately: the pager
    # may not be in the DOM yet when the previous page has only just rendered
    # its items. ``find_element_by_*`` is also gone in Selenium 4.3+, so all
    # lookups go through ``By`` locators.
    page_box = wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, "#mainsrp-pager div div div div.form input")))
    # Clear whatever page number is currently in the box.
    page_box.clear()
    page_box.send_keys(str(page))

    submit = wait.until(EC.element_to_be_clickable(
        (By.CSS_SELECTOR,
         "#mainsrp-pager div div div div.form span.btn.J_Submit")))
    submit.click()

    # Confirm the switch succeeded: the highlighted pager item must contain
    # the requested page number.
    wait.until(
        EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR,
             "#mainsrp-pager > div > div > div > ul > li.item.active"),
            str(page)))

    # Parse the data of the page we just switched to.
    get_product_info(page)


def _first_match(node, selector):
    """Return the first descendant of ``node`` matching ``selector``, or None."""
    matches = node.select(selector)
    return matches[0] if matches else None


def get_product_info(page):
    """Parse and print every product on the currently loaded results page.

    Waits until at least one product item is present, parses the driver's
    page source with BeautifulSoup, and prints one dict per product.

    Args:
        page: Page number, used only for the progress message.

    Returns:
        list[dict]: One dict per product with the keys ``location``,
        ``shopname``, ``title``, ``iamge`` and ``data_link``. A value is
        ``None`` when the corresponding element or attribute is missing
        from that product's markup.

    Raises:
        selenium.common.exceptions.TimeoutException: If no product item
            appears within the timeout of the module-level ``wait``.
    """
    print("當前正在解析========", page, "頁")

    # Make sure the item list has been rendered before reading the source.
    wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, "#mainsrp-itemlist .items .item")))

    soup = BeautifulSoup(driver.page_source, "lxml")

    items = []
    # Every product card on the current page.
    for product in soup.select("#mainsrp-itemlist .items .item"):
        print("---" * 100)

        # Look each field up defensively: a single card that lacks one of
        # these nodes must not abort parsing of the rest of the page.
        location_tag = _first_match(product, ".location")
        shop_tag = _first_match(product, ".shopname")
        title_tag = _first_match(product, ".title .J_ClickStat")
        image_tag = _first_match(product, "img")
        link_tag = _first_match(product, ".pic-link.J_ClickStat.J_ItemPicA")

        item = {
            # Place the item ships from.
            "location": location_tag.text if location_tag is not None else None,
            # Shop name.
            "shopname": shop_tag.text if shop_tag is not None else None,
            # Product title.
            "title": title_tag.text if title_tag is not None else None,
            # Product image URL. NOTE: the key spelling "iamge" is kept
            # as-is so the printed output stays unchanged.
            "iamge": image_tag.get("data-src") if image_tag is not None else None,
            # Link to the product page.
            "data_link": link_tag.get("href") if link_tag is not None else None,
        }

        print(item)
        items.append(item)

    return items


# 獲得總頁數

def get_total_page():
    """Open Taobao, search for "美食", parse page 1 and return the page count.

    Returns:
        str: The first run of digits found in the text of the element with
        class ``total``. The caller converts it with ``int()``.

    Raises:
        selenium.common.exceptions.TimeoutException: If the search box or
            the ``total`` element does not appear within the timeout of
            the module-level ``wait``.
        ValueError: If the ``total`` element's text contains no digits.
    """
    driver.get("https://www.taobao.com/")

    # Wait for the search box to appear, then type the query.
    search_box = wait.until(EC.presence_of_element_located((By.ID, "q")))
    search_box.clear()
    search_box.send_keys("美食")

    # Click the search button. ``find_element_by_*`` is gone in
    # Selenium 4.3+, so use a ``By`` locator.
    driver.find_element(By.CSS_SELECTOR, ".btn-search").click()

    # The results page loads after the click; wait for the pager's total
    # element instead of reading it immediately, which races with the load.
    total_text = wait.until(
        EC.presence_of_element_located((By.CLASS_NAME, "total"))).text
    match = re.search(r'\d+', total_text)
    if match is None:
        raise ValueError(
            "no page count found in pager text: {!r}".format(total_text))
    total_num = match.group()

    # We are already on page 1, so parse it here.
    get_product_info(1)

    return total_num


if __name__ == "__main__":
    # Quit the browser in ``finally`` so a failure on any page (timeout,
    # missing element, ...) does not leave a headless Chrome process behind.
    try:
        # Loads the search results and parses page 1 as a side effect.
        total_page = get_total_page()
        print("總頁數===", total_page)

        # Page 1 is already handled; walk pages 2..last.
        for page in range(2, int(total_page) + 1):
            print(page)
            next_page(page)
    finally:
        # Shut the browser down.
        driver.quit()

運行效果：

相關標籤/搜索

每日一句

每一个你不满意的现在，都有一个你没有努力的曾经。