selenium+phantomjs+pyquery 爬取淘寶商品信息

 1 from selenium import webdriver
 2 from selenium.common.exceptions import TimeoutException
 3 from selenium.webdriver.common.by import By
 4 from selenium.webdriver.support.ui import WebDriverWait
 5 from selenium.webdriver.support import expected_conditions as EC
 6 import re
 7 from pyquery import PyQuery as pq
 8 from config import *
 9 import pymongo
10 
# MongoDB handle used by save_to_mongo(); connection settings come from config.
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

# A single PhantomJS session is shared by every function in this script.
browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
browser.set_window_size(1400, 900)

# Explicit wait bound to that session: each wait.until(...) call polls its
# condition (Selenium's default interval is 0.5 s) and raises TimeoutException
# if the condition has not become true within 10 seconds.
wait = WebDriverWait(browser, 10)
def search(keyword='美食', max_retries=None):
    """Search Taobao for *keyword* and scrape the first result page.

    Loads the Taobao home page, types the keyword into the search box,
    submits the form, waits for the pager's "total" element and then calls
    get_product() for the first page of results.

    A TimeoutException from any wait restarts the whole sequence.  The retry
    is a loop rather than self-recursion, so a persistently failing page can
    no longer end in RecursionError.

    Args:
        keyword: Text typed into the search box.  Defaults to the keyword
            the script originally hard-coded.
        max_retries: Maximum number of retries after the first attempt.
            ``None`` (the default) retries indefinitely.

    Returns:
        The text of the pager's "total" element (main() extracts the page
        count from it).

    Raises:
        TimeoutException: Only when ``max_retries`` is set and every attempt
            timed out.
    """
    failures = 0
    while True:
        print("正在搜索")
        try:
            browser.get('https://www.taobao.com/')
            search_box = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#q")))
            search_button = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR,
                     "#J_TSearchForm > div.search-button > button")))
            search_box.send_keys(keyword)
            search_button.click()
            total = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     "#mainsrp-pager > div > div > div > div.total")))
            # Read the text before scraping so the result does not depend on
            # the element still being attached once get_product() returns.
            total_text = total.text
            get_product()
            return total_text
        except TimeoutException:
            failures += 1
            if max_retries is not None and failures > max_retries:
                raise
def next_page(page_number, max_retries=None):
    """Jump to result page *page_number* and scrape it.

    Clears the pager's page-number input, types the requested page, clicks
    the pager's submit button, waits until the highlighted pager item shows
    that page number and then calls get_product().

    A TimeoutException from any wait restarts the sequence.  The retry is a
    loop rather than self-recursion, so a page that never loads can no
    longer end in RecursionError.

    Args:
        page_number: Page to jump to.
        max_retries: Maximum number of retries after the first attempt.
            ``None`` (the default) retries indefinitely.

    Raises:
        TimeoutException: Only when ``max_retries`` is set and every attempt
            timed out.
    """
    failures = 0
    while True:
        print("正在翻頁", page_number)
        try:
            page_box = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     "#mainsrp-pager > div > div > div > div.form > input")))
            go_button = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR,
                     "#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit")))
            page_box.clear()
            page_box.send_keys(str(page_number))
            go_button.click()
            # The jump is complete once the active pager item displays the
            # requested page number.
            wait.until(EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR,
                 "#mainsrp-pager > div > div > div > ul > li.item.active > span"),
                str(page_number)))
            get_product()
            return
        except TimeoutException:
            failures += 1
            if max_retries is not None and failures > max_retries:
                raise
def get_product():
    """Parse every product on the current result page and store each one.

    Waits until at least one item element is present, parses the browser's
    page source with pyquery, prints a dict for every item and passes it to
    save_to_mongo().
    """
    # Descendant selector: the spaces between the class names are significant.
    item_selector = "#mainsrp-itemlist .items .item"
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, item_selector)))

    page = pq(browser.page_source)
    for entry in page(item_selector).items():
        # find() takes a CSS selector relative to the current item.
        deal_text = entry.find('.deal-cnt').text()
        product = {
            'image': entry.find('.pic .img').attr('src'),
            'price': entry.find('.price').text(),
            # Drops the last three characters of the deal-count text
            # (presumably a fixed textual suffix after the number -- confirm
            # against the live page).
            'deal': deal_text[:-3],
            'title': entry.find('.title').text(),
            'shop': entry.find('.shop').text(),
            'location': entry.find('.location').text(),
        }
        print(product)
        save_to_mongo(product)
def save_to_mongo(result):
    """Insert one product document into the MONGO_TABLE collection.

    Uses Collection.insert_one(); the legacy Collection.insert() was
    deprecated in PyMongo 3 and removed in PyMongo 4, where calling it fails
    on every document.

    Storage stays best-effort: a failure is reported on stdout, together
    with the exception, and is not propagated to the caller.

    Args:
        result: Document (dict) to store.  The driver adds an ``_id`` key to
            it in place on insertion.
    """
    try:
        db[MONGO_TABLE].insert_one(result)
    except Exception as exc:
        # Include the exception so a failing insert is diagnosable.
        print('存儲失敗->', result, exc)
    else:
        print('存儲成功->', result)
def main(max_pages=19):
    """Run the crawl: search, then walk the following result pages.

    search() scrapes page 1 and returns the pager's "total" text.  The first
    run of digits in that text is taken as the number of result pages, and
    pages 2..min(total, max_pages) are then visited with next_page().  The
    original loop ignored the parsed total and always requested pages 2-19,
    even when fewer pages existed.

    The browser session is shut down in a ``finally`` clause with quit(), so
    the PhantomJS process is released even if the crawl raises.

    Args:
        max_pages: Highest page number to visit.  The default of 19 matches
            the last page the original fixed loop requested.

    Raises:
        ValueError: If the pager text contains no digits.
    """
    try:
        total_text = search()
        match = re.search(r'\d+', total_text)
        if match is None:
            raise ValueError(
                'no page count found in pager text: {!r}'.format(total_text))
        total = int(match.group(0))
        for page in range(2, min(total, max_pages) + 1):
            next_page(page)
    finally:
        browser.quit()


if __name__ == '__main__':
    main()

# Settings pulled into the crawler by `from config import *`.

# MongoDB: host passed to MongoClient, database name, collection name.
MONGO_URL = 'localhost'
MONGO_DB = 'taobao'
MONGO_TABLE = 'product'

# Command-line switches handed to the PhantomJS process.
SERVICE_ARGS = [
    '--load-images=false',  # do not load images
    '--disk-cache=true',    # enable the disk cache
]
config.py(以上幾項設置保存在 config.py 中,由主腳本通過 `from config import *` 導入)
相關文章
相關標籤/搜索