# 抓取淘寶部分信息 (Scrape selected product information from Taobao search results)

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re
from pyquery import PyQuery as pq
# NOTE(review): presumably supplies MONGO_URL, MONGO_DB and MONGO_TABBLE,
# which are used below but not defined in this file -- confirm.
from taobaoconn import *
import pymongo

# Shared MongoDB handle and browser session used by every function below.
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]
brower = webdriver.Chrome()
# Every explicit wait on the page gives up after 10 seconds.
wait = WebDriverWait(brower, 10)


def search(max_retries=3):
    """Open Taobao, search for '美食' and scrape the first result page.

    Args:
        max_retries: How many additional attempts to make after a timeout
            before giving up.

    Returns:
        The text of the pager's "total" element on the result page.

    Raises:
        TimeoutException: If the page still has not loaded after
            ``max_retries`` retries.
    """
    attempt = 0
    while True:
        brower.get("http://www.taobao.com")
        try:
            search_box = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#q")))
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR,
                     "#J_TSearchForm > div.search-button > button")))
            search_box.send_keys('美食')
            submit.click()
            total = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     "#mainsrp-pager > div > div > div > div.total")))
            get_products()
            return total.text
        except (TimeoutException, TimeoutError):
            # WebDriverWait.until raises selenium's TimeoutException, which
            # is not a subclass of the builtin TimeoutError, so catching only
            # the builtin never triggered a retry.
            attempt += 1
            if attempt > max_retries:
                raise

def next_page(page_number, max_retries=3):
    """Jump to result page ``page_number`` and scrape its products.

    Types the page number into the pager's input box, submits it, waits
    until the pager highlights that page as active, then calls
    ``get_products()``.

    Args:
        page_number: Page to navigate to.
        max_retries: How many additional attempts to make after a timeout
            before giving up.

    Raises:
        TimeoutException: If the page still has not loaded after
            ``max_retries`` retries.
    """
    # Imported locally so this function does not depend on the file-level
    # import list.
    from selenium.common.exceptions import TimeoutException

    attempt = 0
    while True:
        try:
            page_box = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     "#mainsrp-pager > div > div > div > div.form > input")))
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR,
                     "#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit")))
            page_box.clear()
            page_box.send_keys(page_number)
            submit.click()
            # The active pager item showing the requested number is the
            # signal that the new page has been rendered.
            wait.until(
                EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR,
                     '#mainsrp-pager > div > div > div > ul > li.item.active > span'),
                    str(page_number)))
            get_products()
            return
        except (TimeoutException, TimeoutError):
            # WebDriverWait.until raises selenium's TimeoutException, not the
            # builtin TimeoutError; retry a bounded number of times instead
            # of recursing without limit.
            attempt += 1
            if attempt > max_retries:
                raise

def get_products():
    """Scrape every product on the current result page and store each one.

    Waits for at least one product element to be present, parses the page
    source with PyQuery, builds one dict per product and passes it to
    ``save_to_mongodb``.
    """
    wait.until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, "#mainsrp-itemlist .items .item")))
    doc = pq(brower.page_source)
    for item in doc('#mainsrp-itemlist .items .item').items():
        product = {
            'imag': item.find('.pic .img').attr('src'),
            'price': item.find('.price').text(),
            # Drops the last three characters of the deal count text --
            # presumably a fixed unit suffix; verify against the live page.
            'deal': item.find('.deal-cnt').text()[:-3],
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text(),
        }
        save_to_mongodb(product)


def save_to_mongodb(result):
    """Insert one product dict into the configured MongoDB collection.

    Failures are reported on stdout rather than raised, so one bad record
    does not stop the scrape.

    Args:
        result: The product document to store.
    """
    try:
        # Collection.insert() is deprecated and was removed in PyMongo 4;
        # insert_one() is the supported single-document API.
        db[MONGO_TABBLE].insert_one(result)
    except Exception as exc:
        # Include the exception so storage failures can be diagnosed.
        print('出現存儲異常', exc)
    else:
        print('保存成功')

def main():
    """Run the scrape: search, read the page count, then walk the pages.

    The first page is scraped by ``search()`` itself; the remaining pages,
    2 through the total, are visited with ``next_page``. The browser is
    closed when the run finishes or fails.

    Raises:
        ValueError: If no number can be found in the pager's total text.
    """
    try:
        total_text = search()
        match = re.search(r'(\d+)', total_text)
        if match is None:
            raise ValueError(
                'no page count found in pager text: %r' % (total_text,))
        total = int(match.group(1))
        print(total)
        # The previous range(2, 2) was empty, so no further page was ever
        # visited; iterate through the last page inclusive.
        for page in range(2, total + 1):
            next_page(page)
    finally:
        # Release the Chrome session even when scraping fails part-way.
        brower.quit()


if __name__ == "__main__":
    main()

# 相關文章 (Related articles)
# 相關標籤/搜索 (Related tags / search)