Python爬蟲學習第一天--利用selenium和chromedriver驅動瀏覽器爬取網頁

 1 #!/usr/bin/env python
 2 # -*- coding: utf-8 -*-
 3 # @Time : 2018/7/12 21:10
 4 # @Author : chenxiaowei
 5 # @Email : chen1020xiaowei@163.com
 6 # @File : vip.py
 7 
 8 from pymongo.errors import ConfigurationError  9 from selenium import webdriver  10 from selenium.common.exceptions import TimeoutException, WebDriverException  11 from selenium.webdriver.common.by import By  12 from selenium.webdriver.support.ui import WebDriverWait  13 from selenium.webdriver.support import expected_conditions as EC  14 from pyquery import PyQuery  15 from urllib3.exceptions import NewConnectionError, MaxRetryError  16 from config_vip import *
 17 from multiprocessing import Pool  18 from selenium.webdriver.chrome.options import Options  19 import os  20 import pymongo  21 import requests  22 import hashlib  23 import time  24 
# Select the WebDriver backend from the `browser_method` flag in config_vip:
# 0 = visible Chrome, 1 = PhantomJS, anything else = headless Chrome.
if browser_method == 0:
    browser = webdriver.Chrome()
    print('你選擇使用Chrome()方法...')
elif browser_method == 1:
    # NOTE(review): webdriver.PhantomJS was removed in newer Selenium releases —
    # confirm the pinned selenium version still provides it.
    browser = webdriver.PhantomJS(service_args=['--load-images=false', '--disk-cache=false'])
    print('你選擇使用PhantomJS()方法...')
else:
    chrome_option = Options()
    chrome_option.add_argument('--headless')
    browser = webdriver.Chrome(options=chrome_option)
    print('你選擇使用Headless()方法...')

# Fixed window size so the page lays out (and CSS selectors resolve) the same
# in headless and headed runs; one shared 10-second explicit wait for lookups.
browser.set_window_size(1920, 1080)
wait = WebDriverWait(browser, 10)
 40 try:  41     client = pymongo.MongoClient(mongo_url)  42     database = client[mongo_database]  43 except TypeError:  44     print('數據庫建立失敗'.center(130, '*'))  45 except ConfigurationError:  46     print('數據庫建立失敗'.center(130, '*'))  47 
 48 
 49 # 實現數據庫對象
 50 
 51 def drop_down_scrollbar():  52     # 定義下拉滾動條方法
 53     times = 1
 54     while times < total_times:  55         js = "var q=document.documentElement.scrollTop={}".format(times * size)  56  browser.execute_script(js)  57         time.sleep(1)  58         times += 1
 59 
 60 
 61 def get_search(search_word):  62     # 定義get_()search方法
 63     url = main_url  64  browser.get(url)  65     # 打開url,得到內容
 66     time.sleep(3)  67     try:  68         search_bar = wait.until(  69             EC.presence_of_element_located((By.CSS_SELECTOR, '#J-search > div.c-search-form > input')))  70         enter_button = wait.until(  71             EC.element_to_be_clickable((By.CSS_SELECTOR, '#J-search > div.c-search-form > a > span')))  72         # 肯定輸入框和搜索按鈕可用
 73  search_bar.send_keys(search_word)  74         time.sleep(1)  75  enter_button.click()  76         # 輸入關鍵字並點擊搜索
 77         time.sleep(5)  78  drop_down_scrollbar()  79         pages = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#J_pagingCt > a:nth-child(6)')))  80         # 得到總頁數,main()中做爲for循環參數
 81         print('搜索到{}共{}頁的內容'.format(search_word, pages.text))  82         time.sleep(3)  83         print('開始獲取{}第{}頁的內容...'.format(search_word, str(1)))  84  get_page_detail(search_word)  85         print('完成獲取{}第{}頁的內容...'.format(search_word, str(1)))  86         return pages.text  87     except TimeoutException:  88         print('網頁未加載完成,沒法搜索信息!', TimeoutException.args)  89         pass
 90     except WebDriverException:  91         print(WebDriverException.args)  92         pass
 93 
 94 
 95 def get_next_page(search_word, page):  96     # 定義get_next_page()方法進行跳轉
 97     try:  98         url1 = url_search.format(search_word, str(page))  99         # 找出網頁規律,定個模板
100         print('開始獲取{}第{}頁的內容...\n'.format(search_word, page)) 101  browser.get(url1) 102  drop_down_scrollbar() 103  get_page_detail(search_word) 104         print('完成獲取{}第{}頁的內容...\n'.format(search_word, page)) 105     except TimeoutException: 106         print('跳轉網頁超時!', TimeoutException.args) 107         pass
108     except WebDriverException: 109         print(WebDriverException.args) 110         pass
111 
112 
def get_page_detail(search_word):
    """Parse the current browser page and persist every goods item.

    Extracts title/prices/discount/brand/image/detail-link per item with
    PyQuery, downloads the image, and stores the record in the collection
    named after `search_word`.
    """
    try:
        html = PyQuery(browser.page_source)
        print('解析數據成功'.center(130, '*'))
        for item in html('.goods-list .goods-list-item').items():
            goods = {
                # NOTE(review): split('\n')[1] assumes the title cell always
                # has at least two lines — confirm against live markup.
                'good-title': item.find('.goods-title-info ').text().split('\n')[1],
                'good-sells-price': item.find('.goods-info .goods-price-wrapper .goods-sells-price .price').text(),
                # Strip the leading currency marker (first two characters).
                'good-market-price': item.find('.goods-info .goods-market-price').text()[2:],
                'good-discount': item.find('.goods-info .goods-discount').text(),
                'good-brand': item.find('.goods-info .goods-brand').text(),
                # The site serves protocol-relative URLs; prefix a scheme.
                'image': 'http:{}'.format(item.find('.goods-slide .goods-image-link .J_img').attr('src')),
                'detail': 'http:{}'.format(item.find(' .goods-slide .goods-image-link').attr('href')),
            }
            image_url = goods['image']
            content = get_image_content(image_url)
            if content:
                # Only keep goods whose image could actually be fetched.
                download_image(content, search_word, image_url)
                save_to_mongodb(goods, search_word)
    except TimeoutException as e:
        # Bug fix: report the instance's args, not the class attribute.
        print('爬取網頁超時!', e.args)
142 
143 
def save_to_mongodb(goods, database_table):
    """Insert one goods dict into the `database_table` collection.

    The collection name is the search keyword, so every keyword ends up
    in its own collection.
    """
    try:
        # insert_one() replaces the deprecated Collection.insert().
        if database[database_table].insert_one(goods):
            print('存儲數據成功'.center(130, '*'))
            print(goods, '\n')
    except Exception as e:
        # Log and continue: one failed insert must not stop the crawl.
        # (Bug fix: bind the instance; Exception.args is a descriptor.)
        print('寫入數據出錯!', e.args)
154 
155 
156 def get_image_content(url): 157     try: 158         response = requests.get(url) 159         if response.status_code == 200: 160             return response.content 161         else: 162             print('請求圖片連接失敗!') 163     except ConnectionError: 164         print(ConnectionError.args) 165         return False 166     except NewConnectionError: 167         print(NewConnectionError.args) 168         return False 169     except MaxRetryError: 170         print(MaxRetryError.args) 171         return False 172 
173 
def download_image(content, folder, image_url):
    """Write image bytes to disk under a per-day, per-keyword directory.

    The file name is the MD5 hex digest of the content, so identical
    images are de-duplicated automatically.
    """
    time_stamp = time.strftime("%Y%m%d", time.localtime())
    path = file_path.format(mongo_database, time_stamp, folder)
    # exist_ok avoids the racy exists()-then-makedirs() pattern.
    os.makedirs(path, exist_ok=True)
    filename = hashlib.md5(content).hexdigest()
    with open(file_type.format(path, filename), 'wb') as f:
        # `with` closes the file; the explicit f.close() was redundant.
        f.write(content)
    print(' {} 下載圖片成功'.format(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())).center(125, '*'))
    print(filename, image_url)
191 
def main(search_word):
    """Crawl all result pages for one keyword.

    Page 1 is scraped by get_search() (which also returns the page count);
    the remaining pages are fetched by direct URL jumps, capped at `end`
    because trailing pages are often empty.
    """
    # NOTE(review): get_search() returns None on load failure, which makes
    # int() raise here — confirm that aborting the worker is intended.
    pages = int(get_search(search_word))
    if pages >= end:
        pages = end
    try:
        for page in range(2, pages + 1):
            get_next_page(search_word, page)
    except TimeoutException as e:
        # Bug fix: print the caught instance's args, not the class's.
        print(e.args)
204 
205 
if __name__ == '__main__':
    # Two worker processes, each crawling one keyword at a time.
    # NOTE(review): each worker re-imports this module and therefore opens
    # its own browser at import time — confirm that is intended.
    pool = Pool(processes=2)
    # `list(keywords)` replaces the pointless identity comprehension.
    pool.map(main, list(keywords))
    pool.close()
    pool.join()
    # Windows-only cleanup: kill leftover driver/browser processes.
    os.system('taskkill /im chromedriver.exe /F')
    os.system('taskkill /im chrome.exe /F')
 1 #!/usr/bin/env python
 2 # -*- coding: utf-8 -*-
 3 # @Time : 2018/7/12 23:48
 4 # @Author : chenxiaowei
 5 # @Email : chen1020xiaowei@163.com
 6 # @File : config_vip.py
# MongoDB server address and database name.
mongo_url = 'localhost'
mongo_database = 'vip'

# Entry page of the site being crawled.
main_url = 'https://www.vip.com/'

# Number of scroll steps and pixel height of each step
# (consumed by drop_down_scrollbar in vip.py).
total_times = 16
size = 500

# WebDriver backend: 0 = visible Chrome, 1 = PhantomJS, else headless Chrome.
browser_method = 2

# First and last result page to crawl; pages past `end` are often empty
# and tend to raise exceptions, so the crawl is capped there.
start = 1
end = 45

# URL template for jumping straight to a result page: keyword, page number.
url_search = 'https://category.vip.com/suggest.php?keyword={}&page={}&count=100&suggestType=brand#catPerPos'

# Image save-directory template (db name, date stamp, keyword) and the
# file-path/extension template built from it.
file_path = 'H:/Python_download/{}/{}/image/{}/'
file_type = '{}{}.jpg'

# Keywords to crawl; each is handled by one pool worker.
keywords = ['蘋果', '雪梨', '香蕉']
相關文章
相關標籤/搜索