1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # @Time : 2018/7/12 21:10
4 # @Author : chenxiaowei
5 # @Email : chen1020xiaowei@163.com
6 # @File : vip.py
7
8 from pymongo.errors import ConfigurationError 9 from selenium import webdriver 10 from selenium.common.exceptions import TimeoutException, WebDriverException 11 from selenium.webdriver.common.by import By 12 from selenium.webdriver.support.ui import WebDriverWait 13 from selenium.webdriver.support import expected_conditions as EC 14 from pyquery import PyQuery 15 from urllib3.exceptions import NewConnectionError, MaxRetryError 16 from config_vip import *
17 from multiprocessing import Pool 18 from selenium.webdriver.chrome.options import Options 19 import os 20 import pymongo 21 import requests 22 import hashlib 23 import time 24
# --- Module-level side effects: build the shared WebDriver and MongoDB handles.
# browser_method comes from config_vip: 0 = visible Chrome, 1 = PhantomJS,
# anything else = headless Chrome.
if browser_method == 0:
    browser = webdriver.Chrome()
    print('你選擇使用Chrome()方法...')
elif browser_method == 1:
    # NOTE(review): PhantomJS support is deprecated in recent Selenium
    # releases — confirm the installed Selenium still ships this driver.
    browser = webdriver.PhantomJS(service_args=['--load-images=false', '--disk-cache=false'])
    print('你選擇使用PhantomJS()方法...')
else:
    chrome_option = Options()
    chrome_option.add_argument('--headless')
    browser = webdriver.Chrome(options=chrome_option)
    print('你選擇使用Headless()方法...')

# Fixed window size so lazy-loading/scrolling behaves the same on every run.
browser.set_window_size(1920, 1080)
wait = WebDriverWait(browser, 10)

try:
    client = pymongo.MongoClient(mongo_url)
    database = client[mongo_database]
except (TypeError, ConfigurationError):
    # Both failure modes printed the same message in two duplicate handlers;
    # merged into a single tuple-catch with identical behavior.
    print('數據庫建立失敗'.center(130, '*'))
48
# the shared database object is created above
50
def drop_down_scrollbar():
    """Scroll the page down step by step so lazily-loaded goods render.

    Performs (total_times - 1) scroll steps; step k moves the document's
    scrollTop to k * size pixels, pausing one second between steps.
    """
    for step in range(1, total_times):
        browser.execute_script(
            "var q=document.documentElement.scrollTop={}".format(step * size))
        time.sleep(1)
59
60
def get_search(search_word):
    """Open the site, search for *search_word*, scrape result page 1.

    Returns the total page count as text (from the pagination widget) so
    main() can iterate the remaining pages, or None when the page timed
    out / the driver failed.
    """
    browser.get(main_url)
    time.sleep(3)
    try:
        # Wait until both the search box and the search button are usable.
        search_bar = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#J-search > div.c-search-form > input')))
        enter_button = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, '#J-search > div.c-search-form > a > span')))
        # Type the keyword and trigger the search.
        search_bar.send_keys(search_word)
        time.sleep(1)
        enter_button.click()
        time.sleep(5)
        drop_down_scrollbar()
        # Pagination element whose text is the total number of result pages.
        pages = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#J_pagingCt > a:nth-child(6)')))
        print('搜索到{}共{}頁的內容'.format(search_word, pages.text))
        time.sleep(3)
        print('開始獲取{}第{}頁的內容...'.format(search_word, str(1)))
        get_page_detail(search_word)
        print('完成獲取{}第{}頁的內容...'.format(search_word, str(1)))
        return pages.text
    except TimeoutException as e:
        # BUG FIX: the original printed TimeoutException.args — the *class*
        # attribute (a descriptor repr), never the actual error details.
        print('網頁未加載完成,沒法搜索信息!', e.args)
    except WebDriverException as e:
        print(e.args)
93
94
def get_next_page(search_word, page):
    """Navigate to result page *page* for *search_word* and scrape it.

    Builds the URL from the url_search template in config_vip, scrolls so
    lazy content loads, then delegates parsing to get_page_detail().
    """
    try:
        url1 = url_search.format(search_word, str(page))
        print('開始獲取{}第{}頁的內容...\n'.format(search_word, page))
        browser.get(url1)
        drop_down_scrollbar()
        get_page_detail(search_word)
        print('完成獲取{}第{}頁的內容...\n'.format(search_word, page))
    except TimeoutException as e:
        # BUG FIX: print the caught instance's args, not the class attribute.
        print('跳轉網頁超時!', e.args)
    except WebDriverException as e:
        print(e.args)
111
112
def get_page_detail(search_word):
    """Parse the browser's current page and store every goods item.

    For each '.goods-list-item' on the page: build a record (title, prices,
    discount, brand, image/detail URLs), download the image, and save the
    record to MongoDB under the *search_word* collection.
    """
    try:
        source = browser.page_source
        html = PyQuery(source)
        print('解析數據成功'.center(130, '*'))
        good_items = html('.goods-list .goods-list-item').items()
        for item in good_items:
            goods = {
                # NOTE(review): split('\n')[1] assumes the title block always
                # contains a newline — confirm against the live markup.
                'good-title': item.find('.goods-title-info ').text().split('\n')[1],
                'good-sells-price': item.find('.goods-info .goods-price-wrapper .goods-sells-price .price').text(),
                'good-market-price': item.find('.goods-info .goods-market-price').text()[2:],
                'good-discount': item.find('.goods-info .goods-discount').text(),
                'good-brand': item.find('.goods-info .goods-brand').text(),
                # src/href are protocol-relative ('//...'), so prepend 'http:'.
                'image': 'http:{}'.format(item.find('.goods-slide .goods-image-link .J_img').attr('src')),
                'detail': 'http:{}'.format(item.find(' .goods-slide .goods-image-link').attr('href'))
            }
            image_url = goods['image']
            content = get_image_content(image_url)
            if content:
                # Only save items whose image could actually be fetched.
                download_image(content, search_word, image_url)
                save_to_mongodb(goods, search_word)
    except TimeoutException as e:
        # BUG FIX: print the caught instance's args, not the class attribute.
        print('爬取網頁超時!', e.args)
142
143
def save_to_mongodb(goods, database_table):
    """Insert one goods record into the MongoDB collection *database_table*.

    Prints the record on success; a failed insert only logs — one bad
    record must not stop the whole crawl.
    """
    try:
        # NOTE(review): Collection.insert() is deprecated in pymongo 3+
        # (insert_one) — confirm the installed pymongo version before changing.
        if database[database_table].insert(goods):
            print('存儲數據成功'.center(130, '*'))
            print(goods, '\n')
    except Exception as e:
        # BUG FIX: the original printed Exception.args (the class attribute);
        # bind the instance to report the actual failure.
        print('寫入數據出錯!', e.args)
154
155
def get_image_content(url):
    """Fetch *url* and return the response body as bytes.

    Returns False on a network failure, and None (implicitly) on a non-200
    status — callers only test truthiness, so both mean "skip this image".
    """
    try:
        response = requests.get(url)
    except requests.RequestException as e:
        # BUG FIX: the original caught the builtin ConnectionError and raw
        # urllib3 NewConnectionError/MaxRetryError, none of which escape
        # requests.get() — requests wraps them in RequestException
        # subclasses, so connection failures crashed the crawler.
        print(e.args)
        return False
    if response.status_code == 200:
        return response.content
    print('請求圖片連接失敗!')
173
def download_image(content, folder, image_url):
    """Save image bytes under a date/keyword directory.

    The file name is the MD5 hex digest of the content, so identical images
    downloaded twice overwrite the same file instead of duplicating.
    *image_url* is only used for the success log line.
    """
    time_stamp = time.strftime("%Y%m%d", time.localtime())
    path = file_path.format(mongo_database, time_stamp, folder)
    # BUG FIX: the exists()/makedirs() pair was a TOCTOU race under the
    # multiprocessing pool; exist_ok=True makes creation atomic-safe.
    os.makedirs(path, exist_ok=True)
    filename = hashlib.md5(content).hexdigest()
    with open(file_type.format(path, filename), 'wb') as f:
        # the redundant f.close() inside the with-block was removed
        f.write(content)
    print(' {} 下載圖片成功'.format(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())).center(125, '*'))
    print(filename, image_url)
191
def main(search_word):
    """Crawl every result page for one keyword (pool worker entry point).

    Page 1 is scraped inside get_search(); pages 2..min(total, end) are
    fetched by get_next_page().
    """
    pages_text = get_search(search_word)
    if not pages_text:
        # BUG FIX: get_search() returns None after a timeout, and the
        # original int(None) raised TypeError and killed the worker.
        return
    try:
        pages = int(pages_text)
    except ValueError:
        # pagination text was not a number — nothing further to crawl
        return
    # Cap at `end` from config: later pages are often empty and error-prone.
    if pages >= end:
        pages = end
    page = 2
    try:
        while page <= pages:
            get_next_page(search_word, page)
            page += 1
    except TimeoutException as e:
        # BUG FIX: print the caught instance's args, not the class attribute.
        print(e.args)
204
205
if __name__ == '__main__':
    # Two worker processes, one keyword each.
    pool = Pool(processes=2)
    # FIX: the original passed [keyword for keyword in keywords] — a
    # pointless identity copy; map over the list directly.
    pool.map(main, keywords)
    pool.close()
    # Wait for all workers to finish before cleanup.
    pool.join()
    # Kill leftover chromedriver/chrome processes (Windows-only commands).
    os.system('taskkill /im chromedriver.exe /F')
    os.system('taskkill /im chrome.exe /F')
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # @Time : 2018/7/12 23:48
4 # @Author : chenxiaowei
5 # @Email : chen1020xiaowei@163.com
6 # @File : config_vip.py
# MongoDB connection address and database name.
mongo_url = 'localhost'
mongo_database = 'vip'

# Landing page the crawler opens first.
main_url = 'https://www.vip.com/'

# Scrollbar behaviour: number of scroll steps and pixels per step.
total_times = 16
size = 500

# WebDriver selection: 0 = Chrome, 1 = PhantomJS, other = headless Chrome.
browser_method = 2

# First and last result page to crawl; later pages are often empty and
# tend to raise errors, so the crawl stops at `end`.
start = 1
end = 45

# URL template for search result pages (keyword, page number).
url_search = 'https://category.vip.com/suggest.php?keyword={}&page={}&count=100&suggestType=brand#catPerPos'

# Image save location: directory template and file-name template.
file_path = 'H:/Python_download/{}/{}/image/{}/'
file_type = '{}{}.jpg'

# Search keywords, one crawl per keyword.
keywords = ['蘋果', '雪梨', '香蕉']