# -*- coding: utf-8 -*- from __future__ import division from selenium import webdriver import time from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC import requests from threading import Thread from pyquery import PyQuery as pq import chardet import copy import xlwt import os import mPing import datetime import xlwt from xlrd import open_workbook now_time = time.strftime('%H-%M-%S', time.localtime(time.time())) print now_time # print chardet.detect(now_time) # print chardet.detect(time_now_time) #xls_name = ("京東爬蟲數據.xls").decode("utf-8") xls_name = ("京東爬蟲數據"+str(now_time)+".xls").decode("utf-8") #print type(xls_name) #print "京東爬蟲數據"+str(now_time)+".xls" title = ["連接", "名稱", "價格", "曬圖", "好評", "中評", "差評", "所有評價"] urllist = ["https://item.jd.com/11936238.html", "https://item.jd.com/11841674.html" ] URLSource = "京東URL.txt".decode('utf-8') if os.path.isfile(URLSource): print "發現URL文件,準備開始爬蟲".decode('utf-8') else: print "親!!! 當前目錄下的url文件: \"".decode('utf-8')+URLSource+"\" 不存在,請添加後再運行".decode('utf-8') exit(1) def msleep1(): time.sleep(1) def msleep2(): print "...2", time.sleep(1) print "...1", time.sleep(1) print "...0" def msleep3(): print "5", time.sleep(1) print "...4", time.sleep(1) print "...3", time.sleep(1) print "...2", time.sleep(1) print "...1", time.sleep(1) print "...0" def warnningtext(): return "這裏沒法正確獲取數據(偶爾網速問題會影響一兩個數據),請手動檢查,若是是代碼問題請聯繫開發修改".decode("utf-8") def cannotgetdataprint(text): print ("沒法獲取"+text+" 請手動檢查一下而後聯繫開發人員").decode('utf-8') def mprint(str): #print "", print "############# " + str.decode('utf-8') + " #############" def debugprint(str): print "", #不換行空輸出 "" 後面加 , print "debugprint@@@ " + str.decode('utf-8') def totwrite(str): return str.decode('utf-8') # mPing.mNetPing('jd.com') # chromeOptions = webdriver.ChromeOptions() # prefs = {"profile.managed_default_content_settings.images":2} # chromeOptions.add_experimental_option("prefs",prefs) # driver = webdriver.Chrome(chrome_options=chromeOptions) prefs = {"profile.managed_default_content_settings.images":2} option = webdriver.ChromeOptions() option.add_argument("test-type")#不顯示警告 option.add_experimental_option("prefs",prefs)#不顯示圖片 global timesurl timesurl = 1 global webdriver_chrome #webdriver_chrome = webdriver.PhantomJS()#phantomjs沒法加載ajax 因此這裏不能用 仍是要用chrome來模擬動態的加載 webdriver_chrome = webdriver.Chrome(chrome_options=option) #webdriver_chrome.set_window_size(2000,2000) def isUrlBefore(): pass#打開url後地址是否被跳轉 若是跳轉那就跳過該地址並寫入警告 def isString(isstr, data): if isstr in str(data.encode("utf-8")): return True else: return False def openweb(url): global starttime global driver_wait global isOffsale COUNTINUE = False SKIP = 1 TIAOZHUAN = 2 LOADERROR = 3 FATALERROR = 4 mprint("努力加載連接中,請耐心等待") try: try:#獲取源碼進行判斷 respone = requests.get(url) #正確打開鏈接 isOffsale = False #初始化設置爲不下櫃 if respone.status_code == 200:#正確加載價格頁面包括下櫃的頁面 if "商品評價" in str(respone.text.encode("utf-8")):#說明頁面正常訪問到商品頁面 不然可能被跳轉了 # print respone.text isOffsale = False if "商品已下櫃" in str(respone.text.encode("utf-8")): isOffsale = True else: return TIAOZHUAN #說明頁面不是價格頁面 被跳轉了? else:#沒法打開鏈接 return LOADERROR#狀態碼不是200說明訪問有問題 except Exception, e: print Exception, e#沒法獲取源碼 return FATALERROR #如下代碼應該不會被執行 webdriver_chrome.get(url) # mprint("獲取當前地址") if "?c" in getcurrenturl():#有了上面的if "商品評價" in判斷後這段代碼應該不會被執行到 mprint("地址已經被跳轉") return SKIP driver_wait = WebDriverWait(webdriver_chrome, 10) return COUNTINUE except Exception: mprint("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!請注意,連接有問題 沒法打開 程序可能中止!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") print url print getcurrenturl() return SKIP finally: debugprint("打印url") def get_element_bycssselector(css_selector): element = driver_wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, css_selector))) # print element.text return element def get_datanum_bycssselectorlist(css_selector_list, text): for css_selector in css_selector_list: try: # print css_selector element = driver_wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, css_selector))) data_num = element.get_attribute('data-num') if isString(text, element.text): print element.text + ":" + str(data_num) # mprint ("顯示好評") return data_num else: mprint("沒法獲取") except: pass return warnningtext() def get_element_byxpathlist(xpath_list, text): for xpath in xpath_list: try: element = driver_wait.until(EC.element_to_be_clickable((By.XPATH, xpath))) # print element.text if isString(text, element.text): print element.text return element else: mprint("沒法獲取xpath以下") print xpath except: mprint(xpath) pass return None # def try_element(element): # try: # element # except: # pass def getname(): debugprint("start find name btn") try: myname = webdriver_chrome.find_element_by_class_name('sku-name') mprint("1名稱:") print myname.text return myname.text except Exception: pass try: myname = webdriver_chrome.find_element_by_css_selector('#name > h1') mprint("2名稱:")#生鮮 書籍 print myname.text return myname.text except Exception: pass try: myname = webdriver_chrome.find_element_by_css_selector('#name') mprint("3名稱:")#生鮮 書籍 print myname.text return myname.text except Exception: mprint("第 3次 抓取商品名稱失敗") return warnningtext() def getprice(): debugprint("start getprice") try: myprice = driver_wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.summary.summary-first > div > div.summary-price.J-summary-price > div.dd > span'))) mprint("1價格:") # print myprice.text finalprice = myprice.text.encode ('utf-8').replace ('¥', '') if finalprice == "": msleep1() finalprice = myprice.text.encode ('utf-8').replace ('¥', '') if finalprice == "": msleep2 () finalprice = myprice.text.encode ('utf-8').replace ('¥', '') if finalprice == "": msleep3 () finalprice = myprice.text.encode ('utf-8').replace ('¥', '') print finalprice return finalprice except Exception:#估計下架 作下架的抓取 pass try: # 生鮮 書籍 抓取價格 myprice = driver_wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#jd-price"))) # 生鮮 可用 # myprice = webdriver_chrome.find_element_by_xpath("/html/body/div[7]/div/div[2]/div[3]/div/div[1]/div[2]/span/span[2]") mprint("2價格:") # print myprice.text finalprice = myprice.text.encode ('utf-8').replace ('¥', '') if finalprice == "": msleep1 () finalprice = myprice.text.encode ('utf-8').replace ('¥', '') if finalprice == "": msleep2 () finalprice = myprice.text.encode ('utf-8').replace ('¥', '') if finalprice == "": msleep3 () finalprice = myprice.text.encode ('utf-8').replace ('¥', '') print finalprice return finalprice except Exception: # 估計下架 作下架的抓取 pass try: # 生鮮 書籍 抓取價格 myprice = driver_wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.summary-price.J-summary-price > div > div.dd > span > span"))) # 生鮮 可用 mprint("3價格:") # print myprice.text finalprice = myprice.text.encode ('utf-8').replace ('¥', '') if finalprice == "": msleep1 () finalprice = myprice.text.encode ('utf-8').replace ('¥', '') if finalprice == "": msleep2 () finalprice = myprice.text.encode ('utf-8').replace ('¥', '') if finalprice == "": msleep3 () finalprice = myprice.text.encode ('utf-8').replace ('¥', '') print finalprice return finalprice except Exception: # 估計下架 作下架的抓取 pass try: # 下架的抓取 前面判斷了下架 這裏基本上不會執行了 mprint("4下架:") soldout = webdriver_chrome.find_element_by_class_name('itemover-tip') # 抓下櫃 下架 「該商品已下櫃,歡迎挑選其餘商品!」 print soldout.text return soldout.text except Exception: mprint("抓不到價格 也不是下架 請檢查") return warnningtext() def scrolldown(): debugprint("準備開始滾動500") webdriver_chrome.execute_script("window.scrollBy(0,500)") debugprint("已向下滾動500") def clickcommentbtn(): xpath1 = '//*[@id="detail"]/div[1]/ul/li[5]' xpath2 = '//*[@id="detail"]/div[1]/ul/li[4]' # xpath3 = '#comments-list > div:nth-child(1) > div:nth-child(1) > ul:nth-child(1) > li:nth-child(3)' btn = get_element_byxpathlist([xpath1, xpath2], "商品評價") if btn is not None: try: btn.click() # mprint("xpath點擊") except Exception, e: mprint("btn非空 不過點擊失敗了 通常不會這樣的 報錯是不是:Element is not clickable at point (697, 299). Other element would receive the click") print Exception, e else: # pass#其餘判斷 基本上不會到這裏 css_sele1 = '# detail > div.tab-main.large > ul > li:nth-child(4)' css_sele2= '#detail > div.tab-main.large > ul > li.current' try: get_element_bycssselector(css_sele1).click() mprint("經過csssele獲取到") print css_sele1 except: try: get_element_bycssselector(css_sele2).click() mprint("經過csssele獲取到") print css_sele1 except: mprint("實在找不到 聯繫開發 程序可能終止") """ try:#1#detail > div.tab-main.large > ul > li.current > s mysumcommentbtn = webdriver_chrome.find_element_by_xpath ('//*[@id="detail"]/div[1]/ul/li[5]') mprint("1點擊") print mysumcommentbtn.text, # 三個按鈕的連接要用其餘的(運動戶外類) # mprint("運動戶外類?") if "商品評價" in str(mysumcommentbtn.text.encode("utf-8")): mysumcommentbtn.click() mprint("~~~~~~點擊了按鈕") # 這句有問題 return True else: mprint("找不到按鈕 商品評價 繼續尋找2") except: pass try:#2 mysumcommentbtn = webdriver_chrome.find_element_by_xpath ('//*[@id="detail"]/div[1]/ul/li[4]') mprint("2點擊") print mysumcommentbtn.text if "商品評價" in str(mysumcommentbtn.text.encode("utf-8")): mysumcommentbtn.click() mprint("~~~~~~點擊了評論總量按鈕") return True else: mprint("找到按鈕 不是商品評價 繼續尋找3") except: mprint("2點擊找不到繼續下一步") pass try:#3 css_sele = '# detail > div.tab-main.large > ul > li:nth-child(4)' # 香蕉 # http: // item.jd.com / 11461683.html mysumcommentbtn = driver_wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, css_sele))) mprint("3點擊") print mysumcommentbtn.text if "商品評價" in str(mysumcommentbtn.text.encode("utf-8")): mysumcommentbtn.click() mprint("~~~~~~點擊了評論總量按鈕") return True except: mprint("找不到按鈕 商品評價 繼續尋找4 ") pass try:#4 css_sele = '#detail-tab-comm' # 書籍類比較多 mysumcommentbtn = driver_wait.until (EC.element_to_be_clickable ((By.CSS_SELECTOR, css_sele))) mprint("4點擊") print mysumcommentbtn.text if "商品評價" in str(mysumcommentbtn.text.encode("utf-8")): mysumcommentbtn.click() mprint("~~~~~~點擊了評論總量按鈕") return True except: mprint("找不到按鈕 商品評價 繼續尋找5") pass try:#5 css_sele = '#detail > div.tab-main.large > ul > li.current' # 香蕉 書籍 # http: // item.jd.com / 11461683.html mysumcommentbtn = driver_wait.until (EC.element_to_be_clickable ((By.CSS_SELECTOR, css_sele))) mprint("5點擊") print mysumcommentbtn.text if "商品評價" in str(mysumcommentbtn.text.encode("utf-8")): mysumcommentbtn.click() mprint("~~~~~~點擊了評論總量按鈕") return True else: mprint("第五次也找不到 只能手動找了") print getcurrenturl() return warnningtext() except: mprint("沒法找到商品評價按鈕 請聯繫開發 提供url:") print getcurrenturl() return warnningtext() """ def getshowpicnum(): css_sele1 = '#comment > div.mc > div.J-comments-list.comments-list.ETab > div.tab-main.small > ul > li:nth-child(2)' css_sele2 = '#comments-list > div.mt > div > ul > li:nth-child(2)' for i in range(3):#循環查找3次 pic_num = get_datanum_bycssselectorlist ([css_sele1, css_sele2], "曬圖") if pic_num is not None: # mprint(pic_num) return pic_num else: # pass mprint("shaitu") # print u"第"+str(i+1)+u"次沒找到,準備開始第"+str(i+2)+u"次查找" """ global data_num global myshowpic try:#comments-list > div.mt > div > ul > li:nth-child(2) # comment > div.mc > div.J-comments-list.comments-list.ETab > div.tab-main.small > ul > li:nth-child(2) css_sele = '#comment > div.mc > div.J-comments-list.comments-list.ETab > div.tab-main.small > ul > li:nth-child(2)' myshowpic = driver_wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, css_sele))) data_num = myshowpic.get_attribute('data-num') mprint("1曬圖") print myshowpic.text, if "曬圖" in str(myshowpic.text.encode("utf-8")): debugprint("第一次判斷正確 是曬圖按鈕") if data_num is not None: return data_num else: mprint("曬圖的值沒有正確加載 5s後再次驗證") msleep3() data_num = myshowpic.get_attribute ('data-num') if data_num is not None: mprint("找到曬圖值") print myshowpic.text return data_num else: mprint ("曬圖的值沒有正確加載 5s後再次驗證") msleep3 () msleep3 () data_num = myshowpic.get_attribute ('data-num') if data_num is not None: mprint ("找到曬圖值") print myshowpic.text return data_num else:#屢次查找沒法找到值 mprint("#屢次查找沒法找到值") return warnningtext() else: debugprint("第一次判斷錯誤 按鈕找到不是曬圖 聯繫開發提供截圖") except: debugprint("第一次判斷沒找到按鈕 開始第二次") try: css_sele = '#comments-list > div.mt > div > ul > li:nth-child(2)' myshowpic = driver_wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, css_sele))) mprint("2曬圖") print myshowpic.text if "曬圖" in str(myshowpic.text.encode("utf-8")): debugprint("第2次判斷正確 是曬圖按鈕") if myshowpic.get_attribute('data-num') is not None: return myshowpic.get_attribute('data-num') else: mprint ("曬圖的值沒有正確加載 5s後再次驗證") msleep3 () data_num = myshowpic.get_attribute ('data-num') if data_num is not None: mprint ("找到曬圖值") print myshowpic.text return data_num else: mprint ("曬圖的值沒有正確加載 5s後再次驗證") msleep3 () msleep3 () data_num = myshowpic.get_attribute ('data-num') if data_num is not None: mprint ("找到曬圖值") print myshowpic.text return data_num else: # 屢次查找沒法找到值 return warnningtext () else: debugprint("第2次判斷錯誤 按鈕找到不是曬圖 聯繫開發提供截圖") except: debugprint("第2次判斷沒找到按鈕 聯繫開發") return warnningtext() """ def totalcomment(): css_sele1 = '#comment > div.mc > div.J-comments-list.comments-list.ETab > div.tab-main.small > ul > li.current' css_sele2 = '#comments-list > div.mt > div > ul > li.ui-switchable-item.trig-item.curr' return get_datanum_bycssselectorlist([css_sele1, css_sele2], "所有評價") """ try: css_sele = '#comment > div.mc > div.J-comments-list.comments-list.ETab > div.tab-main.small > ul > li.current' mypositivecomment = driver_wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, css_sele))) # mypositivecomment = webdriver_chrome.find_element_by_css_selector("#comments-list > div:nth-child(1) > div:nth-child(1) > ul:nth-child(1) > li:nth-child(3)") data_num = mypositivecomment.get_attribute('data-num') mprint("1所有評價") print mypositivecomment.text, data_num if "所有評價" in str(mypositivecomment.text.encode("utf-8")): debugprint("第1次判斷正確 是所有評價按鈕") if data_num is not None: return data_num else: mprint("所有評價的值沒有正確加載 請手動查找") return cannotgetdataprint(mypositivecomment.text) else: debugprint("第1次判斷錯誤 按鈕找到不是所有評價 聯繫開發提供截圖") except: debugprint("第一次抓所有評價失敗 繼續第二次") pass try: css_sele = '#comments-list > div.mt > div > ul > li.ui-switchable-item.trig-item.curr' mypositivecomment = driver_wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, css_sele))) # mypositivecomment = webdriver_chrome.find_element_by_css_selector("#comments-list > div:nth-child(1) > div:nth-child(1) > ul:nth-child(1) > li:nth-child(3)") data_num = mypositivecomment.get_attribute('data-num') mprint("2所有評價") print mypositivecomment.text, data_num if "所有評價" in str(mypositivecomment.text.encode("utf-8")): debugprint("第2次判斷正確 是所有評價按鈕") if data_num is not None: return data_num else: mprint("所有評價的值沒有正確加載 請手動查找") return cannotgetdataprint(mypositivecomment.text) else: debugprint("第2次判斷錯誤 按鈕找到不是所有評價 聯繫開發提供截圖") except: debugprint("第2次抓所有評價失敗 繼續第二次") return cannotgetdataprint("所有評價") """ def getpositivecomment(): css_sele1 = '#comment > div.mc > div.J-comments-list.comments-list.ETab > div.tab-main.small > ul > li:nth-child(4)' css_sele2 = '#comments-list > div.mt > div > ul > li:nth-child(3)' css_sele3 = '#comments-list > div:nth-child(1) > div:nth-child(1) > ul:nth-child(1) > li:nth-child(3)' return get_datanum_bycssselectorlist([css_sele1, css_sele2, css_sele3], "好評(") """ try: mypositivecomment = get_element_bycssselector(css_sele1) data_num = mypositivecomment.get_attribute('data-num') mprint("1好評") if isString("好評(", mypositivecomment.text): print mypositivecomment.text + ":" + str(data_num) # mprint ("顯示好評") return data_num else: mprint("好評數量沒法獲取") except: debugprint("第一次抓好評失敗 繼續第二次") pass try:#書籍 香蕉 mypositivecomment = get_element_bycssselector(css_sele2) data_num = mypositivecomment.get_attribute('data-num') mprint("2好評") if isString("好評(", mypositivecomment.text): print mypositivecomment.text + ":" + str(data_num) # mprint ("顯示好評") return data_num else: mprint("好評數量沒法獲取") except: pass try:#?? mypositivecomment = get_element_bycssselector(css_sele3) data_num = mypositivecomment.get_attribute('data-num') if isString("好評(", mypositivecomment.text): mprint ("第3次獲取到好評") print mypositivecomment.text + ":" + str(data_num) # mprint ("顯示好評") return data_num else: mprint("好評數量沒法獲取") print mypositivecomment.text + ":" + str(data_num) # mprint ("顯示好評") except: mprint("沒法獲取到好評") return warnningtext() """ def getmoderatecomment(): css_sele1 = '#comment > div.mc > div.J-comments-list.comments-list.ETab > div.tab-main.small > ul > li:nth-child(5)' css_sele2 = '#comments-list > div:nth-child(1) > div:nth-child(1) > ul:nth-child(1) > li:nth-child(4)' return get_datanum_bycssselectorlist([css_sele1, css_sele2], "中評(") """ try: css_sele = '#comment > div.mc > div.J-comments-list.comments-list.ETab > div.tab-main.small > ul > li:nth-child(5)' mymoderatecomment = driver_wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, css_sele))) # mymoderatecomment = webdriver_chrome.find_element_by_css_selector( # "#comments-list > div:nth-child(1) > div:nth-child(1) > ul:nth-child(1) > li:nth-child(4)") data_num = mymoderatecomment.get_attribute('data-num') mprint("1中評") print mymoderatecomment.text + ":" + str(data_num) # mprint("顯示中評") return data_num except: pass try: css_sele = '#comments-list > div:nth-child(1) > div:nth-child(1) > ul:nth-child(1) > li:nth-child(4)' mymoderatecomment = driver_wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, css_sele))) # mymoderatecomment = webdriver_chrome.find_element_by_css_selector( # "#comments-list > div:nth-child(1) > div:nth-child(1) > ul:nth-child(1) > li:nth-child(4)") data_num = mymoderatecomment.get_attribute('data-num') print mymoderatecomment.text + ":" + str(data_num) # mprint("顯示中評") mprint("2中評") return data_num except: mprint("第二次中評失敗 聯繫開發") """ def getnegativecomment(): css_sele1 = '#comment > div.mc > div.J-comments-list.comments-list.ETab > div.tab-main.small > ul > li:nth-child(6)' css_sele2 = '#comments-list > div:nth-child(1) > div:nth-child(1) > ul:nth-child(1) > li:nth-child(5)' return get_datanum_bycssselectorlist([css_sele1, css_sele2], "差評(") """ try: css_sele = '#comment > div.mc > div.J-comments-list.comments-list.ETab > div.tab-main.small > ul > li:nth-child(6)' mynegativecomment = driver_wait.until (EC.element_to_be_clickable((By.CSS_SELECTOR, css_sele))) data_num = mynegativecomment.get_attribute('data-num') mprint("1差評") print mynegativecomment.text+":"+str(data_num) # mprint ("顯示差評") return data_num except: debugprint("第一次差評失敗") try: css_sele = '#comments-list > div:nth-child(1) > div:nth-child(1) > ul:nth-child(1) > li:nth-child(5)' mynegativecomment = driver_wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, css_sele))) data_num = mynegativecomment.get_attribute('data-num') print mynegativecomment.text + ":" + str(data_num) # mprint ("顯示差評") mprint("2差評") return data_num except: mprint("第2次差評失敗 聯繫開發") """ def getaddcomment():#追評 css_sele1 = '#comment > div.mc > div.J-comments-list.comments-list.ETab > div.tab-main.small > ul > li.J-addComment' return get_datanum_bycssselectorlist ([css_sele1, ], "追評(") """ try: css_sele = '#comment > div.mc > div.J-comments-list.comments-list.ETab > div.tab-main.small > ul > li.J-addComment' maddcomment = driver_wait.until(EC.element_to_be_clickable ((By.CSS_SELECTOR, css_sele))) data_num = maddcomment.get_attribute('data-num') print maddcomment return data_num except: return "若是前面都沒問題 可能這個連接沒有追評 能夠手動確認".decode("utf-8") """ def getcurrenturl(): # debugprint("打印當前頁面url: "+str(webdriver_chrome.current_url)) return webdriver_chrome.current_url def mwrite(linenum, zlist): #放一個 要保存的 行數 和 數據list count = len(zlist) #列表數據的長度 mprint("準備插入第 "+str(linenum+1)+" 條數據,一共:"+str(count)+"列") title_style = xlwt.easyxf('font: name Times New Roman, color-index red, bold on', num_format_str='#,##0.00') if linenum == 0: global wb global ws wb = xlwt.Workbook() ws = wb.add_sheet("京東666".decode("utf-8")) for i in range(0, count):#列數 if i == 0: mprint("寫入以下數據") if linenum == 0:#第1條數據待插入 須要先把標題插入0 再把第一條數據插入1 ws.write(linenum, i, title[i].decode("utf-8"), title_style)#寫標題 ws.write(linenum+1, i, zlist[i])#這個write是一個覆蓋操做 若是沒write就放空 print title[i].decode("utf-8"), zlist[i] wb.save(xls_name) # if i == (count-1): # mprint("完成本條數據寫入") else: # 第2+條數據開始插入 ws = wb.get_sheet(0) ws.write(linenum+1, i, zlist[i]) print title[i].decode ("utf-8"), zlist[i] wb.save(xls_name) # mprint("第"+str(linenum+1)+"條數據寫入成功,還剩"+(sumurlcount-linenum)+"條數據待解析") class MyThread_totalcom(Thread): def __init__(self): Thread.__init__(self) def run(self): # totalcom = totalcomment() self.totalcom = totalcomment() def get_result(self): return self.totalcom class MyThread_showpic(Thread): def __init__(self): Thread.__init__(self) def run(self): self.showpic = getshowpicnum() def get_result(self): return self.showpic def getall(url): starttime = datetime.datetime.now() RETURN_CODE = openweb(url) print RETURN_CODE,'RETURN_CODE' if RETURN_CODE:#TRUE: skip and warning try: if RETURN_CODE == 2: mprint("頁面被跳轉") skiplist = [url, "!!頁面被跳轉".decode("utf-8"), RETURN_CODE, "", "", "", "", ""] return skiplist else:#1 mprint("沒法訪問 檢查網絡是否故障") skiplist = [url, "!!檢查是否沒法打開網頁".decode("utf-8"), RETURN_CODE, "", "", "", "", ""] return skiplist except: mprint("???") skiplist = [url, "!!跳過該條連接".decode("utf-8"), "???????????????????".decode("utf-8"), "", "", "", "", ""] return skiplist else:#FALSE :continue to get the data # starttime = datetime.datetime.now () endtime = datetime.datetime.now() timed = (endtime - starttime).seconds mprint("網頁已經被打開,耗時:"+str(timed)+"秒") debugprint('scrolldown1') #urlcurrent = getcurrenturl()#寫一個 若是連接被跳轉到其餘頁面就跳過的判斷 有時間再寫吧 urlcurrent可能變成 jd.com scrolldown() # msleep1() #scrolldown() # msleep2() debugprint('scrolldown2') name = getname() if isOffsale: # 下櫃 price = "商品已下櫃".decode ("utf-8") else: price = getprice() clickcommentbtn() # msleep2() #好評度能加載完成就能顯示曬圖 try: print u"好評度:", get_element_bycssselector("#comment > div.mc > div.comment-info.J-comment-info > div.comment-percent > div").text except: mprint("沒法獲取好評度,說明網絡加載緩慢") #想寫個多線程 不過單獨一個的時候正常 若是兩個都放進去就會出問題 難道是selenium不能同時find兩個element? mprint("多線程開始") thd1 = MyThread_totalcom() # thd2 = MyThread_showpic() thd1.start() mprint("MyThread_totalcom線程開始") # thd2.start() # mprint("MyThread_showpic程開始") thd1.join() # thd2.join() totalcom = thd1.get_result() # showpic = thd2.get_result() mprint("多線程結束") # totalcom = totalcomment()#上面用多線程這裏就註釋掉 showpic = getshowpicnum() #上面多線程 只能跑一個 totalcomment和getshowpicnum一塊兒就出問題 好像不是我多線程代碼有問題 是selenium不能同時find多個元素 positivcom = getpositivecomment() modertcom = getmoderatecomment() negtivcom = getnegativecomment() # addcomment = getaddcomment() sumlist = [url, name, price, showpic, positivcom, modertcom, negtivcom, totalcom] # sumlist = [url, name, price, showpic, positivcom, modertcom, negtivcom ,addcomment] # print sumlist return sumlist # a list if __name__ == '__main__': try:#__main__ # print type(now_time), type("時間") print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) # print ".", # time.sleep(0.2) # print ".", # time.sleep(0.2) # print ".", # time.sleep(0.2) # print ".", # time.sleep(0.2) # print ".", # time.sleep(0.2) # print ".", # time.sleep(0.2) # print ".", # time.sleep(0.2) # print ".", # time.sleep(0.2) # print ".", # time.sleep(0.2) # print ".", # time.sleep(0.2) # print ".", # time.sleep(0.2) # print ".", # time.sleep(0.2) # print ".", # time.sleep(0.2) # print ".", # time.sleep(0.2) # print ".", # time.sleep(0.2) # print ".", # time.sleep(0.2) # print ".", # time.sleep(0.2) cc = 0 # URLSource total_starttime = datetime.datetime.now() f = open(URLSource, "r") lines = f.readlines() # 讀取所有內容 global sumurlcount sumurlcount = len(lines) print sumurlcount mprint("一共 "+str(sumurlcount)+" 條數據要爬蟲") for jdurl in lines: #for i in urllist: s = [] print jdurl one_starttime = datetime.datetime.now () goodsinfo_list = getall(jdurl.replace("\n", "")) print "test111111111" # print goodsinfo_list mwrite(cc, goodsinfo_list) oneurl_endtime = datetime.datetime.now () oneurl_timed = (oneurl_endtime - one_starttime).seconds mprint ("該條數據寫入完成耗時:" + str (oneurl_timed) + "秒,還剩"+str(sumurlcount - cc - 1)+"條數據待分析,即將開始下一個連接的抓取!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") cc = cc + 1 mprint("@@@@@$$$$$$$$@@@@@ 全部代碼正常運行 無報錯 @@@@@@@@@@@$$$$$$$$$$$$$$$@@@@@@@@@@@@@@@@") total_endtime = datetime.datetime.now () total_timed = (total_endtime - total_starttime).seconds mprint ("整個爬蟲一共耗時:" + str (total_timed) + "秒"+",單條連接平均爬蟲耗時:"+str((round(total_timed/sumurlcount,2)))+ "秒") except Exception, e: print Exception, e mprint("~~~~~~~~中間有 報錯了@@@@@@@@@@@@@@@@") finally: mprint("sleep 10s後關閉瀏覽器") time.sleep(10) webdriver_chrome.quit()