【爬蟲】京東商品鏈接

# -*- coding: utf-8 -*-
from __future__ import division
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import requests
from threading import Thread
from pyquery import PyQuery as pq
import chardet
import copy
import xlwt
import os
import mPing
import datetime
import xlwt
from xlrd import open_workbook
# Wall-clock time of this run (HH-MM-SS); embedded in the output file name
# so that every run writes to its own workbook.
now_time = time.strftime('%H-%M-%S', time.localtime(time.time()))
print now_time
# Name of the .xls workbook written by mwrite(), decoded to unicode.
xls_name = ("京東爬蟲數據"+str(now_time)+".xls").decode("utf-8")
# Column headers written to the first row of the sheet by mwrite(); the order
# matches the row lists returned by getall().
title = ["連接", "名稱", "價格", "曬圖", "好評", "中評", "差評", "所有評價"]
# Sample product URLs. Not read by the active code visible in this file; the
# main loop takes its URLs from the URLSource file instead.
urllist = ["https://item.jd.com/11936238.html",
           "https://item.jd.com/11841674.html"
           ]
# Text file read line by line in the __main__ section; it must exist in the
# current working directory or the script stops here.
URLSource = "京東URL.txt".decode('utf-8')
if os.path.isfile(URLSource):
    print "發現URL文件,準備開始爬蟲".decode('utf-8')
else:
    print "親!!! 當前目錄下的url文件:   \"".decode('utf-8')+URLSource+"\"    不存在,請添加後再運行".decode('utf-8')
    # Abort with a non-zero exit status when the URL file is missing.
    exit(1)


def msleep1():
    """Block the caller for one second."""
    one_second = 1
    time.sleep(one_second)


def msleep2():
    print "...2",
    time.sleep(1)
    print "...1",
    time.sleep(1)
    print "...0"


def msleep3():
    print "5",
    time.sleep(1)
    print "...4",
    time.sleep(1)
    print "...3",
    time.sleep(1)
    print "...2",
    time.sleep(1)
    print "...1",
    time.sleep(1)
    print "...0"


def warnningtext():
    """Return the unicode placeholder stored in a cell when a value could not be scraped."""
    message = "這裏沒法正確獲取數據(偶爾網速問題會影響一兩個數據),請手動檢查,若是是代碼問題請聯繫開發修改"
    return message.decode("utf-8")


def cannotgetdataprint(text):
    print ("沒法獲取"+text+" 請手動檢查一下而後聯繫開發人員").decode('utf-8')


def mprint(str):
    #print  "",
    print "#############   " + str.decode('utf-8') + "   #############"


def debugprint(str):
    print  "",   #不換行空輸出   "" 後面加 ,
    print "debugprint@@@   " + str.decode('utf-8')


def totwrite(str):
    """Decode the UTF-8 byte string *str* and return the resulting unicode text."""
    decoded = str.decode('utf-8')
    return decoded

# mPing.mNetPing('jd.com')

# chromeOptions = webdriver.ChromeOptions()
# prefs = {"profile.managed_default_content_settings.images":2}
# chromeOptions.add_experimental_option("prefs",prefs)
# driver = webdriver.Chrome(chrome_options=chromeOptions)

# Chrome preference that disables image loading (per the original author's
# note: "do not show images").
prefs = {"profile.managed_default_content_settings.images":2}
option = webdriver.ChromeOptions()
option.add_argument("test-type")  # original author's note: do not show warnings
option.add_experimental_option("prefs",prefs)  # original author's note: do not show images
# NOTE: a `global` statement at module level has no effect; the names below
# are module globals either way.
global timesurl
timesurl = 1
global webdriver_chrome
# Original author's note: PhantomJS could not load the AJAX content, so a real
# Chrome instance is used to render the dynamically loaded parts of the page.
# Shared browser instance used by every scraping helper in this file; it is
# started at import time.
webdriver_chrome = webdriver.Chrome(chrome_options=option)

def isUrlBefore():
    """Placeholder, not implemented yet.

    Intended (per the original author's note) to detect whether the browser
    was redirected after opening a URL, so that the URL can be skipped and a
    warning written instead. Currently does nothing and returns None.
    """

def isString(isstr, data):
    """Return True when *isstr* occurs in the UTF-8 encoding of *data*, else False.

    *data* is encoded to UTF-8 and passed through str() before the
    substring test.
    """
    encoded = str(data.encode("utf-8"))
    return isstr in encoded


def openweb(url):
    global  starttime
    global driver_wait
    global isOffsale
    COUNTINUE = False
    SKIP = 1
    TIAOZHUAN = 2
    LOADERROR = 3
    FATALERROR = 4

    mprint("努力加載連接中,請耐心等待")
    try:
        try:#獲取源碼進行判斷
            respone = requests.get(url)
            #正確打開鏈接
            isOffsale = False #初始化設置爲不下櫃
            if respone.status_code == 200:#正確加載價格頁面包括下櫃的頁面
                if "商品評價" in str(respone.text.encode("utf-8")):#說明頁面正常訪問到商品頁面  不然可能被跳轉了
                    # print respone.text
                    isOffsale = False
                    if "商品已下櫃" in str(respone.text.encode("utf-8")):
                        isOffsale = True
                else:
                    return TIAOZHUAN #說明頁面不是價格頁面  被跳轉了?
            else:#沒法打開鏈接
                return LOADERROR#狀態碼不是200說明訪問有問題
        except Exception, e:
            print Exception, e#沒法獲取源碼
            return FATALERROR
    #如下代碼應該不會被執行
        webdriver_chrome.get(url)
        # mprint("獲取當前地址")
        if "?c" in getcurrenturl():#有了上面的if "商品評價" in判斷後這段代碼應該不會被執行到
            mprint("地址已經被跳轉")
            return SKIP
        driver_wait = WebDriverWait(webdriver_chrome, 10)
        return COUNTINUE
    except Exception:
        mprint("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!請注意,連接有問題 沒法打開 程序可能中止!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        print url
        print getcurrenturl()
        return SKIP
    finally:
        debugprint("打印url")


def get_element_bycssselector(css_selector):
    """Wait (via the global ``driver_wait``) until an element matching *css_selector* is present and return it.

    Whatever ``driver_wait.until`` raises when the wait fails is propagated.
    """
    locator = (By.CSS_SELECTOR, css_selector)
    return driver_wait.until(EC.presence_of_element_located(locator))


def get_datanum_bycssselectorlist(css_selector_list, text):
    for css_selector in css_selector_list:
        try:
            # print css_selector
            element = driver_wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, css_selector)))
            data_num = element.get_attribute('data-num')
            if isString(text, element.text):
                print element.text + ":" + str(data_num)  # mprint ("顯示好評")
                return data_num
            else:
                mprint("沒法獲取")
        except:
            pass
    return warnningtext()


def get_element_byxpathlist(xpath_list, text):
    for xpath in xpath_list:
        try:
            element = driver_wait.until(EC.element_to_be_clickable((By.XPATH, xpath)))
            # print element.text
            if isString(text, element.text):
                print element.text
                return element
            else:
                mprint("沒法獲取xpath以下")
                print xpath
        except:
            mprint(xpath)
            pass
    return None

# def try_element(element):
#     try:
#         element
#     except:
#         pass



def getname():
    debugprint("start find name btn")
    try:
        myname = webdriver_chrome.find_element_by_class_name('sku-name')
        mprint("1名稱:")
        print myname.text
        return myname.text
    except Exception:
        pass
    try:
        myname = webdriver_chrome.find_element_by_css_selector('#name > h1')
        mprint("2名稱:")#生鮮 書籍
        print myname.text
        return myname.text
    except Exception:
        pass
    try:
        myname = webdriver_chrome.find_element_by_css_selector('#name')
        mprint("3名稱:")#生鮮 書籍
        print myname.text
        return myname.text
    except Exception:
        mprint("第 3次 抓取商品名稱失敗")
        return warnningtext()


def getprice():
    debugprint("start getprice")
    try:
        myprice = driver_wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.summary.summary-first > div > div.summary-price.J-summary-price > div.dd > span')))
        mprint("1價格:")
        # print myprice.text
        finalprice = myprice.text.encode ('utf-8').replace ('', '')
        if finalprice == "":
            msleep1()
            finalprice = myprice.text.encode ('utf-8').replace ('', '')
            if finalprice == "":
                msleep2 ()
                finalprice = myprice.text.encode ('utf-8').replace ('', '')
                if finalprice == "":
                    msleep3 ()
                    finalprice = myprice.text.encode ('utf-8').replace ('', '')
        print finalprice
        return finalprice
    except Exception:#估計下架 作下架的抓取
        pass
    try:  # 生鮮 書籍 抓取價格
        myprice = driver_wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#jd-price")))  # 生鮮 可用
        # myprice = webdriver_chrome.find_element_by_xpath("/html/body/div[7]/div/div[2]/div[3]/div/div[1]/div[2]/span/span[2]")
        mprint("2價格:")
        # print myprice.text
        finalprice = myprice.text.encode ('utf-8').replace ('', '')
        if finalprice == "":
            msleep1 ()
            finalprice = myprice.text.encode ('utf-8').replace ('', '')
            if finalprice == "":
                msleep2 ()
                finalprice = myprice.text.encode ('utf-8').replace ('', '')
                if finalprice == "":
                    msleep3 ()
                    finalprice = myprice.text.encode ('utf-8').replace ('', '')
        print finalprice
        return finalprice
    except Exception:  # 估計下架 作下架的抓取
        pass
    try:  # 生鮮 書籍 抓取價格
        myprice = driver_wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.summary-price.J-summary-price > div > div.dd > span > span")))  # 生鮮 可用
        mprint("3價格:")
        # print myprice.text
        finalprice = myprice.text.encode ('utf-8').replace ('', '')
        if finalprice == "":
            msleep1 ()
            finalprice = myprice.text.encode ('utf-8').replace ('', '')
            if finalprice == "":
                msleep2 ()
                finalprice = myprice.text.encode ('utf-8').replace ('', '')
                if finalprice == "":
                    msleep3 ()
                    finalprice = myprice.text.encode ('utf-8').replace ('', '')
        print finalprice
        return finalprice
    except Exception:  # 估計下架 作下架的抓取
        pass

    try:  # 下架的抓取  前面判斷了下架 這裏基本上不會執行了
        mprint("4下架:")
        soldout = webdriver_chrome.find_element_by_class_name('itemover-tip')  # 抓下櫃 下架 「該商品已下櫃,歡迎挑選其餘商品!」

        print  soldout.text

        return soldout.text
    except Exception:
        mprint("抓不到價格 也不是下架 請檢查")
        return warnningtext()


def scrolldown():
    """Scroll the page in ``webdriver_chrome`` down by 500 pixels, logging before and after."""
    scroll_script = "window.scrollBy(0,500)"
    debugprint("準備開始滾動500")
    webdriver_chrome.execute_script(scroll_script)
    debugprint("已向下滾動500")


def clickcommentbtn():
    xpath1 = '//*[@id="detail"]/div[1]/ul/li[5]'
    xpath2 = '//*[@id="detail"]/div[1]/ul/li[4]'
    # xpath3 = '#comments-list > div:nth-child(1) > div:nth-child(1) > ul:nth-child(1) > li:nth-child(3)'
    btn = get_element_byxpathlist([xpath1, xpath2], "商品評價")
    if btn is not None:
        try:
            btn.click()
            # mprint("xpath點擊")
        except Exception, e:
            mprint("btn非空 不過點擊失敗了 通常不會這樣的 報錯是不是:Element is not clickable at point (697, 299). Other element would receive the click")
            print Exception, e
    else:
        # pass#其餘判斷  基本上不會到這裏
        css_sele1 = '# detail > div.tab-main.large > ul > li:nth-child(4)'
        css_sele2= '#detail > div.tab-main.large > ul > li.current'
        try:
            get_element_bycssselector(css_sele1).click()
            mprint("經過csssele獲取到")
            print css_sele1
        except:
            try:
                get_element_bycssselector(css_sele2).click()
                mprint("經過csssele獲取到")
                print css_sele1
            except:
                mprint("實在找不到 聯繫開發 程序可能終止")

    """
    try:#1#detail > div.tab-main.large > ul > li.current > s
        mysumcommentbtn = webdriver_chrome.find_element_by_xpath ('//*[@id="detail"]/div[1]/ul/li[5]')
        mprint("1點擊")
        print mysumcommentbtn.text,  # 三個按鈕的連接要用其餘的(運動戶外類)
        # mprint("運動戶外類?")
        if "商品評價" in str(mysumcommentbtn.text.encode("utf-8")):
            mysumcommentbtn.click()
            mprint("~~~~~~點擊了按鈕") #    這句有問題
            return True
        else:
            mprint("找不到按鈕 商品評價  繼續尋找2")
    except:
        pass

    try:#2

        mysumcommentbtn = webdriver_chrome.find_element_by_xpath ('//*[@id="detail"]/div[1]/ul/li[4]')
        mprint("2點擊")
        print mysumcommentbtn.text
        if "商品評價" in str(mysumcommentbtn.text.encode("utf-8")):
            mysumcommentbtn.click()
            mprint("~~~~~~點擊了評論總量按鈕")
            return True
        else:
            mprint("找到按鈕 不是商品評價  繼續尋找3")
    except:
        mprint("2點擊找不到繼續下一步")
        pass

    try:#3
        css_sele = '# detail > div.tab-main.large > ul > li:nth-child(4)'  # 香蕉
       # http: // item.jd.com / 11461683.html
        mysumcommentbtn = driver_wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, css_sele)))
        mprint("3點擊")
        print mysumcommentbtn.text
        if "商品評價" in str(mysumcommentbtn.text.encode("utf-8")):
            mysumcommentbtn.click()
            mprint("~~~~~~點擊了評論總量按鈕")
            return True
    except:
        mprint("找不到按鈕 商品評價  繼續尋找4 ")
        pass


    try:#4
        css_sele = '#detail-tab-comm'  # 書籍類比較多
        mysumcommentbtn = driver_wait.until (EC.element_to_be_clickable ((By.CSS_SELECTOR, css_sele)))
        mprint("4點擊")
        print mysumcommentbtn.text
        if "商品評價" in str(mysumcommentbtn.text.encode("utf-8")):
            mysumcommentbtn.click()
            mprint("~~~~~~點擊了評論總量按鈕")
            return True
    except:
        mprint("找不到按鈕 商品評價  繼續尋找5")
        pass
    try:#5
        css_sele = '#detail > div.tab-main.large > ul > li.current'  # 香蕉 書籍
        # http: // item.jd.com / 11461683.html
        mysumcommentbtn = driver_wait.until (EC.element_to_be_clickable ((By.CSS_SELECTOR, css_sele)))
        mprint("5點擊")
        print mysumcommentbtn.text
        if "商品評價" in str(mysumcommentbtn.text.encode("utf-8")):
            mysumcommentbtn.click()
            mprint("~~~~~~點擊了評論總量按鈕")
            return True
        else:
            mprint("第五次也找不到 只能手動找了")
            print getcurrenturl()
            return warnningtext()

    except:
        mprint("沒法找到商品評價按鈕 請聯繫開發 提供url:")
        print getcurrenturl()
        return warnningtext()
        """


def getshowpicnum():
    """Return the number of reviews with pictures (the tab's ``data-num`` value).

    The lookup is repeated up to three times because the counter may not be
    filled in yet when the tab first appears. If it is still missing after
    the last try, ``warnningtext()`` is returned so the spreadsheet cell
    carries the same placeholder as every other failed lookup, instead of
    silently ending up as None.
    """
    css_sele1 = '#comment > div.mc > div.J-comments-list.comments-list.ETab > div.tab-main.small > ul > li:nth-child(2)'
    css_sele2 = '#comments-list > div.mt > div > ul > li:nth-child(2)'
    for _ in range(3):
        pic_num = get_datanum_bycssselectorlist([css_sele1, css_sele2], "曬圖")
        if pic_num is not None:
            return pic_num
        mprint("shaitu")
    return warnningtext()


def totalcomment():
    """Return the total review count (``data-num`` of the "all reviews" tab).

    Delegates to get_datanum_bycssselectorlist with the two known page
    layouts; that helper returns ``warnningtext()`` when neither matches.
    """
    candidate_selectors = [
        '#comment > div.mc > div.J-comments-list.comments-list.ETab > div.tab-main.small > ul > li.current',
        '#comments-list > div.mt > div > ul > li.ui-switchable-item.trig-item.curr',
    ]
    return get_datanum_bycssselectorlist(candidate_selectors, "所有評價")


def getpositivecomment():
    """Return the positive review count (``data-num`` of the "positive" tab).

    Delegates to get_datanum_bycssselectorlist with the three known page
    layouts; that helper returns ``warnningtext()`` when none matches.
    """
    candidate_selectors = [
        '#comment > div.mc > div.J-comments-list.comments-list.ETab > div.tab-main.small > ul > li:nth-child(4)',
        '#comments-list > div.mt > div > ul > li:nth-child(3)',
        '#comments-list > div:nth-child(1) > div:nth-child(1) > ul:nth-child(1) > li:nth-child(3)',
    ]
    return get_datanum_bycssselectorlist(candidate_selectors, "好評(")


def getmoderatecomment():
    """Return the neutral review count (``data-num`` of the "neutral" tab).

    Delegates to get_datanum_bycssselectorlist with the two known page
    layouts; that helper returns ``warnningtext()`` when neither matches.
    """
    candidate_selectors = [
        '#comment > div.mc > div.J-comments-list.comments-list.ETab > div.tab-main.small > ul > li:nth-child(5)',
        '#comments-list > div:nth-child(1) > div:nth-child(1) > ul:nth-child(1) > li:nth-child(4)',
    ]
    return get_datanum_bycssselectorlist(candidate_selectors, "中評(")


def getnegativecomment():
    """Return the negative review count (``data-num`` of the "negative" tab).

    Delegates to get_datanum_bycssselectorlist with the two known page
    layouts; that helper returns ``warnningtext()`` when neither matches.
    """
    candidate_selectors = [
        '#comment > div.mc > div.J-comments-list.comments-list.ETab > div.tab-main.small > ul > li:nth-child(6)',
        '#comments-list > div:nth-child(1) > div:nth-child(1) > ul:nth-child(1) > li:nth-child(5)',
    ]
    return get_datanum_bycssselectorlist(candidate_selectors, "差評(")


def getaddcomment():
    """Return the follow-up review count (``data-num`` of the "follow-up" tab).

    Delegates to get_datanum_bycssselectorlist with the single known
    selector; that helper returns ``warnningtext()`` when it does not match.
    """
    candidate_selectors = [
        '#comment > div.mc > div.J-comments-list.comments-list.ETab > div.tab-main.small > ul > li.J-addComment',
    ]
    return get_datanum_bycssselectorlist(candidate_selectors, "追評(")


def getcurrenturl():
    """Return the URL currently loaded in the shared ``webdriver_chrome`` browser."""
    current = webdriver_chrome.current_url
    return current


def mwrite(linenum, zlist): #放一個 要保存的 行數 和 數據list
    count = len(zlist) #列表數據的長度

    mprint("準備插入第 "+str(linenum+1)+" 條數據,一共:"+str(count)+"")
    title_style = xlwt.easyxf('font: name Times New Roman, color-index red, bold on', num_format_str='#,##0.00')
    if linenum == 0:
        global wb
        global ws
        wb = xlwt.Workbook()
        ws = wb.add_sheet("京東666".decode("utf-8"))
    for i in range(0, count):#列數
        if i == 0:
            mprint("寫入以下數據")
        if linenum == 0:#第1條數據待插入  須要先把標題插入0 再把第一條數據插入1
            ws.write(linenum, i, title[i].decode("utf-8"), title_style)#寫標題
            ws.write(linenum+1, i, zlist[i])#這個write是一個覆蓋操做 若是沒write就放空
            print title[i].decode("utf-8"), zlist[i]
            wb.save(xls_name)
            # if i == (count-1):
            #     mprint("完成本條數據寫入")
        else:   #  第2+條數據開始插入
            ws = wb.get_sheet(0)
            ws.write(linenum+1, i, zlist[i])
            print title[i].decode ("utf-8"), zlist[i]
            wb.save(xls_name)

    # mprint(""+str(linenum+1)+"條數據寫入成功,還剩"+(sumurlcount-linenum)+"條數據待解析")

class MyThread_totalcom(Thread):
    """Worker thread that runs totalcomment() and keeps its result."""

    def __init__(self):
        Thread.__init__(self)
        # Pre-set so get_result() returns None rather than raising
        # AttributeError when run() has not stored a result.
        self.totalcom = None

    def run(self):
        """Fetch the total review count and store it on the instance."""
        self.totalcom = totalcomment()

    def get_result(self):
        """Return the value stored by run(), or None if there is none."""
        return self.totalcom

class MyThread_showpic(Thread):
    """Worker thread that runs getshowpicnum() and keeps its result."""

    def __init__(self):
        Thread.__init__(self)
        # Pre-set so get_result() returns None rather than raising
        # AttributeError when run() has not stored a result.
        self.showpic = None

    def run(self):
        """Fetch the picture-review count and store it on the instance."""
        self.showpic = getshowpicnum()

    def get_result(self):
        """Return the value stored by run(), or None if there is none."""
        return self.showpic

def getall(url):
    starttime = datetime.datetime.now()
    RETURN_CODE = openweb(url)
    print RETURN_CODE,'RETURN_CODE'


    if RETURN_CODE:#TRUE: skip and warning
        try:
            if RETURN_CODE == 2:
                mprint("頁面被跳轉")
                skiplist = [url, "!!頁面被跳轉".decode("utf-8"), RETURN_CODE, "", "", "", "", ""]
                return skiplist
            else:#1
                mprint("沒法訪問 檢查網絡是否故障")
                skiplist = [url, "!!檢查是否沒法打開網頁".decode("utf-8"), RETURN_CODE, "", "", "", "", ""]
                return skiplist
        except:
            mprint("???")
            skiplist = [url, "!!跳過該條連接".decode("utf-8"), "???????????????????".decode("utf-8"), "", "", "", "", ""]
            return skiplist

    else:#FALSE :continue to get the data
        # starttime = datetime.datetime.now ()
        endtime = datetime.datetime.now()
        timed = (endtime - starttime).seconds
        mprint("網頁已經被打開,耗時:"+str(timed)+"")
        debugprint('scrolldown1')
        #urlcurrent = getcurrenturl()#寫一個 若是連接被跳轉到其餘頁面就跳過的判斷  有時間再寫吧 urlcurrent可能變成 jd.com
        scrolldown()
        # msleep1()
        #scrolldown()
        # msleep2()
        debugprint('scrolldown2')
        name = getname()
        if isOffsale:  # 下櫃
            price = "商品已下櫃".decode ("utf-8")
        else:
            price = getprice()
        clickcommentbtn()
        # msleep2()
        #好評度能加載完成就能顯示曬圖
        try:
            print u"好評度:", get_element_bycssselector("#comment > div.mc > div.comment-info.J-comment-info > div.comment-percent > div").text
        except:
            mprint("沒法獲取好評度,說明網絡加載緩慢")
        #想寫個多線程  不過單獨一個的時候正常 若是兩個都放進去就會出問題 難道是selenium不能同時find兩個element?

        mprint("多線程開始")
        thd1 = MyThread_totalcom()

        # thd2 = MyThread_showpic()
        thd1.start()
        mprint("MyThread_totalcom線程開始")
        # thd2.start()
        # mprint("MyThread_showpic程開始")
        thd1.join()
        # thd2.join()
        totalcom = thd1.get_result()
        # showpic = thd2.get_result()
        mprint("多線程結束")

        # totalcom = totalcomment()#上面用多線程這裏就註釋掉
        showpic = getshowpicnum()
        #上面多線程 只能跑一個 totalcomment和getshowpicnum一塊兒就出問題 好像不是我多線程代碼有問題 是selenium不能同時find多個元素
        positivcom = getpositivecomment()
        modertcom = getmoderatecomment()
        negtivcom = getnegativecomment()
        # addcomment = getaddcomment()

        sumlist = [url, name, price, showpic, positivcom, modertcom, negtivcom, totalcom]
        # sumlist = [url, name, price, showpic, positivcom, modertcom, negtivcom ,addcomment]
        # print sumlist
        return sumlist # a list

if __name__ == '__main__':
    try:#__main__
        # print type(now_time), type("時間")
        print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        # print ".",
        # time.sleep(0.2)
        # print ".",
        # time.sleep(0.2)
        # print ".",
        # time.sleep(0.2)
        # print ".",
        # time.sleep(0.2)
        # print ".",
        # time.sleep(0.2)
        # print ".",
        # time.sleep(0.2)
        # print ".",
        # time.sleep(0.2)
        # print ".",
        # time.sleep(0.2)
        # print ".",
        # time.sleep(0.2)
        # print ".",
        # time.sleep(0.2)
        # print ".",
        # time.sleep(0.2)
        # print ".",
        # time.sleep(0.2)
        # print ".",
        # time.sleep(0.2)
        # print ".",
        # time.sleep(0.2)
        # print ".",
        # time.sleep(0.2)
        # print ".",
        # time.sleep(0.2)
        # print ".",
        # time.sleep(0.2)
        cc = 0
        # URLSource

        total_starttime = datetime.datetime.now()
        f = open(URLSource, "r")
        lines = f.readlines()  # 讀取所有內容
        global sumurlcount
        sumurlcount = len(lines)
        print sumurlcount
        mprint("一共 "+str(sumurlcount)+" 條數據要爬蟲")
        for jdurl in lines:
        #for i in urllist:
            s = []
            print jdurl
            one_starttime = datetime.datetime.now ()
            goodsinfo_list = getall(jdurl.replace("\n", ""))
            print "test111111111"
            # print goodsinfo_list
            mwrite(cc, goodsinfo_list)
            oneurl_endtime = datetime.datetime.now ()
            oneurl_timed = (oneurl_endtime - one_starttime).seconds
            mprint ("該條數據寫入完成耗時:" + str (oneurl_timed) + "秒,還剩"+str(sumurlcount - cc - 1)+"條數據待分析,即將開始下一個連接的抓取!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
            cc = cc + 1


        mprint("@@@@@$$$$$$$$@@@@@            全部代碼正常運行 無報錯          @@@@@@@@@@@$$$$$$$$$$$$$$$@@@@@@@@@@@@@@@@")
        total_endtime = datetime.datetime.now ()
        total_timed = (total_endtime - total_starttime).seconds
        mprint ("整個爬蟲一共耗時:" + str (total_timed) + ""+",單條連接平均爬蟲耗時:"+str((round(total_timed/sumurlcount,2)))+ "")

    except Exception, e:
        print Exception, e

        mprint("~~~~~~~~中間有 報錯了@@@@@@@@@@@@@@@@")
    finally:
        mprint("sleep 10s後關閉瀏覽器")
        time.sleep(10)
        webdriver_chrome.quit()
相關文章
相關標籤/搜索