"""Scrape novel search results from search.zongheng.com with Selenium.

The script types a fixed keyword into the site's search box, reads the total
page count from the result page, then jumps to every following page and prints
one dict per search hit that the extraction regex matches.

Importing this module prints ``SERVICE_ARGS`` and starts a PhantomJS session
(``bro``); that module-level behaviour is kept from the original script.
"""
import re  # regular expressions
from multiprocessing import Pool  # noqa: F401  (no longer used here; import kept for compatibility)

from pyquery import PyQuery as pq  # HTML parsing library
from requests.exceptions import RequestException  # noqa: F401  (no longer used here; import kept for compatibility)
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from config import *  # noqa: F401,F403  (settings from the local config module, e.g. SERVICE_ARGS)

print(SERVICE_ARGS)

# Headless browser: runs in the background without opening a window.
# The original also carried a disabled ``webdriver.Chrome()`` alternative,
# which needs chromedriver installed beforehand.
# NOTE(review): PhantomJS support was deprecated and later removed from
# Selenium; confirm the pinned Selenium version still provides it.
bro = webdriver.PhantomJS(service_args=SERVICE_ARGS)

# Explicit-wait helper shared by every function below (10 second timeout).
wait = WebDriverWait(bro, 10)

# NOTE(review): the original had a disabled ``bro.set_window_size(1400, 900)``
# call; re-enable it if the page layout turns out to need a larger viewport.

# How many times a page interaction is attempted before the timeout is
# allowed to propagate to the caller.
_MAX_ATTEMPTS = 3

# CSS selector of the container holding the search results and the pager.
_RESULT_TAB = ('body > div.wrap > div.search-html-box.clearfix'
               ' > div.search-main.fl > div.search-tab')
_PAGER = _RESULT_TAB + ' > div.search_d_pagesize'

# Pulls eight capture groups out of the serialized result markup.
# Compiled once instead of once per result container.
_BOOK_RE = re.compile(
    '.*?src="(.*?)" onerro.*?class="tit"><a href="(.*?)" target.*?">(.*?)</a>.*?}">(.*?)</a>.*?">(.*?)</a>.*?</em><span>(.*?)</span>.*?em><span>(.*?)</span>.*?<p>.*?(.*?)</p>',
    re.S,
)


def _css(selector):
    """Return a Selenium locator tuple for a CSS selector."""
    return (By.CSS_SELECTOR, selector)


def search():
    """Run the keyword search and return the total-page text.

    Opens the search page, submits the keyword, waits for the ``#totalPage``
    element, prints the first result page via ``get_pr`` and returns the
    element's text.

    Raises:
        TimeoutException: if an expected element does not appear within the
            wait timeout on every one of ``_MAX_ATTEMPTS`` attempts.
    """
    for attempt in range(1, _MAX_ATTEMPTS + 1):
        print('正在搜索...')
        try:
            bro.get('http://search.zongheng.com')
            search_box = wait.until(EC.presence_of_element_located(
                _css('#commSearch > div > input.search-text.fl')))
            submit = wait.until(EC.presence_of_element_located(
                _css('#commSearch > div > input.search-btn.fr')))
            search_box.send_keys('都市')  # type the keyword
            submit.click()
            total = wait.until(
                EC.presence_of_element_located(_css('#totalPage')))
            get_pr()
            return total.text
        except TimeoutException:
            # Retry a bounded number of times; the original recursed without
            # returning the retried result, so the caller received None.
            if attempt == _MAX_ATTEMPTS:
                raise


def next_page(total):
    """Jump to result page ``total`` and print its results.

    Args:
        total: page number to jump to (the name is kept from the original
            signature; ``main`` passes the page index here).

    Raises:
        TimeoutException: if the pager controls or the active-page marker do
            not appear within the wait timeout on every attempt.
    """
    for attempt in range(1, _MAX_ATTEMPTS + 1):
        print('正在翻頁', total)
        try:
            page_box = wait.until(EC.presence_of_element_located(
                _css(_PAGER + ' > input.search_d_page_value')))
            submit = wait.until(EC.presence_of_element_located(
                _css(_PAGER + ' > input.search_d_page_submit')))
            page_box.clear()  # drop whatever page number is already there
            page_box.send_keys(str(total))
            submit.click()
            # The page counts as loaded once the highlighted pager link shows
            # the requested page number.
            wait.until(EC.text_to_be_present_in_element(
                _css(_PAGER + ' > a.active'), str(total)))
            get_pr()
            return
        except TimeoutException:
            # Selenium waits raise TimeoutException; the original caught
            # requests' RequestException, which is never raised here.
            if attempt == _MAX_ATTEMPTS:
                raise


def get_pr():
    """Print one dict per search hit found on the current page.

    Waits for the result container, parses the page source with PyQuery and
    applies ``_BOOK_RE`` to each matched container's markup.

    Returns:
        list[dict]: the records that were printed (the original returned
        ``None``; callers that ignore the return value are unaffected).
    """
    wait.until(EC.presence_of_all_elements_located(_css(_RESULT_TAB)))
    doc = pq(bro.page_source)
    products = []
    for item in doc(_RESULT_TAB).items():
        for fields in _BOOK_RE.findall(str(item)):
            # Capture group 5 is matched but intentionally not used, exactly
            # as in the original mapping.
            product = {
                '圖片': fields[0],  # image: the src attribute
                '地址': fields[1],  # address: the title link href
                '書名': fields[2],  # book title
                '做者': fields[3] + fields[4],  # presumably author; two link texts joined
                '連載': fields[6],  # presumably serialization status
                '介紹': fields[7],  # introduction text
            }
            print(product)
            products.append(product)
    return products


def main():
    """Search once, walk every remaining result page, then quit the browser."""
    try:
        total = search()
        for page in range(2, int(total) + 1):
            next_page(page)
    finally:
        # quit() ends the whole driver session; done in ``finally`` so the
        # browser is shut down even when scraping fails part-way.
        bro.quit()


if __name__ == '__main__':
    main()