Scraping conference and event data from the Huodongshu (活動樹) site (http://www.huodongshu.com/html/index.html).
The approach is similar to the one in [Python爬蟲] 之十一, which scraped the Huodongxing (活動行) site: multiple threads do the scraping. The difference is that on Huodongshu the url of a keyword's search results is fixed. For example, a search for 「數字」 returns 470 results at 10 records per page, yet the url of page 2 is exactly the same as the url of page 1, so further pages can only be reached by clicking the site's "next page" element rather than by loading a new url. Each keyword is therefore searched end-to-end by a single thread.
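
Because the result url never changes, paging is done by clicking the "next page" element in place and waiting for the list to refresh. Below is a minimal sketch of that pattern only (Python 2, Selenium with PhantomJS; the XPath values are taken from the config file shown at the end of this post, the keyword 「數字」 is just an example, and it assumes the "next page" link disappears on the last page, whereas the full script below counts pages instead):

# coding=utf-8
# Sketch only: in-place pagination on a fixed search url.
# XPaths come from the config at the end of the post; everything else is illustrative.
import time
import urllib
from selenium import webdriver
import selenium.webdriver.support.ui as ui

driver = webdriver.PhantomJS()
wait = ui.WebDriverWait(driver, 20)
driver.get('http://www.huodongshu.com/html/find_search.html?search_keyword=' + urllib.quote('數字'))
time.sleep(5)
while True:
    wait.until(lambda d: d.find_elements_by_xpath("//div[@id ='eventList']/div[@class ='list']"))
    records = driver.find_elements_by_xpath("//div[@id ='eventList']/div[@class ='list']")
    # ... process the (up to) 10 records on the current page here ...
    next_links = driver.find_elements_by_xpath("//dt[@class='next']/a")
    if len(next_links) == 0:
        break
    next_links[0].click()   # same url, new page content
    time.sleep(3)
driver.quit()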
The full code is as follows:
# coding=utf-8
import os
import re
from selenium import webdriver
import selenium.webdriver.support.ui as ui
from selenium.webdriver.common.keys import Keys
import time
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.select import Select
import IniFile
from threading import Thread
import LogFile
import urllib
import mongoDbBase

# Worker thread: scrapes all result pages for one search keyword
class ScrapyData_Thread(Thread):
    def __init__(self, webSearchUrl, pageCountLable, htmlLable, originalUrlLabel, nextUrlLabel, keyword, db):
        '''
        Constructor
        :param webSearchUrl: search page url (with the keyword already appended)
        :param pageCountLable: xpath of the element holding the total record count
        :param htmlLable: xpath of the record elements to scrape
        :param originalUrlLabel: xpath of the link to each record's original url
        :param nextUrlLabel: xpath of the "next page" link
        :param keyword: the single keyword this thread searches for
        :param db: database engine used to save the results
        '''
        Thread.__init__(self)
        self.webSearchUrl = webSearchUrl
        self.pageCountLable = pageCountLable
        self.htmlLable = htmlLable
        self.originalUrlLabel = originalUrlLabel
        self.nextUrlLabel = nextUrlLabel
        self.keyword = keyword
        self.db = db

        # To use IE instead of PhantomJS:
        # IEDriverServer = self.cf.GetValue("section", "IEDriverServer")
        # os.environ["webdriver.ie.driver"] = IEDriverServer
        # self.urldriver = webdriver.Ie(IEDriverServer)
        self.driver = webdriver.PhantomJS()
        self.wait = ui.WebDriverWait(self.driver, 20)
        self.driver.maximize_window()

    def compareDate(self, dateLeft, dateRight):
        '''
        Compare two dates
        :param dateLeft: date in the format 2017-03-04
        :param dateRight: date in the format 2017-03-04
        :return: 1: left > right, 0: equal, -1: left < right
        '''
        dls = dateLeft.split('-')
        drs = dateRight.split('-')
        if len(dls) > len(drs):
            return 1
        if int(dls[0]) == int(drs[0]) and int(dls[1]) == int(drs[1]) and int(dls[2]) == int(drs[2]):
            return 0
        if int(dls[0]) > int(drs[0]):
            return 1
        elif int(dls[0]) == int(drs[0]) and int(dls[1]) > int(drs[1]):
            return 1
        elif int(dls[0]) == int(drs[0]) and int(dls[1]) == int(drs[1]) and int(dls[2]) > int(drs[2]):
            return 1
        return -1

    def date_isValid(self, strDateText):
        '''
        Check whether a date/time string refers to a date that has not passed yet,
        i.e. the given time is later than now, or now falls inside the given range.
        :param strDateText: three possible formats: '2017.04.27 ~ 04.28'; '2017.04.20 08:30 ~ 12:30'; '2015.12.29 ~ 2016.01.03'
        :return: True: still valid; False: already in the past
        '''
        datePattern = re.compile(r'\d{4}-\d{2}-\d{2}')
        date = strDateText.replace('.', '-')
        strDate = re.findall(datePattern, date)
        currentDate = time.strftime('%Y-%m-%d')
        if len(strDate) == 2:
            if self.compareDate(strDate[1], currentDate) > 0:
                return True
        elif len(strDate) == 1:
            if self.compareDate(strDate[0], currentDate) >= 0:
                return True
            else:
                # formats like 2017-04-27 ~ 04-28: rebuild the end date from the start year
                datePattern = re.compile(r'\d{4}-\d{2}-\d{2}\s~\s\d{2}-\d{2}')
                strDate = re.findall(datePattern, date)
                if len(strDate) > 0:
                    if self.compareDate(strDate[0][0:5] + strDate[0][13:], currentDate) >= 0:
                        return True
                    else:
                        return False
        return False
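
    # A quick illustration of date_isValid (assuming today is 2017-05-01):
    #   date_isValid('2017.04.20 08:30 ~ 12:30')  -> False  (the event is already over)
    #   date_isValid('2015.12.29 ~ 2016.01.03')   -> False  (both dates are in the past)
    #   date_isValid('2017.06.29 ~ 2017.07.03')   -> True   (the end date is still ahead)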

    def run(self):
        print ''
        print 'Keyword: %s ' % self.keyword
        self.driver.get(self.webSearchUrl)
        time.sleep(5)
        # total number of records
        pageCount_elements = self.driver.find_elements_by_xpath(self.pageCountLable)
        if len(pageCount_elements) > 0:
            strCount = pageCount_elements[0].text.encode('utf8')
            pageCount = int(strCount) / 10
            if int(strCount) % 10 > 0:
                pageCount = pageCount + 1
            page_Count = pageCount
            pageIndex = 0
            kword = self.keyword
            recordCount = 0
            while pageCount > 0:
                pageCount = pageCount - 1
                if pageIndex > 0:
                    # the url never changes, so turn the page by clicking the "next page" link
                    next_element = self.driver.find_elements_by_xpath(self.nextUrlLabel)
                    if len(next_element) > 0:
                        next_element[0].click()
                    time.sleep(3)
                self.wait.until(lambda driver: self.driver.find_elements_by_xpath(self.htmlLable))
                Elements = self.driver.find_elements_by_xpath(self.htmlLable)
                # collect the original url of every record on this page
                urlList = []
                self.wait.until(lambda driver: self.driver.find_elements_by_xpath(self.originalUrlLabel))
                hrefElements = self.driver.find_elements_by_xpath(self.originalUrlLabel)
                for hrefe in hrefElements:
                    urlList.append(hrefe.get_attribute('href').encode('utf8'))
                index = 0
                strMessage = ' '
                strsplit = '\n------------------------------------------------------------------------------------\n'
                # number of useful records on this page
                usefulCount = 0
                meetingList = []
                for element in Elements:
                    txt = element.text.encode('utf8')
                    txts = txt.split('\n')
                    # a record qualifies only if its date is not in the past and the keyword appears in its title
                    if self.date_isValid(txts[1]) and txts[0].find(kword) > -1:
                        dictM = {'title': txts[0], 'date': txts[1],
                                 'url': urlList[index], 'keyword': kword, 'info': txt}
                        meetingList.append(dictM)
                        # print txt
                        # print 'Event link: ' + urlList[index]
                        # print strsplit
                        # strMessage = txt + "\n"
                        # strMessage += 'Event link: ' + urlList[index] + "\n"
                        # strMessage += strsplit
                        # strMessage = unicode(strMessage, 'utf8')
                        # log.WriteLog(strMessage)
                        usefulCount = usefulCount + 1
                        recordCount = recordCount + 1
                    index = index + 1
                pageIndex = pageIndex + 1
                if usefulCount == 0:
                    break
                else:
                    self.db.SaveMeetings(meetingList)  # save this page's records to the database
            print "Visited %d page(s) of results" % page_Count
            print "Scraped %d qualifying event record(s)" % recordCount
        self.driver.close()
        self.driver.quit()


if __name__ == '__main__':
    configfile = os.path.join(os.getcwd(), 'activity.conf')
    cf = IniFile.ConfigFile(configfile)
    webSearchUrl = cf.GetValue("section", "webSearchUrl")
    pageCountLable = cf.GetValue("section", "pageCountLable")
    htmlLable = cf.GetValue("section", "htmlLable")
    originalUrlLabel = cf.GetValue("section", "originalUrlLabel")
    nextUrlLabel = cf.GetValue("section", "nextUrlLabel")
    keywords = cf.GetValue("section", "keywords")
    keywordlist = keywords.split(';')
    start = time.clock()
    db = mongoDbBase.mongoDbBase()

    # start one scraping thread per keyword, then wait for all of them,
    # so that the keywords really are processed in parallel
    threads = []
    for keyword in keywordlist:
        if len(keyword) > 0:
            url = webSearchUrl + urllib.quote(keyword)
            t = ScrapyData_Thread(url, pageCountLable, htmlLable, originalUrlLabel, nextUrlLabel, keyword, db)
            t.setDaemon(True)
            t.start()
            threads.append(t)
    for t in threads:
        t.join()

    end = time.clock()
    print "Total time: %f seconds" % (end - start)
Configuration file contents:
[section]
# path of the IE driver (only needed when using the IE webdriver)
iedriverserver = C:\Program Files\Internet Explorer\IEDriverServer.exe
# xpath of the record elements to scrape; separate multiple labels with semicolons
htmlLable = //div[@id ='eventList']/div[@class ='list']
# xpath of the element holding the total record count
pageCountLable = //span[@id='eventNumber']
# search url of the site; the keyword is appended to it
webSearchUrl = http://www.huodongshu.com/html/find_search.html?search_keyword=
# xpath of the link to each record's original url
originalUrlLabel = //div[@class='listR']/h2/a
# xpath of the "next page" link
nextUrlLabel = //dt[@class='next']/a
# keywords to search for, separated by semicolons
keywords = 互聯網電視;智能電視;數字;影音;家庭娛樂;節目;視聽;版權;數據
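
The IniFile module is likewise a small custom wrapper that is not shown in the post. Assuming it only wraps the standard library, an equivalent ConfigFile class could be sketched with Python 2's ConfigParser:

# Hypothetical equivalent of IniFile.ConfigFile (Python 2 ConfigParser).
import ConfigParser

class ConfigFile(object):
    def __init__(self, path):
        self.cp = ConfigParser.ConfigParser()
        self.cp.read(path)

    def GetValue(self, section, key):
        # option names are case-insensitive in ConfigParser
        return self.cp.get(section, key)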