本例子用 Selenium + PhantomJS 爬取節目(http://tv.cctv.com/epg/index.shtml?date=2018-03-25)的信息。
針對上面的網站信息,來進行抓取。
一、首先抓取信息列表
抓取代碼:Elements = doc('div[class="epglist"]').find('ul')
二、節目名稱、鏈接、時間
title = subEle('div[class="innerbox"]').find('h3').text().encode('utf8')
link = subEle('div[class="innerbox"]').find('p').find('a').attr('href')
strTime = subEle('div[class="innerbox"]').find('p').text().encode('utf8')
# coding=utf-8
"""Scrape the CCTV programme guide (tv.cctv.com/epg) into per-day text files."""
import io
import os
import re
import time
from datetime import datetime, timedelta

try:
    import selenium.webdriver.support.ui as ui
    from pyquery import PyQuery as pq
    from selenium import webdriver
except ImportError as exc:
    # Keep the pure date helpers importable when the scraping stack is not
    # installed; cctvDriver.__init__ re-raises this exact error on first use.
    _IMPORT_ERROR = exc
    pq = ui = webdriver = None
else:
    _IMPORT_ERROR = None

EPG_URL = 'http://tv.cctv.com/epg/index.shtml?date='
DATE_FORMAT = '%Y-%m-%d'
# Raw string: the backslashes are path separators, not escape sequences.
IE_DRIVER_PATH = r'C:\Program Files\Internet Explorer\IEDriverServer.exe'


class cctvDriver(object):
    """Load one guide page per day in Internet Explorer and log its listings.

    Each day in the requested range is opened through Selenium, the rendered
    HTML is parsed with PyQuery, and the records are appended to
    ``cctvInfo/<date>.txt`` under the current working directory.
    """

    def __init__(self, startDate, endDate, driverPath=IE_DRIVER_PATH):
        """Build the URL list and start the browser.

        Args:
            startDate: First day to scrape, formatted ``YYYY-MM-DD``.
            endDate: Last day to scrape (inclusive), formatted ``YYYY-MM-DD``.
            driverPath: Location of ``IEDriverServer.exe``.

        Raises:
            ImportError: If selenium or pyquery could not be imported.
            ValueError: If either date does not match ``YYYY-MM-DD``.
        """
        if _IMPORT_ERROR is not None:
            raise _IMPORT_ERROR
        self.urls = self.getUrlsFromStartEndDate(startDate, endDate)
        self.driver = webdriver.Ie(driverPath)
        self.driver.maximize_window()
        self.fileName = time.strftime(DATE_FORMAT)

    def compareDate(self, startDate, endDate):
        """Return True when ``endDate`` is on or after ``startDate``.

        Prints ``endDate`` when the order is valid and ``startDate`` otherwise.
        Both arguments are ``YYYY-MM-DD`` strings.
        """
        # datetime (not time.struct_time) is required here: struct_time
        # values cannot be subtracted or ordered as points in time.
        begin = datetime.strptime(startDate, DATE_FORMAT)
        finish = datetime.strptime(endDate, DATE_FORMAT)
        if finish >= begin:
            print(endDate)
            return True
        print(startDate)
        return False

    def compareTime(self, startTime, endTime):
        """Return True when clock string ``startTime`` is later than ``endTime``.

        The colons are dropped and the remaining digits compared as integers,
        so ``'23:30'`` vs ``'00:30'`` compares 2330 with 30.
        """
        return int(startTime.replace(':', "")) > int(endTime.replace(':', ""))

    def getUrlsFromStartEndDate(self, startDate, endDate):
        """Return one guide URL per day from ``startDate`` to ``endDate`` inclusive.

        Returns an empty list when ``endDate`` precedes ``startDate``.
        """
        first = datetime.strptime(startDate, DATE_FORMAT)
        last = datetime.strptime(endDate, DATE_FORMAT)
        dayCount = (last - first).days + 1
        return [
            EPG_URL + (first + timedelta(days=offset)).strftime(DATE_FORMAT)
            for offset in range(dayCount)
        ]

    def WriteLog(self, message, date):
        """Append ``message`` to ``cctvInfo/<date>.txt`` in the working directory.

        The folder is created when missing and the file is written as UTF-8.
        """
        folder = os.path.join(os.getcwd(), 'cctvInfo')
        if not os.path.isdir(folder):
            os.makedirs(folder)
        if isinstance(message, bytes):
            # Callers that still hand over UTF-8 encoded bytes keep working.
            message = message.decode('utf-8')
        with io.open(os.path.join(folder, date + '.txt'), 'a',
                     encoding='utf-8') as f:
            f.write(message)

    def CatchData(self):
        """Scrape every URL in ``self.urls`` and write one log file per day.

        Prints the number of records found for each day. The browser is shut
        down when the run ends, including when a page raises.
        """
        try:
            for url in self.urls:
                date = url.split('=')[1]
                dayBefore = datetime.strptime(date, DATE_FORMAT) - timedelta(days=1)
                predate = dayBefore.strftime(DATE_FORMAT)
                self.driver.get(url)
                # Fixed pause before reading the DOM; presumably it gives the
                # page's scripts time to fill in the listing.
                time.sleep(5)
                html = self.driver.execute_script(
                    "return document.documentElement.outerHTML")
                rows = self._parseListing(pq(html), date, predate)
                message = u''.join(rows)
                if len(message) > 10:
                    self.WriteLog(message.strip(), date)
                print(len(rows))
        finally:
            # quit() closes every window and ends the driver session.
            self.driver.quit()

    def _parseListing(self, doc, date, predate):
        """Turn a parsed guide page into a list of comma-separated records."""
        rows = []
        for channelList in doc('div[class="epglist"]').find('ul').items():
            channel = channelList.attr('id')
            for item in channelList.find('li').items():
                box = item('div[class="innerbox"]')
                # Drop the "replay" / "live" labels that share the <p> with
                # the time range.
                when = box.find('p').text().strip().replace(
                    u'回看', u'').replace(u'直播', u'')
                parts = when.split(u'~')
                if len(parts) < 2:
                    # Empty entry or no "start~end" range: nothing to record.
                    continue
                startClock = parts[0].strip()
                endClock = parts[1].strip()
                # Swap ASCII commas for full-width ones so a title cannot
                # split the comma-separated record.
                title = box.find('h3').text().strip().replace(u',', u'，')
                link = box.find('p').find('a').attr('href')
                # A start later than the end means the programme began on the
                # previous day and runs past midnight.
                startDay = predate if self.compareTime(startClock, endClock) else date
                rows.append(u'\r\n{0},{1},{2},{3},{4}'.format(
                    channel, title, startDay + " " + startClock,
                    date + " " + endClock, link))
        return rows


if __name__ == '__main__':
    # Scrape the guide for 2018-01-01 through 2018-03-01.
    obj = cctvDriver('2018-01-01', '2018-03-01')
    obj.CatchData()