本例子用 Selenium + PhantomJS 爬取欄目(http://tv.cctv.com/lm/)的信息
首先抓取全部要抓取的網頁連接,共39頁,保存到數據庫裏面
def getUrls(self):
    """Build the listing-page entries and hand them to the database layer.

    The first entry is the landing page URL; entries for pages 2..39 are
    ``javascript:`` snippets with the page number substituted twice.  The
    resulting 39-item list is passed to ``self.db.SaveCCTVColumnUrls``
    together with the flag ``'0'``.  Nothing is returned.
    """
    # Adjacent literals are concatenated into the exact original template.
    paging_script = ("javascript:window.scroll(0,145);DataInteraction({0});"
                     "showPageTitle_fenyei2('ELMT1413526954890942',{0});")
    urls = ['http://tv.cctv.com/lm/']
    urls.extend(paging_script.format(page) for page in range(2, 40))
    self.db.SaveCCTVColumnUrls(urls, '0')
針對上面的網站信息,來進行抓取
一、首先抓取信息列表
抓取代碼:Elements = doc("div[id='text_box_0']").find('dl').find('dd')
二、欄目名稱,連接
column1Element = element.find('div[class="text"]').find('h3').find('a')
columnName = column1Element.text().encode('utf8').replace(',', ',').replace('\n', '')
columnUrl = column1Element.attr('href')
# coding=utf-8
import os
import re
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from datetime import datetime,timedelta
import selenium.webdriver.support.ui as ui
import time
from pyquery import PyQuery as pq
import columnData
import mongoDB

# NOTE(review): this listing was recovered from a whitespace-collapsed copy, so
# block nesting below is reconstructed from the syntax.  Places where more than
# one nesting is plausible are marked with NOTE(review) -- confirm them against
# the original file.  The code is Python 2 (print statements, `except X, e`).


class cctvColumnInfo:
    """Scraper for the column listing at http://tv.cctv.com/lm/.

    Pages are loaded through a PhantomJS WebDriver, parsed with pyquery, and
    the results are stored through ``mongoDB.mongoDbBase`` and appended to
    text files under ``cctvColumn/`` in the current working directory.
    """

    def __init__(self):
        """Start the PhantomJS driver and open the database wrapper."""
        # Get the IEDriverServer.exe path from a config file (disabled IE variant):
        # self.urls = self.getUrls()
        # IEDriverServer ='C:\Program Files\Internet Explorer\IEDriverServer.exe'
        # self.driver = webdriver.Ie(IEDriverServer)
        # self.driver.maximize_window()
        # Image loading is switched off through the PhantomJS service argument.
        self.driver = webdriver.PhantomJS(service_args=['--load-images=false'])#service_args=['--load-images=false']
        self.driver.set_page_load_timeout(10)
        self.driver.maximize_window()
        self.db = mongoDB.mongoDbBase()

    def WriteUrl(self,url):
        """Append ``url``, preceded by a newline, to cctvColumn/cctvColumn_url.txt.

        The path is relative to the current working directory.  The file is
        opened in append mode; the ``cctvColumn`` directory is not created here.
        """
        fileName = os.path.join(os.getcwd(), 'cctvColumn/cctvColumn_url.txt')
        with open(fileName, 'a') as f:
            f.write('\n'+url)

    def getUrls(self):
        """Build the 39 listing-page entries and save them with flag '0'.

        Entry 0 is the landing page URL; entries for pages 2..39 are
        ``javascript:`` snippets that CatchData later runs in the browser.
        """
        urls = []
        urls.append('http://tv.cctv.com/lm/')
        for index in range(2,40):
            urls.append("javascript:window.scroll(0,145);DataInteraction({0});showPageTitle_fenyei2('ELMT1413526954890942',{0});".format(index))
        self.db.SaveCCTVColumnUrls(urls,'0')
        # return urls

    def WriteLog(self, message,date):
        """Append ``message`` to cctvColumn/cctvColumn-<date>.txt.

        The path is relative to the current working directory and ``date`` is
        inserted into the file name as given.
        """
        fileName = os.path.join(os.getcwd(), 'cctvColumn/cctvColumn-'+date + '.txt')
        with open(fileName, 'a') as f:
            f.write(message)

    def getColumnInfo(self, colInfo):
        """Split one info string at the markers '主持人' and '播出頻道'.

        Returns a 3-tuple: the text before '主持人', then '主持人' plus the text
        up to '播出頻道', then '播出頻道' plus the text after it.

        Raises IndexError when either marker is missing, because the second
        element of the corresponding split is indexed unconditionally.
        """
        ts = colInfo.split('主持人')
        firstBroadcastTime = ts[0]
        ts1 = ts[1].split('播出頻道')
        columnHost = '主持人' + ts1[0]
        broadcastChannel = '播出頻道' + ts1[1]
        return firstBroadcastTime, columnHost, broadcastChannel

    def CatchData(self):
        """Walk the stored listing pages and save every column found on them.

        For each entry from ``self.db.GetCCTVColumnUrls()`` the landing page is
        loaded with ``driver.get`` and every other entry is executed as a script
        in the already open page.  Each ``dd`` under ``div#text_box_0 dl`` is
        turned into a ``columnData`` object, saved, and also written as one
        comma-separated line to the dated log file.  Only TimeoutException is
        caught per page; the driver is closed and quit at the end.
        """
        # urlIndex is incremented below but never read.
        urlIndex = 0
        urls = self.db.GetCCTVColumnUrls()
        itemIndex = 0
        for u in urls:
            url = u['url']
            try:
                if url == 'http://tv.cctv.com/lm/':
                    self.driver.get(url)
                else:
                    # Paging entries are javascript: snippets run in the current page.
                    self.driver.execute_script(url)
                urlIndex += 1
                # Fixed pause before reading the DOM.
                time.sleep(2)
                selenium_html = self.driver.execute_script("return document.documentElement.outerHTML")
                doc = pq(selenium_html)
                # Elements = doc("div[@id='text_box_0']/dl/dd")
                Elements = doc("div[id='text_box_0']").find('dl').find('dd')
                message = ''
                # for element in Elements:
                column_name = url.encode('utf8')
                print url
                for element in Elements.items():
                    colobj = columnData.columnData()
                    itemIndex+=1
                    firstBroadcastTime = ''
                    # ReplayBroadcastTime is never assigned anything else in this method.
                    ReplayBroadcastTime = ''
                    firstBroadcastChannel = ''
                    # column1Element = element.find('div[@class="text"]/h3/a')
                    # column1Element = element.find_element_by_xpath("//div[@class='ui-page-next']")
                    column1Element = element.find('div[class="text"]').find('h3').find('a')
                    # NOTE(review): as written, replace(',', ',') is a no-op; presumably
                    # one argument was a full-width comma before this text was
                    # normalised -- confirm against the original file.
                    columnName = column1Element.text().encode('utf8').replace(',', ',').replace('\n', '')
                    columnUrl = column1Element.attr('href')
                    colobj.setColumnName(columnName)
                    colobj.setColumnUrl(columnUrl)
                    column_name += '\n' + columnName
                    # time.sleep(3)
                    print columnName
                    # column2Element = element.find('div[@class="text"]/p/a')
                    column2Element = element.find('div[class="text"]').find('p').find('a')
                    columnTimeName = column2Element.text().encode('utf8').replace(',', ',').replace('\n', '')
                    columnTimeUrl = column2Element.attr('href')
                    colobj.setColumnTimeName(columnTimeName)
                    colobj.setColumnTimeUrl(columnTimeUrl)
                    # print columnTimeName + '; ' + columnTimeUrl
                    # column34Elements = element.find('div[@class="text"]/span/a')
                    column34Elements = element.find('div[class="text"]').find('span').find('a')
                    # for column34Element in column34Elements:
                    column34Index = 0
                    pastVideoUrl = ''
                    officialWebsiteUrl = ''
                    # The first link is stored as the past-videos URL; every later
                    # link overwrites the official-site URL.
                    for column34Element in column34Elements.items():
                        if column34Index == 0:
                            pastVideoUrl = column34Element.attr('href')
                            colobj.setPastVideoUrl(pastVideoUrl)
                        else:
                            officialWebsiteUrl = column34Element.attr('href')
                            colobj.setOfficialWebsiteUrl(officialWebsiteUrl)
                        column34Index += 1
                    # columnImageElement = element.find('div[@class="img"]/a/img')
                    columnImageElement = element.find('div[class="img"]').find('a').find('img')
                    colImgUrl = columnImageElement.attr('src')
                    # Fall back to the alternative container class "image".
                    if colImgUrl == None:
                        columnImageElement = element.find('div[class="image"]').find('a').find('img')
                        colImgUrl = columnImageElement.attr('src')
                    # print colImgUrl
                    colobj.setColImgUrl(colImgUrl)
                    # first-broadcast time
                    firstBroadcastTime1 = ''
                    # host
                    columnHost = ''
                    # broadcast channel
                    firstBroadcastChannel1 =''
                    # columnInfos = element.find('div[@class="lr"]/div')
                    columnInfos = element.find('div[class="lr"]').find('div')
                    if columnInfos:
                        for colInfo in columnInfos.items():
                            # getColumnInfo raises IndexError when a marker is missing;
                            # that is not caught by the except clause below.
                            firstBroadcastTime1, columnHost, firstBroadcastChannel1 = self.getColumnInfo(
                                colInfo.text().encode('utf8').replace(',', ',').replace('\n', ''))
                            columnHost = columnHost.replace(',', ',')
                            # NOTE(review): the nesting of the two `if not` checks
                            # (inside vs. after this loop) is reconstructed -- confirm.
                            if not firstBroadcastTime:
                                firstBroadcastTime = firstBroadcastTime1
                            if not firstBroadcastChannel:
                                firstBroadcastChannel = firstBroadcastChannel1
                    colobj.setColumnHost(columnHost)
                    # The stored object gets the last parsed values (the *1 variables),
                    # while the log line below uses the first non-empty ones.
                    colobj.setFirstBroadcastChannel(firstBroadcastChannel1)
                    colobj.setFirstBroadcastTime(firstBroadcastTime1)
                    # Fields: column name, first-broadcast time, replay time, channel,
                    # host, column URL, column name with time, its URL, past-videos
                    # URL, official-site URL, image URL.
                    mess = '\n{0},{1},{2},{3},{4},{5},{6},{7},{8},{9},{10}'.format(columnName, firstBroadcastTime,
                                                                                   ReplayBroadcastTime,
                                                                                   firstBroadcastChannel,
                                                                                   columnHost, columnUrl,
                                                                                   columnTimeName, columnTimeUrl,
                                                                                   pastVideoUrl, officialWebsiteUrl,
                                                                                   colImgUrl)
                    # print mess
                    message += mess
                    self.db.SaveCCTVColumnData(colobj,itemIndex)
                    # Each column's own URL is queued with flag '1'.
                    self.db.SaveCCTVColumnUrl(columnUrl, '1', columnName)
                # NOTE(review): the per-page bookkeeping below is assumed to run once
                # per page, after the element loop -- confirm.
                date = time.strftime('%Y-%m-%d')
                self.WriteLog(message, date)
                self.WriteUrl(column_name)
                self.db.SetCCTVColumnUrlCrawlState(url)
            except TimeoutException,e:
                print 'timeout url: '+url
        # NOTE(review): assumed to run once after all pages; the driver is unusable
        # on this instance afterwards.
        self.driver.close()
        self.driver.quit()

    def getBroadCast(self):
        """Visit each queued column page and update its broadcast details.

        For every entry from ``self.db.GetSubCCTVColumnUrls()`` the page is
        loaded and the first three ``p.p_1`` elements are read as first-broadcast
        time, replay time and channel.  When no such element exists, the text of
        the ``div.head_msg`` table rows is concatenated and parsed with
        ``getBroadInfo``.  The URL is then marked as crawled and, if a channel
        was found, the stored column is updated.  Only TimeoutException is caught.
        """
        urls = self.db.GetSubCCTVColumnUrls()
        for u in urls:
            firstBroadcastTime = ''
            ReplayBroadcastTime = ''
            firstBroadcastChannel = ''
            messsage = ''
            url = u['url']
            # url='http://tv.cctv.com/lm/xqds'
            # url='http://tv.cctv.com/lm/24xiaoshi/'
            columnName = u['columnName']
            # u'http://tv.cctv.com/lm/kanjian'
            try:
                self.driver.get(url)
                time.sleep(2)
                selenium_html = self.driver.execute_script("return document.documentElement.outerHTML")
                doc = pq(selenium_html)
                Elements = doc("p[class='p_1']")
                index = 0
                for element in Elements.items():
                    if index == 0:
                        firstBroadcastTime = element.text().encode('utf8').replace(',', ',').replace('\n', '')
                    elif index == 1:
                        ReplayBroadcastTime = element.text().encode('utf8').replace(',', ',').replace('\n', '')
                    elif index == 2:
                        firstBroadcastChannel = element.text().encode('utf8').replace(',', ',').replace('\n', '')
                        break
                    index += 1
                # index is still 0 only when the loop above saw no element at all.
                if index == 0:
                    Elements = doc("div[class='head_msg']").find('table').find('tbody').find('tr')
                    for element in Elements.items():
                        messsage+=element.text().encode('utf8').replace(',', ',').replace('\n', '')
                    if messsage:
                        firstBroadcastTime, ReplayBroadcastTime, firstBroadcastChannel= self.getBroadInfo(columnName.encode('utf8'),messsage)
                self.db.SetCCTVColumnUrlCrawlState(url)
                if firstBroadcastChannel:
                    colobj = columnData.columnData()
                    colobj.setColumnName(columnName)
                    colobj.setFirstBroadcastTime(firstBroadcastTime)
                    colobj.setFirstBroadcastChannel(firstBroadcastChannel)
                    colobj.setReplayBroadcastTime(ReplayBroadcastTime)
                    self.db.UpdateCCTVColumnData(colobj)
                # NOTE(review): the prints are assumed to run for every page, not only
                # when a channel was found -- confirm.
                print '\n'
                print url
                print columnName
                print firstBroadcastTime
                print firstBroadcastChannel
                print ReplayBroadcastTime
            except TimeoutException, e:
                print 'TimeoutException:'+url

    def getBroadInfo(self,columnName,column):
        """Split a concatenated header string into time / replay / channel parts.

        ``column`` is first cleaned: '欄目大全' is removed, everything from '>>'
        or from '官方微信' onward is cut off, and 'CCTV13', 'CCTV6', 'CCTV1' get
        a hyphen after 'CCTV'.  The remainder is split on the literal markers
        '首播時間', '重播時間', '獨播頻道', '播出頻道' and '首播頻道'.
        ``columnName`` is only consulted for the '(生活版)' / '(文史版)' case.

        Returns ``(firstBroadcastTime, ReplayBroadcastTime,
        firstBroadcastChannel)``; parts that no branch fills stay ''.
        """
        # column ='首播頻道: CCTV-14首播時間: 週三17:15'
        firstBroadcastTime = ''
        ReplayBroadcastTime = ''
        firstBroadcastChannel = ''
        column=column.replace('欄目大全','')
        if '>>' in column:
            index = column.index('>>')
            column = column[0:index]
        if 'CCTV13' in column:
            column = column.replace('CCTV13', 'CCTV-13')
        if 'CCTV6' in column:
            column = column.replace('CCTV6', 'CCTV-6')
        if 'CCTV1' in column:
            column = column.replace('CCTV1','CCTV-1')
        if '官方微信' in column:
            index = column.index('官方微信')
            column = column[0:index]
        # if '停播' in column or '關閉' in column:
        #     return firstBroadcastTime, ReplayBroadcastTime, firstBroadcastChannel
        # elif '>>' in column:
        #     index = column.index('>>')
        #     column = column[0:index]
        if '首播時間' in column:
            if '重播時間' in column:
                cols = column.split('重播時間')
                firstBroadcastTime = cols[0]
                if '獨播頻道' in cols[1]:
                    ReplayBroadcastTime = '重播時間' + cols[1].split('獨播頻道')[0]
                    firstBroadcastChannel = '獨播頻道' + cols[1].split('獨播頻道')[1]
                elif '首播頻道' in cols[1]:
                    ReplayBroadcastTime = '重播時間' + cols[1].split('首播頻道')[0]
                    firstBroadcastChannel = '首播頻道' + cols[1].split('首播頻道')[1]
                elif '播出頻道' in cols[1]:
                    ReplayBroadcastTime = '重播時間' + cols[1].split('播出頻道')[0]
                    firstBroadcastChannel = '播出頻道' + cols[1].split('播出頻道')[1]
            elif '獨播頻道' in column:
                cols = column.split('獨播頻道')
                firstBroadcastTime = cols[0]
                firstBroadcastChannel = '獨播頻道' + cols[1]
            elif '播出頻道' in column:
                cols = column.split('播出頻道')
                firstBroadcastTime = cols[0]
                firstBroadcastChannel = '播出頻道' + cols[1]
            elif '首播頻道' in column:
                cols = column.split('首播頻道')
                index = column.index('首播頻道')
                # When the channel marker comes first, the string is split on the
                # time marker instead.
                if index==0:
                    cols = column.split('首播時間')
                    firstBroadcastChannel = cols[0]
                    firstBroadcastTime = '首播時間' + cols[1]
                else:
                    firstBroadcastTime = cols[0]
                    firstBroadcastChannel = '首播頻道' + cols[1]
        else:
            # NOTE(review): the two trailing `elif` branches of this `else` are
            # assumed to pair with the '首播(' test rather than with the inner
            # '獨播頻道' test -- confirm against the original file.
            if '首播(' in column and '重播(' in column:
                if '獨播頻道' in column:
                    cols = column.split('獨播頻道')
                    firstBroadcastChannel = '獨播頻道' + cols[1]
                    firstBroadcastTime = cols[0]
                    # '首播(生活): 一-六18:52 日18:42重播(生活): 一-五 日16:08首播(文史): 一-五22:43六日22:33/30重播(文史): 二-五06:46六日06:24'
                    if '(生活版)' in columnName:
                        if '首播(文史)' in firstBroadcastTime:
                            temp = firstBroadcastTime.split('首播(文史)')[0]
                            if '重播(生活)' in temp:
                                firstBroadcastTime = '首播時間: '+temp.split('重播(生活)')[0].replace('首播(生活): ','')
                                ReplayBroadcastTime = '重播時間: '+temp.split('重播(生活)')[1].replace(': ','')
                    # 首播(文史): 一-五22:43六日22:33/30重播(文史): 二-五06:46六日06:24首播(生活): 一-六18:52 日18:42重播(生活): 一-五 日16:08
                    elif '(文史版)' in columnName:
                        if '首播(生活)' in firstBroadcastTime:
                            temp = firstBroadcastTime.split('首播(生活)')[0]
                            if '重播(文史)' in temp:
                                firstBroadcastTime = '首播時間: '+temp.split('重播(文史)')[0].replace('首播(文史): ','')
                                ReplayBroadcastTime = '重播時間: '+ temp.split('重播(文史)')[1].replace(': ','')
            elif '播出頻道' in column:
                cols = column.split('播出頻道')
                firstBroadcastTime = cols[0]
                firstBroadcastChannel = '播出頻道' + cols[1]
            elif '首播頻道' in column:
                cols = column.split('首播頻道')
                firstBroadcastTime = cols[0]
                firstBroadcastChannel = '首播頻道' + cols[1]
        return firstBroadcastTime,ReplayBroadcastTime,firstBroadcastChannel

    def exportColumnInfo(self):
        """Write every stored column as a comma-separated line to the dated log.

        Rows come from ``self.db.GetCCTVColumnData()``.  Label prefixes are
        stripped from the time and channel fields and several channel spellings
        are normalised before the line is appended through ``WriteLog``.
        """
        columns = self.db.GetCCTVColumnData()
        for col in columns:
            columnName = col['columnName'].encode('utf8')
            firstBroadcastTime = col['firstBroadcastTime'].encode('utf8')
            firstBroadcastTime=firstBroadcastTime.replace('首播時間: ','')
            firstBroadcastChannel = col['firstBroadcastChannel'].encode('utf8').replace("播出頻道:", "").replace("獨播頻道:", "").replace("首播頻道:", "")
            firstBroadcastChannel =firstBroadcastChannel.replace(")","").replace("(","").replace("CCTV-8電視劇","CCTV-8 電視劇")
            firstBroadcastChannel = firstBroadcastChannel.replace("CCTV-1綜合頻道", "CCTV-1 綜合頻道")
            firstBroadcastChannel = firstBroadcastChannel.replace("CCTV-1高清頻道", "CCTV-1 高清頻道")
            firstBroadcastChannel = firstBroadcastChannel.replace("CCTV13", "CCTV-13")
            firstBroadcastChannel = firstBroadcastChannel.replace("CCTV1", "CCTV-1")
            firstBroadcastChannel = firstBroadcastChannel.replace("CCTV-少兒", "CCTV-14 少兒")
            firstBroadcastChannel = firstBroadcastChannel.replace("CCTV6", "CCTV-6")
            firstBroadcastChannel = firstBroadcastChannel.replace("CCTV-12社會與法", "CCTV-12 社會與法")
            replayBroadcastTime = col['replayBroadcastTime'].encode('utf8')
            replayBroadcastTime = replayBroadcastTime.replace('重播時間:', '')
            columnHost = col['columnHost'].encode('utf8')
            columnUrl = col['columnUrl'].encode('utf8')
            columnTimeName = col['columnTimeName'].encode('utf8')
            columnTimeUrl = col['columnTimeUrl']
            # columnTimeUrl is the only field guarded against a falsy value before
            # encoding; the other fields are encoded unconditionally.
            if columnTimeUrl:
                columnTimeUrl = columnTimeUrl.encode('utf8')
            officialWebsiteUrl = col['officialWebsiteUrl'].encode('utf8')
            pastVideoUrl = col['pastVideoUrl'].encode('utf8')
            colImgUrl = col['colImgUrl'].encode('utf8')
            # Fields: column name, first-broadcast time, replay time, channel, host,
            # column URL, column name with time, its URL, past-videos URL,
            # official-site URL, image URL.
            message = '\n{0},{1},{2},{3},{4},{5},{6},{7},{8},{9},{10}'.format(columnName, firstBroadcastTime,
                                                                              replayBroadcastTime,
                                                                              firstBroadcastChannel,
                                                                              columnHost, columnUrl,
                                                                              columnTimeName, columnTimeUrl,
                                                                              pastVideoUrl, officialWebsiteUrl,
                                                                              colImgUrl)
            # NOTE(review): assumed to run once per row (inside the loop) -- confirm.
            date = time.strftime('%Y-%m-%d')
            self.WriteLog(message, date)


# Script entry: runs at import time.  The earlier stages are left disabled and
# only the export step is executed.
obj = cctvColumnInfo()
# obj.getUrls()
# obj.CatchData()
# obj.getBroadCast()
obj.exportColumnInfo()
# coding=utf-8
import os
from pymongo import MongoClient
from pymongo import ASCENDING, DESCENDING
import codecs
import time
import columnData
import datetime
import re


class mongoDbBase:
    """Small wrapper around the ``column_data`` and ``columnurls`` collections."""

    def __init__(self, connstr='mongodb://ott:ott@127.0.0.1:27017/', mongodbName='OTT'):
        """Connect to the local server and authenticate on the OTT database.

        Both parameters are accepted but not used: host, port, database name
        and credentials are hard-coded below.
        """
        connection = MongoClient('127.0.0.1', 27017)
        self.db = connection.OTT
        self.db.authenticate('ott', 'ott')

    def SaveCCTVColumnData(self, columnData, index):
        """Insert a column record unless one with the same name already exists."""
        existing = self.db.column_data.find({'columnName': columnData.getColumnName()}).count()
        if existing != 0:
            return
        # replayBroadcastTime is always stored empty on insert.
        record = {
            'columnName': columnData.getColumnName(),
            'firstBroadcastTime': columnData.getFirstBroadcastTime(),
            'replayBroadcastTime': '',
            'firstBroadcastChannel': columnData.getFirstBroadcastChannel(),
            'columnHost': columnData.getColumnHost(),
            'columnUrl': columnData.getColumnUrl(),
            'columnTimeName': columnData.getColumnTimeName(),
            'columnTimeUrl': columnData.getColumnTimeUrl(),
            'officialWebsiteUrl': columnData.getOfficialWebsiteUrl(),
            'pastVideoUrl': columnData.getPastVideoUrl(),
            'colImgUrl': columnData.getColImgUrl(),
            'index': index,
        }
        self.db.column_data.insert(record)

    def GetCCTVColumnData(self):
        """Return a cursor over all column records, without the ``_id`` field."""
        return self.db.column_data.find({}, {'_id': 0})

    def UpdateCCTVColumnData(self, columnData):
        """Set the three broadcast fields on the record matching the column name."""
        changes = {
            'replayBroadcastTime': columnData.getReplayBroadcastTime(),
            'firstBroadcastTime': columnData.getFirstBroadcastTime(),
            'firstBroadcastChannel': columnData.getFirstBroadcastChannel(),
        }
        selector = {"columnName": columnData.getColumnName()}
        self.db.column_data.update(selector, {'$set': changes})

    def SaveCCTVColumnUrl(self, url, suburl, columnName):
        """Queue one URL (not yet crawled) unless that URL is already stored."""
        if self.db.columnurls.find({'url': url}).count() == 0:
            self.db.columnurls.insert(
                {'url': url, 'iscrawl': '0', 'suburl': suburl, 'columnName': columnName})

    def SaveCCTVColumnUrls(self, urlList, suburl):
        """Queue every URL of ``urlList`` that is not stored yet.

        NOTE(review): the stored ``index`` is taken to be the URL's position in
        ``urlList`` (counter advanced for every URL, stored or not); the source
        this was recovered from had lost its indentation -- confirm.
        """
        for position, url in enumerate(urlList):
            if self.db.columnurls.find({'url': url}).count() != 0:
                continue
            self.db.columnurls.insert(
                {'url': url, 'iscrawl': '0', 'suburl': suburl, 'index': position})

    def GetCCTVColumnUrls(self):
        """Return a cursor over uncrawled URLs saved with suburl '0' (url field only)."""
        return self.db.columnurls.find({'iscrawl': '0', 'suburl': '0'}, {'_id': 0, 'url': 1})

    def GetSubCCTVColumnUrls(self):
        """Return a cursor over uncrawled URLs saved with suburl '1' (url and columnName)."""
        return self.db.columnurls.find({'iscrawl': '0', 'suburl': '1'},
                                       {'_id': 0, 'url': 1, 'columnName': 1})

    def SetCCTVColumnUrlCrawlState(self, url):
        """Mark the stored entry for ``url`` as crawled (iscrawl = '1')."""
        self.db.columnurls.update({'url': url}, {'$set': {'iscrawl': '1'}})
def download(self, url, name):
    """Download ``url`` and write the response body to the file ``name``.

    This is a best-effort helper: when the request fails, a message is printed
    and nothing is written.  Failure now covers every ``requests`` error
    (connection errors, connect/read timeouts triggered by ``timeout=5``, and
    HTTP error statuses), so a read timeout no longer escapes the handler and
    an error page is no longer saved as if it were the image.

    Parameters:
        url:  address of the picture to fetch.
        name: path of the file to create; it is opened in binary write mode.

    Returns None.  Errors raised while opening or writing the file are not
    caught here.
    """
    # Example arguments:
    # url='http://pp.myapp.com/ma_icon/0/icon_10910_1523714409/96'
    # name='D:\work\python_crawl\down\2019.jpg'
    try:
        pic = requests.get(url, timeout=5)
        # Turn 4xx/5xx answers into an exception instead of saving their body.
        pic.raise_for_status()
    except requests.exceptions.RequestException:
        print('當前圖片沒法下載')
        return
    # Only reached with a successful response, so no file is created on failure.
    with open(name, 'wb') as f:
        f.write(pic.content)