一、獲取網站課程的分類地址
''' Crawl the diaosiweb homepage and collect every category name and link. '''
import requests
from lxml import etree

headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36',
}

def get_class_data():
    """Scrape the site's top menu and return a list of category dicts.

    Returns:
        list[dict]: one dict per category with keys '類別名稱' (name)
        and '類別連接' (link). Only directory-style links that end
        with '/' are kept; links pointing at concrete pages are skipped.
    """
    list_data = []
    url = 'http://www.diaosiweb.net/index.html'
    responese = requests.get(url, headers=headers)
    # Let requests guess the real encoding (page is not UTF-8 by default).
    responese.encoding = responese.apparent_encoding
    # Parse the document once; the original re-parsed it for each xpath.
    tree = etree.HTML(responese.text)
    class_names = tree.xpath('//div[@id="menu"]/div/ul/li/a/text()')
    class_links = tree.xpath('//div[@id="menu"]/div/ul/li/a/@href')
    for class_name, class_link in zip(class_names, class_links):
        # A link whose last path segment is empty ends with '/' → a category index.
        if not class_link.split('/')[-1]:
            list_data.append({
                '類別名稱': class_name,
                '類別連接': class_link,
            })
    return list_data
二、經過上面獲取的地址來獲取全部的每一個分類下的全部課程名稱、連接和發佈時間,並保存到 MongoDB 中去
'''
For every category url, collect the course names and links on each listing
page, then store them in MongoDB so the final stage can follow each course
link and extract its download url and password.
'''
from spiders_diaosi import get_class_data
import requests
from lxml import etree
import pymongo
from multiprocessing import Pool

headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36',
}

client = pymongo.MongoClient('localhost', 27017)
diaosi = client['kecheng_message']
kecheng_message = diaosi['message']

def get_kecheng_data(url):
    """Scrape one listing page: course name, link and publish date → MongoDB.

    Errors (network, parse, DB) are printed and swallowed so one bad page
    does not abort the whole crawl.
    """
    try:
        response = requests.get(url, headers=headers)
        response.encoding = response.apparent_encoding
        # Parse the HTML once; the original re-parsed it for all three xpaths.
        tree = etree.HTML(response.text)
        kecheng_names = tree.xpath('//ul[@class="g-list1"]/li/a/text()')
        kecheng_links = tree.xpath('//ul[@class="g-list1"]/li/a/@href')
        times = tree.xpath('//ul[@class="g-list1"]/li/span/text()')
        for kecheng_name, kecheng_link, time in zip(kecheng_names, kecheng_links, times):
            data = {
                '課程名稱': kecheng_name,
                '課程連接': kecheng_link,
                '發佈時間': time,
            }
            # Persist to MongoDB; the download-extraction stage reads from here.
            # insert() was removed in pymongo 4 — insert_one() is the replacement.
            kecheng_message.insert_one(data)
    except Exception as e:
        print(e)

def get_max_page(url):
    """Return the category's total page count, read from the pageinfo widget."""
    page_response = requests.get(url, headers=headers)
    page_num = int(etree.HTML(page_response.text).xpath('//span[@class="pageinfo"]/strong[1]/text()')[0])
    return page_num

def get_class_id(url):
    """Visit every listing page of one category and scrape it."""
    class_response = requests.get(url, headers=headers)
    class_response.encoding = class_response.apparent_encoding
    # Fetch the page count once — the original called get_max_page(url)
    # twice, issuing a redundant HTTP request per category.
    max_page = get_max_page(url)
    if max_page != 1:
        # The pagination links look like 'list_<id>_<page>.html'; grab the id.
        class_id = int(etree.HTML(class_response.text).xpath('//ul[@class="pagelist"]/li/a/@href')[-1].split('_')[1])
        for num in range(1, max_page + 1):
            new_url = '{}list_{}_{}.html'.format(url, class_id, num)
            get_kecheng_data(new_url)
    else:
        # Single-page category: the index url itself is the only listing page.
        get_kecheng_data(url)

# Read the category links scraped earlier and crawl each one's course listings.
for link in get_class_data():
    url = link['類別連接']
    print('開始爬取:' + link['類別名稱'])
    get_class_id(url)
    print('已經爬完了:' + link['類別名稱'])
三、從數據庫中讀取每一個課程的連接,由於下載地址只有登入以後才能夠看到,因此模擬登入以後,進行獲取,並保存到 MongoDB 中去。
'''
Read every course link from MongoDB, log in (the download address is only
visible to logged-in members), extract the download url and network-disk
password from each course page, and store them in MongoDB.
'''
from get_captcha import get_capthca
import pymongo
import re
import requests
from lxml import etree
import random

client = pymongo.MongoClient('localhost', 27017)
diaosi = client['kecheng_message']
kecheng_message = diaosi['message']
dow_message = diaosi['dow_message']

login_url = 'http://www.diaosiweb.net/member/index.php'

# A random desktop User-Agent per run, to look less bot-like.
headers_data = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0',
]
headers = {'User-Agent': random.choice(headers_data)}

# DedeCMS member-login form fields.
login_data = {
    'fmdo': 'login',
    'dopost': 'login',
    'gourl': '',
    'userid': '***',   # your username here (or read it with input())
    'pwd': '****',     # your password here (or read it with input())
    'vdcode': '',
    'keeptime': '604800',
}

# Save the captcha image locally, then ask the operator to type it in.
get_capthca(login_url)
captcha = input('輸入你看到的驗證碼:')
login_data['vdcode'] = captcha

session = requests.Session()
session.headers.update(headers)
# BUG FIX: the login form must be POSTed. The original used session.get(),
# which never submits the credentials, so the session stayed anonymous and
# the hidden download <div> was never populated.
login_response = session.post(login_url, data=login_data)

# Pre-compile the pattern that extracts the hidden download-link block.
pan_re = re.compile("<div id='pan' style=\"display:none;\">(.*?)</div>")

for link in kecheng_message.find():
    try:
        html = session.get(link['課程連接'])
        html.encoding = html.apparent_encoding
        found = pan_re.findall(html.text)
        if not found:
            # No download block on this page — report and skip instead of
            # crashing on findall(...)[0] as the original did.
            print(link['課程名稱'])
            continue
        dow_url = found[0]
        mima = etree.HTML(html.text).xpath('//span[@style]/text()')
        data = {
            'name': link['課程名稱'],
            'link': link['課程連接'],
            'dow_url': dow_url,
        }
        # NOTE: `and` binds tighter than `or`; the parentheses make the
        # original evaluation order explicit without changing it.
        if len(mima) == 0 or (len(mima) > 5 and '網盤提取密碼' not in mima[-1].split(':')):
            data['mima'] = '沒有密碼'
        else:
            data['mima'] = mima
        # insert() was removed in pymongo 4 — insert_one() is the replacement.
        dow_message.insert_one(data)
        print(data)
    except Exception as e:
        print(e)
        print(link['課程名稱'])
下面是獲取網頁驗證碼的代碼。
'''
Fetch the captcha shown on the login page and save it locally.
(For now it is only saved to disk; automatic recognition may be added later.)
'''
import requests
from lxml import etree
import os

login_url = 'http://www.diaosiweb.net/member/index.php'
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36',
}

# NOTE: the name keeps the original spelling ("capthca") because the login
# script imports it under exactly this name.
def get_capthca(url):
    """Download the login-page captcha image and write it to ./captcha.jpg.

    Args:
        url: the login page url containing the <img id="vdimgck"> element.
    """
    login_response = requests.get(url, headers=headers)
    # The img src is relative ('../include/...'): drop the '..' and prepend the host.
    image_url = 'http://www.diaosiweb.net' + etree.HTML(login_response.text).xpath('//img[@id="vdimgck"]/@src')[0].replace('..', '')
    image_response = requests.get(image_url).content
    # `with` closes the file on exit — the original's extra f.close() was redundant.
    with open('captcha.jpg', 'wb') as f:
        f.write(image_response)
    print('驗證碼已經保存到:{}'.format(os.getcwd()))
恩,這樣差很少就完成了一個爬蟲項目了,由於是第一次完整的爬取,因此寫的比較亂,也沒有思維圖,也知道有不少地方不完善,可是發懶筋了,不想寫了,先這樣吧!