Pyspider 爬蟲教程

Pyspider爬蟲教程

1、安裝

一、 安裝pip
(1)準備工作

yum install -y make gcc-c++ python-devel libxml2-devel libxslt-devel

(2)安裝setuptools

https://pypi.python.org/pypi/setuptools

python setup.py install

(3)安裝pip

https://pypi.python.org/pypi/pip

python setup.py install

二、 安裝pyspider
(1)安裝pyspider及其依賴

pip install pyspider

OR

pip install --allow-all-external pyspider[all]

三、 安裝可選庫

pip install rsa

四、 phantomjs

下載後複製至/bin/

2、部署pyspider服務器

(1)配置pyspider.conf

02cb76b9-4463-49bd-bef8-2fd7fca1045c

配置eagles引擎和生成結果數據庫,配置用戶名、密碼等

(2)運行install.sh安裝腳本

(3)/etc/init.d/pyspider start 啟動服務即可

3、使用

(1)ip:5000直接打開網頁客戶端

9e971a03-e4ea-4bc2-a3a1-fe8a49a28f0d

(2)點擊建立腳本

(3)編寫腳本,直接調試

4e4cc9e2-c2a0-430e-b0d6-484ea6febeb9

(4)選擇「running」點擊運行

83f30964-15ba-4062-b034-c2994a000f29

4、爬蟲教程(1)—抓取簡單的靜態頁面

靜態頁面的抓取最簡單,獲取HTML頁面進行標籤抽取即可,例子如下:

貴陽晚報新聞文章抓取:http://www.gywb.com.cn/

 

# Handler class and its entry point
class Handler(BaseHandler):
    """Spider handler for this project; on_start seeds the crawl."""

    crawl_config = {
    }

    # Scheduled every 24 * 60 minutes, i.e. once a day.
    @every(minutes=24 * 60)
    def on_start(self):
        """Queue every seed URL; each response is passed to index_page."""
        for url in urlList:
            self.crawl(url, callback=self.index_page)

self.crawl抓取貴陽晚報首頁的url,跳轉到回調函數index_page

 

# Callback index_page
# @config(age=...): a fetched page is treated as fresh for 10 days
# response.url: the URL that was fetched
# response.doc: PyQuery object; call it with a selector to get the matching tags
# Matching links are crawled again and handed to detail_page
@config(age=10 * 24 * 60 * 60)
def index_page(self, response):
    for each in response.doc('a[href^="http"]').items():
        href = each.attr.href
        for url in urlList:
            # Follow a link only when it contains "<seed url>content/"
            # (built by string formatting) and has no "#" in it.
            if ('%s%s' % (url, 'content/') in href) and ('#' not in href):
                self.crawl(href, callback=self.detail_page)

詳細頁面detail_page:獲取文章標題,文章內容,時間等

 

 

# @config(priority=...): scheduling priority of the task
@config(priority=2)
def detail_page(self, response):
    # Article title. artTitleSelector1 is a PyQuery (CSS) selector, e.g.
    # h1[class="g-content-t text-center"].
    artTitle = response.doc(artTitleSelector1).text().strip()

完整代碼如下:

 

#!/usr/bin/env python

# -*- encoding: utf-8 -*-

# Created on 2015-05-12 10:41:03

# Project: GYWB

 

 

from pyspider.libs.base_handler import *

import re

 

# Seed pages the crawl starts from.
urlList = ["http://www.gywb.com.cn/"]

# Keywords looked for in the article content.
# Currently disabled entries: u"貴陽", u"交通", u"交通管理局".
keyWords = [
    u"違章",
    u"交警",
    u"交通管理",
    u"交管局",
]

# Article title: primary selector and fallback.
artTitleSelector1 = 'h1[class="g-content-t text-center"]'
artTitleSelector2 = 'div[class="detail_title_yy"] h1'

# Article content: primary selector and fallback.
artContentSelector1 = 'div[class="g-content-c"] p'
artContentSelector2 = 'div[class="detailcon"] p'

# Publish time: each selector is paired with the regex applied to its text.
# Filter1 matches a leading run of non-digits; Filter2 matches a run of
# digits, "-", ":" and spaces.
artPubTimeSelector1 = '#pubtime_baidu'
artPubTimeFilter1 = r'[^\d]*'
artPubTimeSelector2 = '.detail_more'
artPubTimeFilter2 = r'[\d\-\:\ ]*'

 

class Handler(BaseHandler):
    """Crawl the seed sites and keep the articles that mention a keyword."""

    crawl_config = {
    }

    # Scheduled every 24 * 60 minutes, i.e. once a day.
    @every(minutes=24 * 60)
    def on_start(self):
        """Queue every seed URL; each response is passed to index_page."""
        for url in urlList:
            self.crawl(url, callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Queue the article links found on a fetched page for detail_page."""
        for each in response.doc('a[href^="http"]').items():
            href = each.attr.href
            for url in urlList:
                # Follow a link only when it contains "<seed url>content/"
                # and has no "#" in it.
                if ('%s%s' % (url, 'content/') in href) and ('#' not in href):
                    self.crawl(href, callback=self.detail_page)

    @config(priority=2)
    def detail_page(self, response):
        """Extract one article and return it if it mentions a keyword.

        Returns:
            A dict with "title", "time" and "content", or None when the page
            has no title or its content contains none of keyWords.
        """
        # Every absolute link on the page is fed back to index_page so the
        # crawl keeps discovering new pages.
        for each in response.doc('a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.index_page)

        # Article title: primary selector first, then the fallback.
        artTitle = (response.doc(artTitleSelector1).text().strip()
                    or response.doc(artTitleSelector2).text().strip())
        if artTitle == '':
            return None

        # Article content: primary selector first, then the fallback.
        artContent = (response.doc(artContentSelector1).text().strip()
                      or response.doc(artContentSelector2).text().strip())

        # Publish time.
        artPubTime = response.doc(artPubTimeSelector1).text().strip()
        if artPubTime != '':
            # Drop the leading text matched by artPubTimeFilter1.
            match = re.match(artPubTimeFilter1, artPubTime)
            if match is not None:
                artPubTime = artPubTime[match.end():]
        else:
            artPubTime = response.doc(artPubTimeSelector2).text().strip()
            match = re.match(artPubTimeFilter1, artPubTime)
            if match is not None:
                artPubTime = artPubTime[match.end():]
            # Keep only the part matched by artPubTimeFilter2.
            match = re.search(artPubTimeFilter2, artPubTime)
            if match is not None:
                artPubTime = match.group()
        artPubTime = artPubTime.strip()

        # Check ALL keywords; the page is dropped only when none of them
        # occurs in the content.
        if not any(word in artContent for word in keyWords):
            return None

        return {
            # "url": response.url,
            # "title": response.doc('title').text(),
            "title": artTitle,
            "time": artPubTime,
            "content": artContent,
        }

5、爬蟲教程(2)—HTTP請求的頁面(如登錄後抓取)

例子:愛卡汽車論壇:http://a.xcar.com.cn/bbs/forum-d-303.html

首先登錄就需要用戶名和密碼,但是好的網站都會對用戶名和密碼進行加密。所以我們只能模擬登錄方式:先獲取用戶名和密碼的加密類型,再進行模擬登錄;模擬登錄需要獲取瀏覽器的cookie。

 

class Handler(BaseHandler):
    """Spider handler that crawls with the cookies of a logged-in session."""

    crawl_config = {
    }

    # Scheduled every 24 * 60 minutes, i.e. once a day.
    @every(minutes=24 * 60)
    def on_start(self):
        """Queue every seed URL with the login cookies attached."""
        cookies = getCookies()  # obtain the login cookies

        for url in URL_LIST:
            # Passing the cookies makes the request act as the logged-in user.
            self.crawl(url, cookies=cookies, callback=self.index_page)

那麼怎麼樣才能獲取cookie呢?

(1)獲取post提交的data,可以使用Firefox的httpfox插件或者wireshark來對包進行抓取

下面是採用Firefox的httpfox插件進行抓取

3562953

從上圖可以看出,post_data包含username和password,還有chash和dhash等,當然這些都是進行加密後的。

所以在程序中需要獲取post_data

 

def getPostData(self):
    """Fetch the login page and build the POST target and form body.

    Returns:
        (post_url, post_data): the form's action with its first character
        removed, and the urlencoded form fields. (None, None) when
        self.login_url does not start with "http://".
    """
    url = self.login_url.strip()  # URL of the login page
    if not re.match(r'^http://', url):
        return None, None

    req = urllib2.Request(url)
    resp = urllib2.urlopen(req)
    try:
        login_page = resp.read()
    finally:
        # Release the connection even if reading fails.
        resp.close()

    # Read the form fields out of the login page HTML.
    doc = HTML.fromstring(login_page)
    # [1:] drops the first character of the action; presumably a leading "/"
    # so the value can be appended to post_url_prefix -- TODO confirm.
    post_url = doc.xpath("//form[@name='login' and @id='login']/@action")[0][1:]
    chash = doc.xpath("//input[@name='chash' and @id='chash']/@value")[0]
    dhash = doc.xpath("//input[@name='dhash' and @id='dhash']/@value")[0]
    ehash = doc.xpath("//input[@name='ehash' and @id='ehash']/@value")[0]
    formhash = doc.xpath("//input[@name='formhash']/@value")[0]
    loginsubmit = doc.xpath("//input[@name='loginsubmit']/@value")[0].encode('utf-8')
    cookietime = doc.xpath("//input[@name='cookietime' and @id='cookietime']/@value")[0]

    username = self.account  # account name
    password = self.encoded_password  # already-encoded password

    # Assemble post_data.
    post_data = urllib.urlencode({
        'username': username,
        'password': password,
        'chash': chash,
        'dhash': dhash,
        'ehash': ehash,
        'loginsubmit': loginsubmit,
        'formhash': formhash,
        'cookietime': cookietime,
    })

    return post_url, post_data

 

# Simulate the login by POSTing post_data to the login form
def login(self):
    """POST the login form; return False when no form data is available."""
    post_url, post_data = self.getPostData()
    if post_url is None:
        # getPostData rejected login_url, so there is nothing to post
        # (previously this crashed on "prefix + None").
        return False
    post_url = self.post_url_prefix + post_url

    req = urllib2.Request(url=post_url, data=post_data)
    # Only the request's side effect matters here; the body is not read.
    # NOTE(review): the response is not inspected, so True means "request
    # sent", not "credentials accepted".
    urllib2.urlopen(req).close()
    return True

 

# Cookies are kept in a local cookie file in the Mozilla cookies.txt format.
# The file name contains the MD5 hash of the account name.
COOKIES_FILE = '/tmp/pyspider.xcar.%s.cookies' % hashlib.md5(ACCOUNT).hexdigest()
COOKIES_DOMAIN = 'xcar.com.cn'


def getCookies():
    """Return the site's cookies as a {name: value} dict, logging in if needed.

    Returns None when the cookie file is empty and the login attempt fails.
    """
    cookie_jar = cookielib.MozillaCookieJar(COOKIES_FILE)
    if not os.path.isfile(COOKIES_FILE):
        # Write an empty cookie file first so load() below has a file to read.
        cookie_jar.save()

    cookie_jar.load(COOKIES_FILE)
    processor = urllib2.HTTPCookieProcessor(cookie_jar)
    opener = urllib2.build_opener(processor, urllib2.HTTPHandler)
    opener.addheaders.extend(HTTP_HEADERS.items())
    # Make every later urllib2.urlopen() call use this cookie-aware opener.
    urllib2.install_opener(opener)

    if len(cookie_jar) == 0:
        xc = xcar(ACCOUNT, ENCODED_PASSWORD, LOGIN_URL, POST_URL_PREFIX)
        if not xc.login():
            return None
        # Login succeeded: persist the cookies.
        cookie_jar.save()

    # Keep only the cookies that belong to this site.
    return {
        cookie.name: cookie.value
        for cookie in cookie_jar
        if COOKIES_DOMAIN in cookie.domain
    }

怎樣查看用戶名和密碼的加密類型?——經過查看js文件

查看登錄login.js和表單信息login.php

491aced3-92da-470f-9f02-84f8b0191be3

發現:username是採用base64加密,password是先採用md5加密,然後再進行base64加密

3e667c10-1dfe-439b-bcee-04817ef1bc62

完整代碼如下:

 

#!/usr/bin/env python

# -*- encoding: utf-8 -*-

# Created on 2015-05-14 17:39:36

# Project: test_xcar

 

 

from pyspider.libs.base_handler import *

from pyspider.libs.response import *

from pyquery import PyQuery

 

import os

import re

import urllib

import urllib2

import cookielib

import lxml.html as HTML

import hashlib

 

 

# Seed pages the crawl starts from.
URL_LIST = ['http://a.xcar.com.cn/bbs/forum-d-303.html']

# Thread-list links: href substring filter and the regex they must match.
THREAD_LIST_URL_FILTER = 'bbs/forum-d-303'
THREAD_LIST_URL_REG = r'bbs\/forum-d-303(-\w+)?\.'

# Login credentials.
# NOTE(review): credentials are hard-coded in the script; consider loading
# them from configuration instead.
ACCOUNT = 'ZhangZujian'
# MD5 hash of the password, written as 32 hex characters.
ENCODED_PASSWORD = 'e3d541408adb57f4b40992202c5018d8'

# Login page, and the prefix prepended to the login form's action.
LOGIN_URL = 'http://my.xcar.com.cn/logging.php?action=login'
POST_URL_PREFIX = 'http://my.xcar.com.cn/'

# Thread links: regex, href substring filter, and pager class names.
THREAD_URL_REG = r'bbs\/thread-\w+-0'
THREAD_URL_HREF_FILTER = 'bbs/thread-'
THREAD_URL_CLASS_LIST = ['prev', 'next']

# Selectors used on a thread page.
THREAD_THEME_SELECTOR = 'h2'
POST_ITEM_SELECTOR = '.posts-con > div'
POST_TIME_SELECTOR = '.pt-time > span'
POST_MEMBER_SELECTOR = '.pt-name'
POST_FLOOR_SELECTOR = '.pt-floor > span'
POST_CONTENT_SELECTOR = '.pt-cons'
# THREAD_REPLY_SELECTOR = ''

 

# !!! Notice !!!
# Tasks that share the same account MUST share the same cookies file, which
# is why the file name is derived from the MD5 hash of the account name.
COOKIES_FILE = '/tmp/pyspider.xcar.%s.cookies' % (
    hashlib.md5(ACCOUNT).hexdigest(),
)
# Cookies are kept only when their domain contains this string.
COOKIES_DOMAIN = 'xcar.com.cn'
# USERAGENT_STR = 'Mozilla/5.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12A366 Safari/600.1.4'

# Request headers added to the urllib2 opener used for the login requests.
HTTP_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    # 'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    'Connection': 'keep-alive',
    'DNT': '1',
    'Host': 'my.xcar.com.cn',
    'Referer': 'http://a.xcar.com.cn/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36',
}

 

 

class xcar(object):
    """Log in to the site by replaying its HTML login form."""

    def __init__(self, account, encoded_password, login_url, post_url_prefix):
        """Store the credentials and URLs used by login().

        Args:
            account: value sent as the form's "username" field.
            encoded_password: value sent as the form's "password" field.
            login_url: URL of the page that contains the login form.
            post_url_prefix: prefix prepended to the form's action.
        """
        self.account = account
        self.encoded_password = encoded_password
        self.login_url = login_url
        self.post_url_prefix = post_url_prefix

    def login(self):
        """POST the login form; return False when no form data is available."""
        post_url, post_data = self.getPostData()
        if post_url is None:
            # getPostData rejected login_url, so there is nothing to post
            # (previously this crashed on "prefix + None").
            return False
        post_url = self.post_url_prefix + post_url

        req = urllib2.Request(url=post_url, data=post_data)
        # Only the request's side effect matters here; the body is not read.
        # NOTE(review): the response is not inspected, so True means "request
        # sent", not "credentials accepted".
        urllib2.urlopen(req).close()
        return True

    def getPostData(self):
        """Fetch the login page and build the POST target and form body.

        Returns:
            (post_url, post_data): the form's action with its first character
            removed, and the urlencoded form fields. (None, None) when
            self.login_url does not start with "http://".
        """
        url = self.login_url.strip()
        if not re.match(r'^http://', url):
            return None, None

        req = urllib2.Request(url)
        resp = urllib2.urlopen(req)
        try:
            login_page = resp.read()
        finally:
            # Release the connection even if reading fails.
            resp.close()

        # Read the form fields out of the login page HTML.
        doc = HTML.fromstring(login_page)
        # [1:] drops the first character of the action; presumably a leading
        # "/" so the value can be appended to post_url_prefix -- TODO confirm.
        post_url = doc.xpath("//form[@name='login' and @id='login']/@action")[0][1:]
        chash = doc.xpath("//input[@name='chash' and @id='chash']/@value")[0]
        dhash = doc.xpath("//input[@name='dhash' and @id='dhash']/@value")[0]
        ehash = doc.xpath("//input[@name='ehash' and @id='ehash']/@value")[0]
        formhash = doc.xpath("//input[@name='formhash']/@value")[0]
        loginsubmit = doc.xpath("//input[@name='loginsubmit']/@value")[0].encode('utf-8')
        cookietime = doc.xpath("//input[@name='cookietime' and @id='cookietime']/@value")[0]

        post_data = urllib.urlencode({
            'username': self.account,
            'password': self.encoded_password,
            'chash': chash,
            'dhash': dhash,
            'ehash': ehash,
            'loginsubmit': loginsubmit,
            'formhash': formhash,
            'cookietime': cookietime,
        })

        return post_url, post_data

 

 

def getCookies():
    """Return the site's cookies as a {name: value} dict, logging in if needed.

    Returns None when the cookie file is empty and the login attempt fails.
    """
    cookie_jar = cookielib.MozillaCookieJar(COOKIES_FILE)
    if not os.path.isfile(COOKIES_FILE):
        # Write an empty cookie file first so load() below has a file to read.
        cookie_jar.save()

    cookie_jar.load(COOKIES_FILE)
    processor = urllib2.HTTPCookieProcessor(cookie_jar)
    opener = urllib2.build_opener(processor, urllib2.HTTPHandler)
    opener.addheaders.extend(HTTP_HEADERS.items())
    # Make every later urllib2.urlopen() call use this cookie-aware opener.
    urllib2.install_opener(opener)

    if len(cookie_jar) == 0:
        xc = xcar(ACCOUNT, ENCODED_PASSWORD, LOGIN_URL, POST_URL_PREFIX)
        if not xc.login():
            return None
        # Login succeeded: persist the cookies.
        cookie_jar.save()

    # Keep only the cookies that belong to this site.
    return {
        cookie.name: cookie.value
        for cookie in cookie_jar
        if COOKIES_DOMAIN in cookie.domain
    }

 

 

class Handler(BaseHandler):
    """Crawl the forum with login cookies and emit one result per post."""

    crawl_config = {
    }

    # Scheduled every 24 * 60 minutes, i.e. once a day.
    @every(minutes=24 * 60)
    def on_start(self):
        """Queue every seed URL with the login cookies attached."""
        cookies = getCookies()

        for url in URL_LIST:
            self.crawl(url, cookies=cookies, callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Queue thread links for detail_page and list links for index_page."""
        cookies = getCookies()

        # Links to threads.
        for each in response.doc('a[href*="%s"]' % THREAD_URL_HREF_FILTER).items():
            href = each.attr.href
            if re.search(THREAD_URL_REG, href) and '#' not in href:
                self.crawl(href, cookies=cookies, callback=self.detail_page)

        # Links to further thread-list pages.
        for each in response.doc('a[href*="%s"]' % THREAD_LIST_URL_FILTER).items():
            href = each.attr.href
            if re.search(THREAD_LIST_URL_REG, href) and '#' not in href:
                self.crawl(href, cookies=cookies, callback=self.index_page)

    @config(priority=2)
    def detail_page(self, response):
        """Split a thread page into per-post tasks, or return one post.

        A URL without "#" is re-queued once per post as "<url>#<floor>".
        A URL with "#<floor>" returns the post whose floor text matches.

        Returns:
            A dict with "theme", "floor", "time", "member" and "content" for
            the matching post, otherwise None.
        """
        if '#' not in response.url:
            # Cookies are only needed on this branch, so they are fetched
            # here rather than on every call.
            cookies = getCookies()
            for each in response.doc(POST_ITEM_SELECTOR).items():
                floorNo = each(POST_FLOOR_SELECTOR).text()
                url = '%s#%s' % (response.url, floorNo)
                self.crawl(url, cookies=cookies, callback=self.detail_page)
            return None

        floorNo = response.url[response.url.find('#') + 1:]

        for each in response.doc(POST_ITEM_SELECTOR).items():
            if each(POST_FLOOR_SELECTOR).text() == floorNo:
                theme = response.doc(THREAD_THEME_SELECTOR).text()
                post_time = each(POST_TIME_SELECTOR).text()
                member = each(POST_MEMBER_SELECTOR).text()
                content = each(POST_CONTENT_SELECTOR).text()
                return {
                    # "url": response.url,
                    # "title": response.doc('title').text(),
                    'theme': theme,
                    'floor': floorNo,
                    'time': post_time,
                    'member': member,
                    'content': content,
                }
        # No post on the page carries this floor number.
        return None

6、爬蟲教程(3)使用Phantomjs渲染帶js的頁面(視頻連接抓取)

(1)安裝Phantomjs

(2)其實和爬取普通網頁的不同點在於,傳入fetch_type='js'

 

#!/usr/bin/env python

# -*- encoding: utf-8 -*-

# Created on 2015-03-20 09:46:20

# Project: fly_spider

import re

import time

#from pyspider.database.mysql.mysqldb import SQL

from pyspider.libs.base_handler import *

from pyquery import PyQuery as pq

class Handler(BaseHandler):
    """Crawl zhanqi.tv, fetching the game and room pages with fetch_type='js'."""

    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, sdch",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36"
    }

    crawl_config = {
        "headers": headers,
        "timeout": 100
    }

    # JavaScript passed as js_script: scrolls to the bottom of the page.
    # It must contain nothing but the function, so the Python-style
    # "# JavaScript" annotation that used to sit inside the string is gone.
    # NOTE(review): setTimeout receives the *result* of scrollTo(), so the
    # scroll runs immediately and the 5000 ms delay has no effect -- confirm
    # whether a delayed scroll was intended.
    _SCROLL_JS = """
        function() {
            setTimeout(window.scrollTo(0,document.body.scrollHeight), 5000);
        }
        """

    @every(minutes=1)
    def on_start(self):
        """Queue the games overview page."""
        self.crawl('http://www.zhanqi.tv/games', callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Queue every game page, fetched with fetch_type='js'."""
        print(response)
        for each in response.doc('a[href^="http://www.zhanqi.tv/games/"]').items():
            if re.match(r"http://www.zhanqi.tv/games/\w+", each.attr.href, re.U):
                self.crawl(each.attr.href,
                           fetch_type='js',  # the fetch_type parameter
                           js_script=self._SCROLL_JS,
                           callback=self.list_page)

    @config(age=1*60*60, priority=2)
    def list_page(self, response):
        """Queue every room linked from the hot list of a game page."""
        for each in response.doc('.active > div.live-list-tabc > ul#hotList.clearfix > li > a').items():
            if re.match(r"http://www.zhanqi.tv/\w+", each.attr.href, re.U):
                self.crawl(each.attr.href,
                           fetch_type='js',
                           js_script=self._SCROLL_JS,
                           callback=self.detail_page)

    @config(age=1*60*60, priority=2)
    def detail_page(self, response):
        """Return the room details, including the HTML of the video container.

        "flash-cont" is the inner HTML of the last '.video-flash-cont'
        element, or None when the page has none (previously a NameError).
        """
        flash_html = None
        for each in response.doc('.video-flash-cont').items():
            flash_html = pq(each).html()
            print(flash_html)
        return {
            "url": response.url,
            "author": response.doc('.meat > span').text(),
            "title": response.doc('.title-name').text(),
            "game-name": response.doc('span > .game-name').text(),
            "users2": response.doc('div.live-anchor-info.clearfix > div.sub-anchor-info > div.clearfix > div.meat-info > span.num.dv.js-onlines-panel > span.dv.js-onlines-txt > span').text(),
            "flash-cont": flash_html,
            "picture": response.doc('.active > img').text(),
        }

7、附錄

怎樣判斷獲取pyquery的結果是自己想要的?

通過查看元素的命令行,可以查看結果

5442828

8、參考

官方教程:http://docs.pyspider.org/en/latest/tutorial/

pyspider 爬蟲教程(一):HTML 和 CSS 選擇器:http://segmentfault.com/a/1190000002477863

pyspider 爬蟲教程(二):AJAX 和 HTTP:http://segmentfault.com/a/1190000002477870

pyspider 爬蟲教程(三):使用 PhantomJS 渲染帶 JS 的頁面:http://segmentfault.com/a/1190000002477913

相關文章
相關標籤/搜索