yum install -y make gcc-c++ python-devel libxml2-devel libxslt-devel
Download setuptools from https://pypi.python.org/pypi/setuptools, then:
python setup.py install
Download pip from https://pypi.python.org/pypi/pip, then:
python setup.py install
pip install pyspider
OR
pip install --allow-all-external pyspider[all]
pip install rsa
After downloading, copy it to /bin/.
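To confirm the install, a minimal sanity check (assuming pyspider landed in the default site-packages; recent releases expose __version__):
#!/usr/bin/env python
# Sanity check: the packages installed above should import cleanly.
import lxml.etree      # built against libxml2-devel / libxslt-devel
import pyspider
print(pyspider.__version__)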
Deployment:
(1) Configure pyspider.conf
Configure the eagles engine and the result database; set the username, password, and so on.
(2) Run the install.sh installation script
(3) Start the service with /etc/init.d/pyspider start
Using the web UI:
(1) Open the web client directly at ip:5000
(2) Click Create to create a new script
(3) Write the script and debug it directly in the page
(4) Select "running" and click Run
Static pages are the simplest to crawl: fetch the HTML page and extract the tags you need. An example follows.
Crawling Guiyang Evening News articles: http://www.gywb.com.cn/
# Handler class and entry point
class Handler(BaseHandler):
    crawl_config = {
    }

    @every(minutes=24 * 60)
    def on_start(self):
        for url in urlList:
            self.crawl(url, callback=self.index_page)
self.crawl fetches the Guiyang Evening News front page and hands the response to the callback index_page.
# Callback index_page
# config age: pages are considered fresh for 10 days and not re-crawled within that window
# response.url: the URL that was crawled
# response.doc(...): extracts content via PyQuery; the argument is a PyQuery selector
# matching URLs are crawled again, this time into detail_page
@config(age=10 * 24 * 60 * 60)
def index_page(self, response):
    for each in response.doc('a[href^="http"]').items():
        for url in urlList:
            # string concatenation; skip anchor links that contain '#'
            if ('%s%s' % (url, 'content/') in each.attr.href) and \
               ('#' not in each.attr.href):
                self.crawl(each.attr.href, callback=self.detail_page)
The detail-page handler detail_page extracts the article title, body, publish time, and so on.
# config priority: scheduling priority (higher runs first)
@config(priority=2)
def detail_page(self, response):
    # article title; artTitleSelector1 is a PyQuery selector such as h1[class="g-content-t text-center"]
    artTitle = response.doc(artTitleSelector1).text().strip()
The complete code:
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2015-05-12 10:41:03
# Project: GYWB
from pyspider.libs.base_handler import *
import re
urlList = [ "http://www.gywb.com.cn/" ]
keyWords = [
    # u"貴陽",
    # u"交通",
    u"違章",
    u"交警",
    u"交通管理",
    # u"交通管理局",
    u"交管局",
]
# article title
artTitleSelector1 = 'h1[class="g-content-t text-center"]'
artTitleSelector2 = 'div[class="detail_title_yy"] h1'
# article content
artContentSelector1 = 'div[class="g-content-c"] p'
artContentSelector2 = 'div[class="detailcon"] p'
# publish time
artPubTimeSelector1 = '#pubtime_baidu'
artPubTimeFilter1 = r'[^\d]*'
artPubTimeSelector2 = '.detail_more'
artPubTimeFilter2 = r'[\d\-\:\ ]*'
class Handler(BaseHandler):
    crawl_config = {
    }

    @every(minutes=24 * 60)
    def on_start(self):
        for url in urlList:
            self.crawl(url, callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        for each in response.doc('a[href^="http"]').items():
            for url in urlList:
                if ('%s%s' % (url, 'content/') in each.attr.href) and \
                   ('#' not in each.attr.href):
                    self.crawl(each.attr.href, callback=self.detail_page)

    @config(priority=2)
    def detail_page(self, response):
        # follow links on the article page back into index_page
        for each in response.doc('a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.index_page)
        # article title: try the first selector, fall back to the second
        artTitle = response.doc(artTitleSelector1).text().strip()
        if artTitle == '':
            artTitle = response.doc(artTitleSelector2).text().strip()
        if artTitle == '':
            return None
        # article content, with the same fallback pattern
        artContent = response.doc(artContentSelector1).text().strip()
        if artContent == '':
            artContent = response.doc(artContentSelector2).text().strip()
        # publish time: strip the non-digit prefix, then keep the date/time run
        artPubTime = response.doc(artPubTimeSelector1).text().strip()
        if artPubTime != '':
            match = re.match(artPubTimeFilter1, artPubTime)
            if match is not None:
                artPubTime = artPubTime[len(match.group()):]
        else:
            artPubTime = response.doc(artPubTimeSelector2).text().strip()
            match = re.match(artPubTimeFilter1, artPubTime)
            if match is not None:
                artPubTime = artPubTime[len(match.group()):]
        match = re.search(artPubTimeFilter2, artPubTime)
        if match is not None:
            artPubTime = match.group()
        artPubTime = artPubTime.strip()
        # keep only articles that mention at least one keyword
        for word in keyWords:
            if word in artContent:
                return {
                    # "url": response.url,
                    # "title": response.doc('title').text(),
                    "title": artTitle,
                    "time": artPubTime,
                    "content": artContent,
                }
        return None
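The two time filters are the least obvious part of the code above, so here is a minimal sketch of what they do, using a hypothetical input shaped like the page's publish-time text:
# -*- encoding: utf-8 -*-
# How the filters clean a publish-time string (sample input is hypothetical).
import re

artPubTimeFilter1 = r'[^\d]*'        # same filters as in the listing above
artPubTimeFilter2 = r'[\d\-\:\ ]*'
raw = u'發佈時間:2015-05-12 10:41:03'
m = re.match(artPubTimeFilter1, raw)     # matches the non-digit prefix
if m is not None:
    raw = raw[len(m.group()):]           # -> u'2015-05-12 10:41:03'
m = re.search(artPubTimeFilter2, raw)    # keeps digits, '-', ':' and spaces
if m is not None:
    print(m.group().strip())             # 2015-05-12 10:41:03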
Example: the xcar auto forum: http://a.xcar.com.cn/bbs/forum-d-303.html
Logging in requires a username and password, and any well-built site encrypts them before submission. So we have to simulate the login: work out how the username and password are encrypted, log in the same way the browser does, and capture the resulting cookies.
class Handler(BaseHandler):
    crawl_config = {
    }

    @every(minutes=24 * 60)
    def on_start(self):
        cookies = getCookies()    # obtain the cookies
        for url in URL_LIST:
            self.crawl(url, cookies=cookies, callback=self.index_page)    # pass the cookies to simulate a logged-in session
So how do we obtain the cookies?
(1) Capture the data submitted by the POST. You can use Firefox's HttpFox add-on or Wireshark to capture the packets.
Below is a capture taken with Firefox's HttpFox add-on.
As the capture shows, post_data contains username and password, along with fields such as chash and dhash, all of them already encrypted.
So the program has to build post_data itself:
def getPostData(self):
    url = self.login_url.strip()    # the login URL
    if not re.match(r'^http://', url):
        return None, None
    req = urllib2.Request(url)
    resp = urllib2.urlopen(req)
    login_page = resp.read()
    # pull the hidden form fields out of the login page
    doc = HTML.fromstring(login_page)
    post_url = doc.xpath("//form[@name='login' and @id='login']/@action")[0][1:]
    chash = doc.xpath("//input[@name='chash' and @id='chash']/@value")[0]
    dhash = doc.xpath("//input[@name='dhash' and @id='dhash']/@value")[0]
    ehash = doc.xpath("//input[@name='ehash' and @id='ehash']/@value")[0]
    formhash = doc.xpath("//input[@name='formhash']/@value")[0]
    loginsubmit = doc.xpath("//input[@name='loginsubmit']/@value")[0].encode('utf-8')
    cookietime = doc.xpath("//input[@name='cookietime' and @id='cookietime']/@value")[0]
    username = self.account             # account name
    password = self.encoded_password    # encoded password
    # assemble post_data
    post_data = urllib.urlencode({
        'username'    : username,
        'password'    : password,
        'chash'       : chash,
        'dhash'       : dhash,
        'ehash'       : ehash,
        'loginsubmit' : loginsubmit,
        'formhash'    : formhash,
        'cookietime'  : cookietime,
    })
    return post_url, post_data
# simulate the login with post_data as the request body
def login(self):
    post_url, post_data = self.getPostData()
    post_url = self.post_url_prefix + post_url
    req = urllib2.Request(url=post_url, data=post_data)
    resp = urllib2.urlopen(req)
    return True
# obtain cookies via a local cookie file
# the account name is hashed with MD5 to build the file name
COOKIES_FILE = '/tmp/pyspider.xcar.%s.cookies' % hashlib.md5(ACCOUNT).hexdigest()
COOKIES_DOMAIN = 'xcar.com.cn'

def getCookies():
    CookiesJar = cookielib.MozillaCookieJar(COOKIES_FILE)
    if not os.path.isfile(COOKIES_FILE):
        CookiesJar.save()
    CookiesJar.load(COOKIES_FILE)
    CookieProcessor = urllib2.HTTPCookieProcessor(CookiesJar)
    CookieOpener = urllib2.build_opener(CookieProcessor, urllib2.HTTPHandler)
    for item in HTTP_HEADERS:
        CookieOpener.addheaders.append((item, HTTP_HEADERS[item]))
    urllib2.install_opener(CookieOpener)
    if len(CookiesJar) == 0:
        xc = xcar(ACCOUNT, ENCODED_PASSWORD, LOGIN_URL, POST_URL_PREFIX)
        if xc.login():    # on successful login, save the cookies
            CookiesJar.save()
        else:
            return None
    CookiesDict = {}
    # keep only the cookies belonging to this site's domain
    for cookie in CookiesJar:
        if COOKIES_DOMAIN in cookie.domain:
            CookiesDict[cookie.name] = cookie.value
    return CookiesDict
How do we find out what encryption the username and password use? By reading the site's JS files.
Inspect the login script login.js and the form handler login.php.
Finding: the username is Base64-encoded; the password is hashed with MD5 first and then Base64-encoded.
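A minimal sketch of that scheme; encode_credentials is a hypothetical helper used only to illustrate the encoding described above, not part of the crawler:
# -*- encoding: utf-8 -*-
# Hypothetical helper illustrating the encoding scheme (Python 2).
import base64
import hashlib

def encode_credentials(username, password):
    user_enc = base64.b64encode(username)                           # username: Base64 only
    pass_enc = base64.b64encode(hashlib.md5(password).hexdigest())  # password: MD5 hex digest, then Base64
    return user_enc, pass_enc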
The complete code:
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2015-05-14 17:39:36
# Project: test_xcar
from pyspider.libs.base_handler import *
from pyspider.libs.response import *
from pyquery import PyQuery
import os
import re
import urllib
import urllib2
import cookielib
import lxml.html as HTML
import hashlib
URL_LIST = [ 'http://a.xcar.com.cn/bbs/forum-d-303.html' ]
THREAD_LIST_URL_FILTER = 'bbs/forum-d-303'
THREAD_LIST_URL_REG = r'bbs\/forum-d-303(-\w+)?\.'
ACCOUNT = 'ZhangZujian'
# 32-character MD5 hex digest of the password
ENCODED_PASSWORD = 'e3d541408adb57f4b40992202c5018d8'
LOGIN_URL = 'http://my.xcar.com.cn/logging.php?action=login'
POST_URL_PREFIX = 'http://my.xcar.com.cn/'
THREAD_URL_REG = r'bbs\/thread-\w+-0'
THREAD_URL_HREF_FILTER = 'bbs/thread-'
THREAD_URL_CLASS_LIST = [ 'prev', 'next' ]
THREAD_THEME_SELECTOR = 'h2'
POST_ITEM_SELECTOR = '.posts-con > div'
POST_TIME_SELECTOR = '.pt-time > span'
POST_MEMBER_SELECTOR = '.pt-name'
POST_FLOOR_SELECTOR = '.pt-floor > span'
POST_CONTENT_SELECTOR = '.pt-cons'
# THREAD_REPLY_SELECTOR = ''
# !!! Notice !!!
# Tasks that share the same account MUST share the same cookies file
COOKIES_FILE = '/tmp/pyspider.xcar.%s.cookies' % hashlib.md5(ACCOUNT).hexdigest()
COOKIES_DOMAIN = 'xcar.com.cn'
# USERAGENT_STR = 'Mozilla/5.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12A366 Safari/600.1.4'
HTTP_HEADERS = {
    'Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    # 'Accept-Encoding' : 'gzip, deflate, sdch',
    'Accept-Language' : 'zh-CN,zh;q=0.8,en;q=0.6',
    'Connection' : 'keep-alive',
    'DNT' : '1',
    'Host' : 'my.xcar.com.cn',
    'Referer' : 'http://a.xcar.com.cn/',
    'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36',
}
class xcar(object):
    def __init__(self, account, encoded_password, login_url, post_url_prefix):
        self.account = account
        self.encoded_password = encoded_password
        self.login_url = login_url
        self.post_url_prefix = post_url_prefix

    def login(self):
        post_url, post_data = self.getPostData()
        post_url = self.post_url_prefix + post_url
        req = urllib2.Request(url=post_url, data=post_data)
        resp = urllib2.urlopen(req)
        return True

    def getPostData(self):
        url = self.login_url.strip()
        if not re.match(r'^http://', url):
            return None, None
        req = urllib2.Request(url)
        resp = urllib2.urlopen(req)
        login_page = resp.read()
        doc = HTML.fromstring(login_page)
        post_url = doc.xpath("//form[@name='login' and @id='login']/@action")[0][1:]
        chash = doc.xpath("//input[@name='chash' and @id='chash']/@value")[0]
        dhash = doc.xpath("//input[@name='dhash' and @id='dhash']/@value")[0]
        ehash = doc.xpath("//input[@name='ehash' and @id='ehash']/@value")[0]
        formhash = doc.xpath("//input[@name='formhash']/@value")[0]
        loginsubmit = doc.xpath("//input[@name='loginsubmit']/@value")[0].encode('utf-8')
        cookietime = doc.xpath("//input[@name='cookietime' and @id='cookietime']/@value")[0]
        username = self.account
        password = self.encoded_password
        post_data = urllib.urlencode({
            'username'    : username,
            'password'    : password,
            'chash'       : chash,
            'dhash'       : dhash,
            'ehash'       : ehash,
            'loginsubmit' : loginsubmit,
            'formhash'    : formhash,
            'cookietime'  : cookietime,
        })
        return post_url, post_data
def getCookies():
    CookiesJar = cookielib.MozillaCookieJar(COOKIES_FILE)
    if not os.path.isfile(COOKIES_FILE):
        CookiesJar.save()
    CookiesJar.load(COOKIES_FILE)
    CookieProcessor = urllib2.HTTPCookieProcessor(CookiesJar)
    CookieOpener = urllib2.build_opener(CookieProcessor, urllib2.HTTPHandler)
    for item in HTTP_HEADERS:
        CookieOpener.addheaders.append((item, HTTP_HEADERS[item]))
    urllib2.install_opener(CookieOpener)
    if len(CookiesJar) == 0:
        xc = xcar(ACCOUNT, ENCODED_PASSWORD, LOGIN_URL, POST_URL_PREFIX)
        if xc.login():
            CookiesJar.save()
        else:
            return None
    CookiesDict = {}
    for cookie in CookiesJar:
        if COOKIES_DOMAIN in cookie.domain:
            CookiesDict[cookie.name] = cookie.value
    return CookiesDict
class Handler(BaseHandler):
    crawl_config = {
    }

    @every(minutes=24 * 60)
    def on_start(self):
        cookies = getCookies()
        for url in URL_LIST:
            self.crawl(url, cookies=cookies, callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        cookies = getCookies()
        # thread links go to detail_page
        for each in response.doc('a[href*="%s"]' % THREAD_URL_HREF_FILTER).items():
            if re.search(THREAD_URL_REG, each.attr.href) and \
               '#' not in each.attr.href:
                self.crawl(each.attr.href, cookies=cookies, callback=self.detail_page)
        # pagination links go back to index_page
        for each in response.doc('a[href*="%s"]' % THREAD_LIST_URL_FILTER).items():
            if re.search(THREAD_LIST_URL_REG, each.attr.href) and \
               '#' not in each.attr.href:
                self.crawl(each.attr.href, cookies=cookies, callback=self.index_page)

    @config(priority=2)
    def detail_page(self, response):
        cookies = getCookies()
        if '#' not in response.url:
            # first pass: re-crawl the thread once per floor, tagging the URL with #floorNo
            for each in response.doc(POST_ITEM_SELECTOR).items():
                floorNo = each(POST_FLOOR_SELECTOR).text()
                url = '%s#%s' % (response.url, floorNo)
                self.crawl(url, cookies=cookies, callback=self.detail_page)
            return None
        else:
            # second pass: extract the single floor named in the URL fragment
            floorNo = response.url[response.url.find('#')+1:]
            for each in response.doc(POST_ITEM_SELECTOR).items():
                if each(POST_FLOOR_SELECTOR).text() == floorNo:
                    theme = response.doc(THREAD_THEME_SELECTOR).text()
                    time = each(POST_TIME_SELECTOR).text()
                    member = each(POST_MEMBER_SELECTOR).text()
                    content = each(POST_CONTENT_SELECTOR).text()
                    return {
                        # "url" : response.url,
                        # "title" : response.doc('title').text(),
                        'theme'   : theme,
                        'floor'   : floorNo,
                        'time'    : time,
                        'member'  : member,
                        'content' : content,
                    }
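Since getCookies() is plain urllib2/cookielib code, the login can be smoke-tested outside pyspider; a quick sketch, assuming ACCOUNT and ENCODED_PASSWORD hold valid credentials:
# Standalone login check, outside pyspider (Python 2).
cookies = getCookies()
if cookies is None:
    print('login failed; check ACCOUNT / ENCODED_PASSWORD')
else:
    print('got %d cookies for %s' % (len(cookies), COOKIES_DOMAIN))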
For pages rendered with JavaScript:
(1) Install PhantomJS
(2) The only real difference from crawling a static page is passing fetch_type='js':
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2015-03-20 09:46:20
# Project: fly_spider

import re
import time
#from pyspider.database.mysql.mysqldb import SQL
from pyspider.libs.base_handler import *
from pyquery import PyQuery as pq

class Handler(BaseHandler):
    headers = {
        "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding":"gzip, deflate, sdch",
        "Accept-Language":"zh-CN,zh;q=0.8",
        "Cache-Control":"max-age=0",
        "Connection":"keep-alive",
        "User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36"
    }
    crawl_config = {
        "headers" : headers,
        "timeout" : 100
    }

    @every(minutes=1)
    def on_start(self):
        self.crawl('http://www.zhanqi.tv/games', callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        print(response)
        for each in response.doc('a[href^="http://www.zhanqi.tv/games/"]').items():
            if re.match(r"http://www.zhanqi.tv/games/\w+", each.attr.href, re.U):
                self.crawl(each.attr.href,
                           fetch_type='js',    # render the page with PhantomJS
                           js_script="""
                           function() {
                               // scroll to the bottom after 5s so lazy content loads
                               setTimeout(function() {
                                   window.scrollTo(0, document.body.scrollHeight);
                               }, 5000);
                           }
                           """, callback=self.list_page)

    @config(age=1*60*60, priority=2)
    def list_page(self, response):
        for each in response.doc('.active > div.live-list-tabc > ul#hotList.clearfix > li > a').items():
            if re.match(r"http://www.zhanqi.tv/\w+", each.attr.href, re.U):
                self.crawl(each.attr.href,
                           fetch_type='js',
                           js_script="""
                           function() {
                               setTimeout(function() {
                                   window.scrollTo(0, document.body.scrollHeight);
                               }, 5000);
                           }
                           """, callback=self.detail_page)

    @config(age=1*60*60, priority=2)
    def detail_page(self, response):
        for each in response.doc('.video-flash-cont').items():
            d = pq(each)
            print(d.html())
        return {
            "url": response.url,
            "author": response.doc('.meat > span').text(),
            "title": response.doc('.title-name').text(),
            "game-name": response.doc('span > .game-name').text(),
            "users2": response.doc('div.live-anchor-info.clearfix > div.sub-anchor-info > div.clearfix > div.meat-info > span.num.dv.js-onlines-panel > span.dv.js-onlines-txt > span').text(),
            "flash-cont": d.html(),
            "picture": response.doc('.active > img').text(),
        }
How do you verify that a PyQuery selector returns what you want?
Inspect the element in the browser's developer console and compare it with the selector's result; you can also test the selector offline, as in the sketch below.
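A quick offline check with PyQuery itself; a sketch, assuming the page's markup has not changed since this was written:
# Offline selector check with PyQuery (Python 2).
from pyquery import PyQuery as pq

doc = pq(url='http://www.zhanqi.tv/games')   # fetch the index page
for a in doc('a[href^="http://www.zhanqi.tv/games/"]').items():
    print(a.attr.href)   # these are the links index_page would crawl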
Official tutorial: http://docs.pyspider.org/en/latest/tutorial/
pyspider crawler tutorial (1): HTML and CSS selectors: http://segmentfault.com/a/1190000002477863
pyspider crawler tutorial (2): AJAX and HTTP: http://segmentfault.com/a/1190000002477870
pyspider crawler tutorial (3): rendering JS pages with PhantomJS: http://segmentfault.com/a/1190000002477913