scrapy startproject scrapyseleniumtest
scrapy genspider taobao www.taobao.com
ROBOTSTXT_OBEY
False
# Taobao's robots.txt disallows crawling the search pages; turn off
# robots.txt compliance for this demo project (settings.py).
ROBOTSTXT_OBEY = False
Item
ProductItem
from scrapy import Item, Field


class ProductItem(Item):
    """Item holding one Taobao product listing (items.py)."""

    # Name of the MongoDB collection the pipeline writes this item into;
    # read by MongoPipeline.process_item via item.collection.
    collection = 'products'

    image = Field()
    price = Field()
    deal = Field()
    title = Field()
    shop = Field()
    location = Field()
collection
start_requests()
from urllib.parse import quote

from scrapy import Request, Spider

from scrapyseleniumtest.items import ProductItem


class TaobaoSpider(Spider):
    """Spider that schedules one request per (keyword, page) pair.

    Page rendering is delegated to SeleniumMiddleware, so the URL only
    carries the search keyword; the target page number travels in
    ``request.meta['page']`` and the middleware navigates to it.
    """

    name = 'taobao'
    allowed_domains = ['www.taobao.com']
    base_url = 'https://s.taobao.com/search?q='

    def start_requests(self):
        # KEYWORDS and MAX_PAGE are configured in settings.py.
        for keyword in self.settings.get('KEYWORDS'):
            for page in range(1, self.settings.get('MAX_PAGE') + 1):
                url = self.base_url + quote(keyword)
                # dont_filter=True: every page of a keyword shares the same
                # URL (the page number lives in meta), so the duplicate
                # filter must not collapse these requests.
                yield Request(url=url, callback=self.parse,
                              meta={'page': page}, dont_filter=True)
base_url
KEYWORDS
MAX_PAGE
# Search keywords and how many result pages to fetch per keyword
# (settings.py; read by TaobaoSpider.start_requests).
KEYWORDS = ['iPad']
MAX_PAGE = 100
start_requests()
meta
dont_filter
process_request()
HtmlResponse
from logging import getLogger

from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


class SeleniumMiddleware(object):
    """Downloader middleware that renders Taobao search pages in PhantomJS.

    For each request it loads the URL in a headless browser, navigates to
    the page number found in ``request.meta['page']``, waits for the
    product list to appear and returns the rendered HTML as an
    HtmlResponse, short-circuiting Scrapy's default downloader.
    """

    def __init__(self, timeout=None, service_args=None):
        """
        :param timeout: page-load / explicit-wait timeout in seconds
        :param service_args: extra command-line args for PhantomJS
        """
        # NOTE: `service_args=None` replaces the original mutable `[]`
        # default, which would be shared across instances.
        self.logger = getLogger(__name__)
        self.timeout = timeout
        self.browser = webdriver.PhantomJS(service_args=service_args or [])
        self.browser.set_window_size(1400, 700)
        self.browser.set_page_load_timeout(self.timeout)
        self.wait = WebDriverWait(self.browser, self.timeout)

    def __del__(self):
        self.browser.close()

    def process_request(self, request, spider):
        """Fetch the page with PhantomJS.

        :param request: Request object; target page number in meta['page']
        :param spider: Spider object (unused)
        :return: HtmlResponse with the rendered page, or status=500 on timeout
        """
        self.logger.debug('PhantomJS is Starting')
        page = request.meta.get('page', 1)
        try:
            self.browser.get(request.url)
            if page > 1:
                # Jump straight to the requested page via the pager's
                # "go to page" box instead of clicking "next" repeatedly.
                page_input = self.wait.until(EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '#mainsrp-pager div.form > input')))
                submit = self.wait.until(EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, '#mainsrp-pager div.form > span.btn.J_Submit')))
                page_input.clear()
                page_input.send_keys(page)
                submit.click()
            # Wait until the pager highlights the requested page number and
            # at least one product item is present in the DOM.
            self.wait.until(EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR, '#mainsrp-pager li.item.active > span'), str(page)))
            self.wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, '.m-itemlist .items .item')))
            return HtmlResponse(url=request.url, body=self.browser.page_source,
                                request=request, encoding='utf-8', status=200)
        except TimeoutException:
            # Hand a 500 back so Scrapy's retry/error handling can react.
            return HtmlResponse(url=request.url, status=500, request=request)

    @classmethod
    def from_crawler(cls, crawler):
        # Alternate constructor used by Scrapy: pull config from settings.
        return cls(timeout=crawler.settings.get('SELENIUM_TIMEOUT'),
                   service_args=crawler.settings.get('PHANTOMJS_SERVICE_ARGS'))
__init__()
PhantomJS
WebDriverWait
process_request()
meta
get()
page_source
HtmlResponse
url
body
HtmlResponse
process_request()
process_request()
process_request()
process_exception()
process_response()
HtmlResponse
process_response()
process_response()
SeleniumMiddleware
# Enable the Selenium rendering middleware (settings.py).
DOWNLOADER_MIDDLEWARES = {
    'scrapyseleniumtest.middlewares.SeleniumMiddleware': 543,
}
def parse(self, response):
    """Yield one ProductItem per product on a rendered search-result page.

    :param response: HtmlResponse produced by SeleniumMiddleware
    """
    products = response.xpath(
        '//div[@id="mainsrp-itemlist"]//div[@class="items"][1]//div[contains(@class, "item")]')
    for product in products:
        item = ProductItem()
        # Text fields are scattered across nested child nodes, so join
        # every descendant text() node and strip surrounding whitespace.
        item['price'] = ''.join(product.xpath('.//div[contains(@class, "price")]//text()').extract()).strip()
        item['title'] = ''.join(product.xpath('.//div[contains(@class, "title")]//text()').extract()).strip()
        item['shop'] = ''.join(product.xpath('.//div[contains(@class, "shop")]//text()').extract()).strip()
        # Images are lazy-loaded: the real URL is in @data-src, not @src.
        item['image'] = ''.join(product.xpath('.//div[@class="pic"]//img[contains(@class, "img")]/@data-src').extract()).strip()
        item['deal'] = product.xpath('.//div[contains(@class, "deal-cnt")]//text()').extract_first()
        item['location'] = product.xpath('.//div[contains(@class, "location")]//text()').extract_first()
        yield item
response
xpath()
ProductItem
import pymongo
class MongoPipeline(object):
    """Item pipeline that stores scraped items in MongoDB.

    Connection parameters come from the MONGO_URI / MONGO_DB settings;
    the target collection name is read from each item's ``collection``
    attribute (see ProductItem.collection).
    """

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # Alternate constructor used by Scrapy: pull config from settings.
        return cls(mongo_uri=crawler.settings.get('MONGO_URI'),
                   mongo_db=crawler.settings.get('MONGO_DB'))

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        # insert_one(): Collection.insert() is deprecated and was removed
        # in PyMongo 4.
        self.db[item.collection].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()
# Enable the MongoDB storage pipeline (settings.py).
ITEM_PIPELINES = {
    'scrapyseleniumtest.pipelines.MongoPipeline': 300,
}
MONGO_URI
MONGO_DB
# MongoDB connection settings used by MongoPipeline (settings.py).
MONGO_URI = 'localhost'
MONGO_DB = 'taobao'
scrapy crawl taobao
本資源首發於崔慶才的個人博客靜覓:Python3網絡爬蟲開發實戰教程 | 靜覓
如想了解更多爬蟲資訊,請關注我的個人微信公眾號:進擊的Coder
weixin.qq.com/r/5zsjOyvEZ…(二維碼自動識別)