The site has three levels of pages in total:
http://www.daomubiji.com/
The XPath copied directly from the browser:
/html/body/section/article/a/@href
There is an anti-scraping mechanism here: the page structure in the returned response differs from the DOM the browser renders, so the copied XPath matches nothing. Switch to the XPaths below instead (the first extracts the first-level links, the second is the base XPath for the chapter list on the second-level pages):
//ul[@class="sub-menu"]/li/a/@href
//article
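You can verify the mismatch without Scrapy by fetching the page yourself and trying both XPaths against the raw response. A minimal sketch, assuming the requests and parsel packages are installed (parsel is the selector library Scrapy uses internally):

import requests
from parsel import Selector

html = requests.get('http://www.daomubiji.com/').text
sel = Selector(text=html)

# The XPath copied from the browser matches nothing in the raw response...
print(sel.xpath('/html/body/section/article/a/@href').getall())

# ...while the menu-based XPath finds the first-level links.
print(sel.xpath('//ul[@class="sub-menu"]/li/a/@href').getall())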
A launch file so the spider can be run conveniently from PyCharm:
from scrapy import cmdline

cmdline.execute('scrapy crawl daomu --nolog'.split())
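If you would rather start the crawl through Scrapy's Python API instead of wrapping the CLI, CrawlerProcess does the same job. A sketch, assuming the file sits in the project root next to scrapy.cfg so the project settings can be located:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Load settings.py from the surrounding Scrapy project
process = CrawlerProcess(get_project_settings())
process.crawl('daomu')
process.start()  # blocks until the crawl finishes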
The relevant settings (settings.py):
BOT_NAME = 'Daomu'
SPIDER_MODULES = ['Daomu.spiders']
NEWSPIDER_MODULE = 'Daomu.spiders'
ROBOTSTXT_OBEY = False
LOG_LEVEL = 'WARNING'
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
}

ITEM_PIPELINES = {
    'Daomu.pipelines.DaomuPipeline': 300,
}
Declare the expected data fields (items.py):
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class DaomuItem(scrapy.Item):
    # define the fields for your item here like:
    # Volume name
    juan_name = scrapy.Field()
    # Chapter number
    zh_num = scrapy.Field()
    # Chapter name
    zh_name = scrapy.Field()
    # Chapter link
    zh_link = scrapy.Field()
    # Novel content
    zh_content = scrapy.Field()
The spider file:
# -*- coding: utf-8 -*-
import scrapy

from ..items import DaomuItem


class DaomuSpider(scrapy.Spider):
    name = 'daomu'
    allowed_domains = ['www.daomubiji.com']
    start_urls = ['http://www.daomubiji.com/']

    # Parse the first-level page: extract the links to 盜墓筆記 1, 2, 3, ...
    def parse(self, response):
        one_link_list = response.xpath(
            '//ul[@class="sub-menu"]/li/a/@href'
        ).extract()
        # Hand each link to the scheduler's queue
        for one_link in one_link_list:
            yield scrapy.Request(
                url=one_link,
                callback=self.parse_two_link
            )

    # Parse the second-level page
    def parse_two_link(self, response):
        # Base XPath: match the list of all chapter nodes
        article_list = response.xpath('//article')
        # Extract each chapter's info in turn
        for article in article_list:
            # Create an item object
            item = DaomuItem()
            # The link text splits into volume, chapter number, chapter name,
            # e.g. ['秦嶺神樹篇', '第一章', '老癢出獄']
            info = article.xpath('./a/text()').extract_first().split()
            item['juan_name'] = info[0]
            item['zh_num'] = info[1]
            item['zh_name'] = info[2]
            item['zh_link'] = article.xpath('./a/@href').extract_first()
            # Hand the chapter link to the scheduler,
            # passing the item along to the next callback via meta
            yield scrapy.Request(
                url=item['zh_link'],
                meta={'item': item},
                callback=self.parse_three_link
            )

    # Parse the third-level page
    def parse_three_link(self, response):
        item = response.meta['item']
        # Get the novel content: join the chapter's <p> texts into one string
        item['zh_content'] = '\n'.join(response.xpath(
            '//article[@class="article-content"]'
            '//p/text()'
        ).extract())
        yield item
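The item travels to the third-level callback through meta. On Scrapy 1.7 and later, the same hand-off can be written with cb_kwargs, which delivers the item as a plain keyword argument. A sketch of just the two affected callbacks, assuming the rest of the spider stays as above:

import scrapy

from ..items import DaomuItem


class DaomuSpider(scrapy.Spider):
    name = 'daomu'

    def parse_two_link(self, response):
        for article in response.xpath('//article'):
            item = DaomuItem()
            item['zh_link'] = article.xpath('./a/@href').extract_first()
            yield scrapy.Request(
                url=item['zh_link'],
                callback=self.parse_three_link,
                cb_kwargs={'item': item},  # delivered as a keyword argument
            )

    # The item arrives as a named parameter instead of response.meta['item']
    def parse_three_link(self, response, item):
        item['zh_content'] = '\n'.join(response.xpath(
            '//article[@class="article-content"]//p/text()'
        ).extract())
        yield item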
Persistence handling (pipelines.py):
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import os


class DaomuPipeline(object):

    def process_item(self, item, spider):
        # Make sure the output directory exists before writing
        os.makedirs('downloads', exist_ok=True)
        filename = 'downloads/{}-{}-{}.txt'.format(
            item['juan_name'], item['zh_num'], item['zh_name']
        )
        # Set the encoding explicitly so the Chinese text is written correctly
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(item['zh_content'])
        return item
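After a crawl finishes, each chapter should sit in downloads/ as its own .txt file; a quick sanity check:

import os

files = sorted(os.listdir('downloads'))
print(len(files), 'chapters saved')
print(files[:3])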