Requirement: crawl each complaint post's name, the post's URL, the post's title, and the post's content.
1. Rule-based crawler (CrawlSpider) -- `scrapy genspider -t crawl Question wz.sun0769.com`
**Question.py**
```python
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from Dongguan.items import DongguanItem


class QuestionSpider(CrawlSpider):
    name = 'Question'
    allowed_domains = ['wz.sun0769.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=0']

    rules = (
        # A Rule without a callback defaults to follow=True (deep crawl),
        # so this rule just keeps following the paginated list pages.
        Rule(LinkExtractor(allow=r'type=4'), follow=True),
        # Match the individual post pages.
        Rule(LinkExtractor(allow=r'question/\d+/\d+.shtml'),
             process_links="handle_links", callback='parse_item', follow=True),
    )

    # process_links hook: broken links can be repaired here before being requested
    def handle_links(self, links):
        for link in links:
            print("link====", link)
        return links

    # Detailed information of a post
    def parse_item(self, response):
        item = DongguanItem()
        # Post URL
        url = response.url
        title_number = response.xpath(
            '//div[@class="pagecenter p3"]/div/div/div/strong/text()').extract()
        if len(title_number) > 0:
            title_number = title_number[0]
            # title_number looks like "...\xa0\xa0編號:191166"
            # Post number
            number = title_number.split("\xa0\xa0")[1]
            number = number.split(":")[1]
            # Post title
            title = title_number.split("\xa0\xa0")[0]
            title = title.split(":")[1]
            item["title"] = title
            item["number"] = number
        content = response.xpath(
            '//div[@class="c1 text14_2"]/text()|//div[@class="contentext"]/text()').extract()
        # Join the list of text fragments into a single string
        content = "".join(content).strip()
        item["url"] = url
        item["content"] = content
        yield item
```
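The two LinkExtractor regexes do the routing here: `type=4` keeps the crawler walking the paginated list, while `question/\d+/\d+.shtml` picks out the post pages. A quick way to sanity-check such patterns before a full crawl is to run a standalone LinkExtractor over a response. The sketch below uses a made-up HTML snippet, so the anchor URLs are illustrative only:

```python
# Minimal sketch for sanity-checking the Rule regexes outside a crawl.
# The HTML body is a made-up stand-in for a real list page.
from scrapy.http import HtmlResponse
from scrapy.linkextractors import LinkExtractor

html = b'''
<a href="http://wz.sun0769.com/index.php/question/questionType?type=4&page=30">next page</a>
<a href="http://wz.sun0769.com/html/question/201807/123456.shtml">a post</a>
'''
response = HtmlResponse(
    url="http://wz.sun0769.com/index.php/question/questionType?type=4&page=0",
    body=html, encoding="utf-8")

list_pages = LinkExtractor(allow=r'type=4')                  # first Rule
post_pages = LinkExtractor(allow=r'question/\d+/\d+.shtml')  # second Rule

print([l.url for l in list_pages.extract_links(response)])
print([l.url for l in post_pages.extract_links(response)])
```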
2. Plain Spider version -- `scrapy genspider Question2 wz.sun0769.com`
**Question2.py**
```python
import scrapy
from Dongguan.items import DongguanItem


class Question2Spider(scrapy.Spider):
    name = 'Question2'
    allowed_domains = ['wz.sun0769.com']
    # Page offset
    offset = 0
    url = "http://wz.sun0769.com/index.php/question/questionType?type=4&page="
    start_urls = [url + str(offset)]

    # Parses the detailed content of a post
    def process_item(self, response):
        item = DongguanItem()
        # Post URL
        url = response.url
        title_number = response.xpath(
            '//div[@class="pagecenter p3"]/div/div/div/strong/text()').extract()
        if len(title_number) > 0:
            title_number = title_number[0]
            # title_number looks like "...\xa0\xa0編號:191166"
            # Post number
            number = title_number.split("\xa0\xa0")[1]
            number = number.split(":")[1]
            # Post title
            title = title_number.split("\xa0\xa0")[0]
            title = title.split(":")[1]
            item["title"] = title
            item["number"] = number
        content = response.xpath(
            '//div[@class="c1 text14_2"]/text()|//div[@class="contentext"]/text()').extract()
        # Join the list of text fragments into a single string
        content = "".join(content).strip()
        item["url"] = url
        item["content"] = content
        yield item

    def parse(self, response):
        # Collect the links of all posts on the current list page
        current_page_link = response.xpath('//a[@class="news14"]/@href').extract()
        print(current_page_link)
        for link in current_page_link:
            # Request each post's detail page
            yield scrapy.Request(link, callback=self.process_item)
        # Build the next list page (offsets step by 30 up to the last page)
        if self.offset < 93630:
            self.offset += 30
            # URL of the next page
            new_url = self.url + str(self.offset)
            yield scrapy.Request(new_url, callback=self.parse)
```
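The hard-coded 93630 cap mirrors the last page offset at the time of writing, which is brittle. A hedged alternative is to follow the site's own next-page link instead of counting offsets; this is a drop-in replacement for the `parse` method above, and the `>` anchor XPath is an assumption about the pagination markup, not taken from the original spider:

```python
# Hedged alternative to the fixed 93630 cap: follow the site's own
# "next page" anchor. The XPath for the anchor is an assumption about
# the pagination markup, not taken from the original spider.
def parse(self, response):
    for link in response.xpath('//a[@class="news14"]/@href').extract():
        yield scrapy.Request(link, callback=self.process_item)
    next_page = response.xpath('//a[text()=">"]/@href').extract_first()
    if next_page:
        yield scrapy.Request(next_page, callback=self.parse)
```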
3. Shared by the CrawlSpider (rule-based) and the plain Spider crawlers ---- pipelines.py
```python
import json


class DongguanPipeline(object):
    def open_spider(self, spider):
        # Create the output file, named after the spider
        self.file = open(spider.name + ".json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        # Convert the item to a plain Python dict
        python_dict = dict(item)
        # Serialize to a JSON string, keeping non-ASCII characters readable
        python_str = json.dumps(python_dict, ensure_ascii=False) + "\n"
        self.file.write(python_str)
        return item

    def close_spider(self, spider):
        self.file.close()
```
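Because `process_item` writes one JSON object per line, the output file is JSON Lines rather than a single JSON array. A minimal sketch of reading it back (the filename follows `spider.name`, so `Question.json` here):

```python
import json

# Read the JSON Lines output back into a list of dicts.
items = []
with open("Question.json", encoding="utf-8") as f:
    for line in f:
        items.append(json.loads(line))
print(len(items), "items loaded")
```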
4. Shared by the CrawlSpider (rule-based) and the plain Spider crawlers ---- items.py
```python
import scrapy


class DongguanItem(scrapy.Item):
    # define the fields for your item here like:
    # Title of each post
    title = scrapy.Field()
    # Number of each post
    number = scrapy.Field()
    # Content of each post
    content = scrapy.Field()
    # URL of each post
    url = scrapy.Field()
```
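A `scrapy.Item` behaves like a dict whose keys are restricted to the declared fields, so a typo in a field name fails fast instead of silently writing bad data. A small illustration:

```python
from Dongguan.items import DongguanItem

item = DongguanItem()
item["title"] = "some complaint title"
item["number"] = "191166"
# item["author"] = "x"  # would raise KeyError: 'author' is not a declared Field
print(dict(item))       # {'title': 'some complaint title', 'number': '191166'}
```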
5. Shared by the CrawlSpider (rule-based) and the plain Spider crawlers ---- settings.py
```python
# robots.txt protocol: disabled for this crawl
ROBOTSTXT_OBEY = False

ITEM_PIPELINES = {
    'Dongguan.pipelines.DongguanPipeline': 300,
}

# Logging
LOG_FILE = "dongguan.log"
LOG_LEVEL = "DEBUG"

# User agent
USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
```
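With these settings in place, either spider can be started from the project root with `scrapy crawl Question` or `scrapy crawl Question2`; the crawl log goes to dongguan.log and the scraped items to Question.json or Question2.json respectively.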