Scrapy框架——CrawlSpider爬取某招聘信息網站

時間 2020-05-09

標籤 scrapy 框架 crawlspider 招聘信息網站欄目 Python 简体版

原文鏈接

CrawlSpider

Scrapy框架中分兩類爬蟲，Spider類和CrawlSpider類。

CrawlSpider是Spider的派生類，Spider類的設計原則是只爬取start_urls列表中的網頁，

而CrawlSpider類定義了一些規則(rule)來提供跟進link的方便的機制，更適合從爬取的網頁中獲取link並繼續爬取的工作。

建立項目指令：

scrapy startproject tenCent

CrawlSpider建立：

scrapy genspider -t crawl crawl_tencent "hr.tencent.com"

CrawlSpider繼承於Spider類，除了繼承過來的屬性外（name、allowed_domains），還提供了新的屬性和方法:

LinkExtractor

from scrapy.linkextractors import LinkExtractor

LinkExtractor(allow=r'start=\d+')

通過實例化LinkExtractor提取鏈接

主要參數

allow：滿足括號中「正則表達式」的URL會被提取；如果爲空，則全部匹配。 deny：與這個正則表達式(或正則表達式列表)匹配的URL一定不提取。

rules

Rule(LinkExtractor(allow=r'start=\d+'), callback='parse_tencent', follow=True),

在rules中包含一個或多個Rule對象，每個Rule對爬取網站的動作定義了特定操作。

如果多個rule匹配了相同的鏈接，則根據規則在本集合中被定義的順序，第一個會被使用。

主要參數

link_extractor：是一個Link Extractor對象，用於定義需要提取的鏈接。 callback：每當從link_extractor中獲取到鏈接時，該參數所指定的值會作爲回調函數，該回調函數接受一個response作爲其第一個參數。 注意：當編寫爬蟲規則時，避免使用parse作爲回調函數。由於CrawlSpider使用parse方法來實現其邏輯，如果覆蓋了parse方法，crawl spider將會運行失敗。 follow：是一個布爾(boolean)值，指定了根據該規則從response提取的鏈接是否需要跟進。 如果callback爲None，follow默認設置爲True，否則默認爲False。

使用CrawlSpider爬取信息

1.編寫item文件

# -*- coding: utf-8 -*-

# Define here the models for your scraped items # # See documentation in: # https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class TencentItem(scrapy.Item):
    """Container for one scraped job posting.

    Declares the listing fields (name, link, type, head-count, location,
    publish date) together with the duty/requirement fields.
    """

    # Job title
    position_name = scrapy.Field()
    # Link to the position's detail page
    position_link = scrapy.Field()
    # Job category
    position_type = scrapy.Field()
    # Number of openings
    position_number = scrapy.Field()
    # Work location
    work_location = scrapy.Field()
    # Publication date
    publish_times = scrapy.Field()
    # Job responsibilities
    position_duty = scrapy.Field()
    # Job requirements
    position_require = scrapy.Field()


class DetailItem(scrapy.Item):
    """Container for the data scraped from a position's detail page."""

    # Job responsibilities
    position_duty = scrapy.Field()
    # Job requirements
    position_require = scrapy.Field()

2.編寫crawlspider文件

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from tenCent.items import DetailItem
from tenCent.items import TencentItem


class CrawlTencentSpider(CrawlSpider):
    """Crawl the paginated job listing and every linked detail page.

    Listing pages are handled by ``parse_tencent`` (one ``TencentItem`` per
    table row); detail pages are handled by ``parse_detail`` (one
    ``DetailItem`` per page).
    """

    name = 'crawl_tencent'
    allowed_domains = ['hr.tencent.com']
    start_urls = ['https://hr.tencent.com/position.php']

    # Rule / LinkExtractor arguments used below:
    #   allow:    regular expression a link must match to be extracted
    #   callback: name of the method that handles the fetched response
    #   follow:   whether links found in that response are followed in turn
    rules = (
        # Pagination links: parse each listing page and keep following.
        Rule(
            LinkExtractor(allow=r'start=\d+'),
            callback='parse_tencent',
            follow=True,
        ),
        # Detail-page links found on the listing pages; not followed further.
        Rule(
            LinkExtractor(allow=r'position_detail\.php\?id=\d+'),
            callback='parse_detail',
            follow=False,
        ),
    )

    def parse_tencent(self, response):
        """Yield one ``TencentItem`` for every row of the listing table.

        :param response: response of a listing page.
        :return: generator of ``TencentItem`` objects.
        """
        print('start……')
        # Every <tr> whose class attribute is exactly "even" or "odd".
        node_list = response.xpath('//tr[@class="even"] | //tr[@class="odd"]')
        for node in node_list:
            item = TencentItem()
            # Text of the <a> inside the first <td>
            item['position_name'] = node.xpath('./td[1]/a/text()').extract_first()
            # href of the <a> inside the first <td>
            item['position_link'] = node.xpath('./td[1]/a/@href').extract_first()
            # Text of the 2nd..5th <td>
            item['position_type'] = node.xpath('./td[2]/text()').extract_first()
            item['position_number'] = node.xpath('./td[3]/text()').extract_first()
            item['work_location'] = node.xpath('./td[4]/text()').extract_first()
            item['publish_times'] = node.xpath('./td[5]/text()').extract_first()

            yield item

    def parse_detail(self, response):
        """Yield a ``DetailItem`` built from the two ``ul.squareli`` lists.

        The first list is stored as the duties, the second as the
        requirements; the ``<li>`` texts of each list are concatenated into
        a single string.

        :param response: response of a position detail page.
        :return: generator yielding at most one ``DetailItem``.
        """
        # Run the query once instead of once per field.
        sections = response.xpath('//ul[@class="squareli"]')
        if len(sections) < 2:
            # Indexing [0]/[1] on such a page would raise IndexError.
            self.logger.warning(
                'Expected two ul.squareli lists on %s, found %d; skipping',
                response.url, len(sections),
            )
            return

        item = DetailItem()
        item['position_duty'] = ''.join(
            sections[0].xpath('./li/text()').extract())
        item['position_require'] = ''.join(
            sections[1].xpath('./li/text()').extract())

        yield item

3.創建pipeline文件

# -*- coding: utf-8 -*-

# Define your item pipelines here # # Don't forget to add your pipeline to the ITEM_PIPELINES setting # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import json

from .items import DetailItem
from .items import TencentItem


class _JsonArrayPipeline(object):
    """Write every item of one item class into a JSON file as an array.

    Subclasses set ``item_class``, ``file_name``, ``json_header`` and
    ``banner``. Items of any other class are passed through untouched.
    """

    item_class = None   # item type this pipeline stores
    file_name = None    # output file path
    json_header = None  # text written before the first item
    json_tail = '] }'   # text written after the last item
    banner = None       # separator line printed before each stored item

    def open_spider(self, spider):
        """Open the output file and write the JSON header.

        Optional Scrapy hook, called when the spider is opened.

        :param spider: the spider that was opened.
        :return: None
        """
        self.file = open(self.file_name, 'w', encoding='utf-8')
        self.count = 0
        self.file.write(self.json_header)

    def close_spider(self, spider):
        """Write the JSON tail and close the output file.

        Optional Scrapy hook, called when the spider is closed.

        :param spider: the spider that was closed.
        :return: None
        """
        # No trailing comma to strip: process_item writes the separator
        # before each item, so the file is valid JSON even with zero items.
        try:
            self.file.write(self.json_tail)
        finally:
            self.file.close()

    def process_item(self, item, spider):
        """Store ``item`` if it is an ``item_class`` instance, then pass it on.

        The item is always returned, also when it is not stored, so that the
        next pipeline in ITEM_PIPELINES still receives it.

        :param item: the scraped item.
        :param spider: the spider that scraped the item.
        :return: the same ``item``.
        """
        if isinstance(item, self.item_class):
            print(self.banner)
            content = json.dumps(dict(item), ensure_ascii=False, indent=2)
            if self.count:
                # Comma goes before every item except the first.
                self.file.write(',')
            self.count += 1
            print('content', self.count)
            self.file.write(content)
        return item


class TencentPipeline(_JsonArrayPipeline):
    """Store ``TencentItem`` objects in ``tencent.json``."""

    item_class = TencentItem
    file_name = 'tencent.json'
    json_header = '{ "tencent_info":['
    banner = '--' * 20


class DetailPipeline(_JsonArrayPipeline):
    """Store ``DetailItem`` objects in ``detail.json``."""

    item_class = DetailItem
    file_name = 'detail.json'
    json_header = '{ "detail_info":['
    banner = '**' * 30

4.設置settings

# 1. Project name; the default USER_AGENT is built from it and it is also
#    used as the logger name.
BOT_NAME = 'tenCent'

# 2. Where the project's spiders live / where new spiders are created.
SPIDER_MODULES = ['tenCent.spiders']
NEWSPIDER_MODULE = 'tenCent.spiders'

# Browser-like User-Agent header value. This setting holds only the value:
# the header name and surrounding quotes must not be part of the string.
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
)

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Logging
LOG_FILE = 'tencent.log'
LOG_LEVEL = 'DEBUG'
LOG_ENCODING = 'utf-8'
# NOTE(review): %H is the 24-hour clock, so the trailing %p (AM/PM) is
# redundant; kept as-is to preserve the existing log format.
LOG_DATEFORMAT = '%m/%d/%Y %H:%M:%S %p'

# Item pipelines; the number is the priority, lower values run first.
ITEM_PIPELINES = {
    'tenCent.pipelines.TencentPipeline': 300,
    'tenCent.pipelines.DetailPipeline': 400,
}