Go to the target directory and create a Scrapy project.
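The command itself is missing from the original; assuming the project is named TenXun, to match the TenXunItem and TenXunPipeline classes shown below, it would be:

scrapy startproject TenXun
cd TenXun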
Go into the spiders directory and generate the spider file, giving it a name.
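This command is also missing; judging from name = 'tenxun' and the commented-out allowed_domains = ['tenxun.com'] in the spider below, it was presumably:

scrapy genspider tenxun tenxun.com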
Run and debug the spider.
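Scrapy runs a spider by its name attribute, so here:

scrapy crawl tenxun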
The spider code:
# -*- coding: utf-8 -*-
import scrapy
from ..items import TenXunItem


class TenxunSpider(scrapy.Spider):
    name = 'tenxun'
    # allowed_domains = ['tenxun.com']  # domain scope
    start_urls = ['https://hr.tencent.com/position.php?lid=&tid=87&keywords']
    burl = 'https://hr.tencent.com/'

    def parse(self, response):
        tr_list = response.xpath('//table[@class="tablelist"]/tr')
        # Skip the header row and the trailing pagination row
        for tr in tr_list[1:-1]:
            item = TenXunItem()
            item['position_name'] = tr.xpath('./td[1]/a/text()').extract()[0]
            item['position_link'] = self.burl + tr.xpath('./td[1]/a/@href').extract()[0]
            item['position_type'] = tr.xpath('./td[2]/text()').extract()[0]
            item['position_num'] = tr.xpath('./td[3]/text()').extract()[0]
            item['position_addr'] = tr.xpath('./td[4]/text()').extract()[0]
            item['position_time'] = tr.xpath('./td[5]/text()').extract()[0]
            # To get the job description we must issue a new request;
            # the partially filled item is passed to the callback via meta
            yield scrapy.Request(url=item['position_link'], callback=self.detail_tent,
                                 meta={'items': item})
        # Match the "next page" link; guard against the last page,
        # where the link is absent and extract()[0] would raise IndexError
        next_href = response.xpath('//div[@class="pagenav"]/a[11]/@href').extract()
        if next_href:
            yield scrapy.Request(url=self.burl + next_href[0], callback=self.parse)

    def detail_tent(self, response):
        # Retrieve the item passed in through meta
        item = response.meta.get('items')
        item['position_con'] = ''.join(response.xpath('//ul[@class="squareli"]//text()').extract())
        yield item
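Passing the item through meta works, but Scrapy 1.7+ also offers cb_kwargs, which hands the item to the callback as a keyword argument instead of sharing the meta dict with middleware. A minimal sketch of the same hand-off:

yield scrapy.Request(url=item['position_link'], callback=self.detail_tent,
                     cb_kwargs={'item': item})

def detail_tent(self, response, item):
    item['position_con'] = ''.join(response.xpath('//ul[@class="squareli"]//text()').extract())
    yield item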
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json


class TenXunPipeline(object):
    def open_spider(self, spider):
        # Open the output file once when the spider starts
        self.f = open('tenxun.json', 'w', encoding='utf8')

    def process_item(self, item, spider):
        # Serialize each item as one JSON object per line
        conn = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.f.write(conn)
        return item

    def close_spider(self, spider):
        self.f.close()
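As the boilerplate comment notes, this pipeline only runs once it is registered in settings.py. Assuming the project module is named TenXun as above, the entry would look like:

ITEM_PIPELINES = {
    'TenXun.pipelines.TenXunPipeline': 300,
}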
items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class TenXunItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    position_name = scrapy.Field()  # job title
    position_link = scrapy.Field()  # detail-page link
    position_type = scrapy.Field()  # job category
    position_num = scrapy.Field()   # number of openings
    position_addr = scrapy.Field()  # location
    position_time = scrapy.Field()  # publish date
    position_con = scrapy.Field()   # job requirements
Storing the data in a database:
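The original section ends here without showing the database code. A minimal sketch of such a pipeline using Python's built-in sqlite3 module; the file, table, and column names are assumptions for illustration, not from the original:

import sqlite3


class TenXunDBPipeline(object):
    # Hypothetical pipeline that writes items to SQLite instead of a JSON file

    def open_spider(self, spider):
        self.conn = sqlite3.connect('tenxun.db')
        self.conn.execute(
            'CREATE TABLE IF NOT EXISTS positions ('
            'name TEXT, link TEXT, type TEXT, num TEXT, '
            'addr TEXT, time TEXT, con TEXT)'
        )

    def process_item(self, item, spider):
        self.conn.execute(
            'INSERT INTO positions VALUES (?, ?, ?, ?, ?, ?, ?)',
            (item['position_name'], item['position_link'], item['position_type'],
             item['position_num'], item['position_addr'], item['position_time'],
             item.get('position_con', '')),
        )
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.conn.close()

It would be registered in ITEM_PIPELINES the same way as TenXunPipeline above.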