Scraping Tencent Recruitment with Scrapy

scrapy startproject insist                  # create the project
scrapy genspider teng careers.tencent.com   # create the spider (spider name + domain)
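
These two commands generate the standard Scrapy project skeleton, abridged here to the files edited below (genspider creates teng.py under spiders/):

insist/
    scrapy.cfg
    insist/
        items.py
        pipelines.py
        settings.py
        spiders/
            teng.py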

items.py
# the fields we want to scrape
import scrapy

class InsistItem(scrapy.Item):
    # define the fields for your item here like:
    positionname = scrapy.Field()   # job title
    type = scrapy.Field()           # business group (BG)
    place = scrapy.Field()          # work location
    mian = scrapy.Field()           # job category
    time = scrapy.Field()           # last update time
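
A scrapy.Item behaves like a dict: it can be built from keyword arguments and converted with dict(), which is why the pipeline below can serialize it with json.dumps(dict(item)). A quick illustration with a made-up value:

item = InsistItem(positionname='backend engineer')
print(dict(item))   # {'positionname': 'backend engineer'}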


pipelines.py
# save the scraped data to a database or a JSON file
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

import json

class InsistPipeline(object):
    def __init__(self):
        # open the output file once, when the pipeline is instantiated
        self.f = open('teng.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # item: the Item object yielded by the spider
        # every pipeline must implement this method
        content = json.dumps(dict(item), ensure_ascii=False) + ",\n"
        self.f.write(content)
        return item

    def close_spider(self, spider):
        # close the file when the spider finishes
        self.f.close()
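
As the template comment above says, the pipeline only runs once it is registered in settings.py; the integer (0–1000) sets the order in which pipelines run, lower first:

ITEM_PIPELINES = {
    'insist.pipelines.InsistPipeline': 300,
}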

teng.py
import scrapy
import json
from insist.items import InsistItem

class TengSpider(scrapy.Spider):
    name = 'teng'
    allowed_domains = ['careers.tencent.com']
    # the recruitment site loads job listings from a JSON API, so we request it directly
    baseURL = 'https://careers.tencent.com/tencentcareer/api/post/Query?pageSize=10&pageIndex='
    offset = 1
    start_urls = [baseURL + str(offset)]

    def parse(self, response):
        contents = json.loads(response.text)
        jobs = contents['Data']['Posts']
        for job in jobs:
            # create a fresh item for each job so yielded items don't share state
            item = InsistItem()
            item['positionname'] = job['RecruitPostName']
            item['type'] = job['BGName']
            item['place'] = job['LocationName']
            item['mian'] = job['CategoryName']
            item['time'] = job['LastUpdateTime']
            yield item
        # follow the next page until the first 10 pages have been crawled
        if self.offset < 10:
            self.offset += 1
            yield scrapy.Request(self.baseURL + str(self.offset), callback=self.parse)
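
For reference, the API returns JSON shaped roughly like this; the nesting matches what parse() reads, and the field values here are placeholders, not real data:

{
    "Data": {
        "Posts": [
            {
                "RecruitPostName": "...",
                "BGName": "...",
                "LocationName": "...",
                "CategoryName": "...",
                "LastUpdateTime": "..."
            }
        ]
    }
}

Run the spider from the project root; each item is appended to teng.json by the pipeline:

scrapy crawl teng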