scrapy將爬取的數據存入MySQL數據庫

items.py

import scrapy


class InsistItem(scrapy.Item):
    """Container for one job posting collected by the spider.

    Every attribute is declared as a ``scrapy.Field``; the spider's
    ``parse`` callback fills them from the job-listing JSON response.
    """

    # Filled from the response's "RecruitPostName" value.
    positionname = scrapy.Field()
    # Filled from the response's "BGName" value.
    type = scrapy.Field()
    # Filled from the response's "LocationName" value.
    place = scrapy.Field()
    # Filled from the response's "CategoryName" value.
    mian = scrapy.Field()
    # Filled from the response's "LastUpdateTime" value.
    time = scrapy.Field()

pipelines.py

import json
import scrapy
import pymysql
from scrapy.pipelines.images import ImagesPipeline
class InsistPipeline(object):
    """Item pipeline that inserts every scraped job into the MySQL ``job`` table.

    One connection and one cursor are opened when the pipeline is
    constructed; both are released again in ``close_spider``.
    """

    # Parameterized statement: the driver binds the values, they are never
    # interpolated into the SQL text.
    INSERT_SQL = 'INSERT INTO job(name,type,place,mian,time) VALUES(%s,%s,%s,%s,%s) '

    def __init__(self):
        # NOTE(review): the connection settings, including the password, are
        # hard-coded here; consider reading them from the project settings.
        self.db = pymysql.connect(
            host='localhost',
            user='dsuser',
            password='badpassword',  # `passwd` is a deprecated alias
            database='dsdb',         # `db` is a deprecated alias
            charset='utf8',
            port=3306,
        )
        self.cur = self.db.cursor()

    def process_item(self, item, spider):
        """Insert one item as a row and commit it.

        Args:
            item: mapping with the keys ``positionname``, ``type``,
                ``place``, ``mian`` and ``time``.
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item``, so later pipelines can keep processing it.

        Raises:
            KeyError: if one of the expected keys is missing from ``item``.
            Exception: any error raised by the insert or the commit is
                re-raised after the transaction has been rolled back.
        """
        row = (
            item['positionname'],
            item['type'],
            item['place'],
            item['mian'],
            item['time'],
        )
        try:
            self.cur.execute(self.INSERT_SQL, row)
            self.db.commit()
        except Exception:
            # Leave the shared connection in a clean state for the next
            # item, then let Scrapy report the failure.
            self.db.rollback()
            raise
        return item

    def close_spider(self, spider):
        """Release the cursor and the connection when the spider finishes."""
        try:
            self.cur.close()
        finally:
            # Close the connection even if closing the cursor failed.
            self.db.close()

insists.py
#爬蟲程序
import scrapy
from insist.items import InsistItem
import json
class InsistsSpider(scrapy.Spider):
    """Spider that walks the Tencent careers job-listing JSON endpoint page by page.

    Each response is parsed as JSON, one ``InsistItem`` is yielded per
    entry under ``Data -> Posts``, and the next page is requested until
    the page limit is reached.
    """

    name = 'insists'
    allowed_domains = ['careers.tencent.com']
    # Disabled alternative start URL:
    #start_urls =['https://careers.tencent.com/search.html?index=']
    baseURL = 'https://careers.tencent.com/tencentcareer/api/post/Query?pageSize=10&pageIndex='
    # Index of the page requested most recently; advanced by parse().
    offset = 1
    start_urls = [baseURL + str(offset)]

    def parse(self, response):
        """Yield one item per job in the response, then request the next page.

        Args:
            response: the response for one page of the JSON endpoint.

        Yields:
            ``InsistItem`` objects, followed by at most one
            ``scrapy.Request`` for the following page.
        """
        contents = json.loads(response.text)
        # Tolerate a missing or null "Data"/"Posts" value so that an empty
        # page ends the crawl quietly instead of raising.
        jobs = (contents.get('Data') or {}).get('Posts') or []
        if not jobs:
            return

        for job in jobs:
            # Build a fresh item per job: re-using one instance would make
            # every yielded item the same, repeatedly overwritten, object.
            item = InsistItem()
            item['positionname'] = job['RecruitPostName']
            item['type'] = job['BGName']
            item['place'] = job['LocationName']
            item['mian'] = job['CategoryName']
            item['time'] = job['LastUpdateTime']
            yield item  # hand the item over, then continue with the rest

        # offset starts at 1 and is advanced while it is <= 5, so pages 1
        # through 6 are requested in total.
        if self.offset <= 5:
            self.offset += 1
            url = self.baseURL + str(self.offset)
            yield scrapy.Request(url, callback=self.parse)

相關文章
相關標籤/搜索