Crawling Tencent HR job postings
Project layout:
1. Create the project: scrapy startproject tencent
2. Enter the tencent directory and generate the spider file: scrapy genspider tencent_hr tencent.com
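After these two commands, the generated project should look roughly like this (a sketch of the standard Scrapy layout; minor files may differ between Scrapy versions):

tencent/
    scrapy.cfg             # deploy configuration
    tencent/
        __init__.py
        items.py           # the data fields to scrape
        pipelines.py       # item pipelines
        settings.py        # project settings
        spiders/
            __init__.py
            tencent_hr.py  # the spider generated above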
tencent/tencent/spiders/tencent_hr.py  # the spider

# -*- coding: utf-8 -*-
import scrapy
from tencent.items import TencentItem  # import the Item


class TencentHrSpider(scrapy.Spider):
    name = 'tencent_hr'  # the spider's name
    allowed_domains = ['tencent.com']  # the domains the crawl is restricted to
    url = 'https://hr.tencent.com/position.php?&start='  # base url used to construct each page's request
    offset = 0  # multiple pages are crawled; each page's url suffix increases by 10, so offset selects the page
    start_urls = [url + str(offset)]  # the url the crawl starts from

    def parse(self, response):
        # required method: the response for each crawled page is handled here
        # the for loop below handles one page of data
        for each in response.xpath('//tr[@class="even"] | //tr[@class="odd"]'):  # | means "or" in XPath
            item = TencentItem()  # create an item to hold this row's data
            item['positiontname'] = each.xpath('.//a/text()').extract()[0]    # position name
            item['positiontlink'] = each.xpath('.//a/@href').extract()[0]     # position url
            item['positiontype'] = each.xpath('./td[2]/text()').extract()[0]  # position category
            item['peopleNum'] = each.xpath('./td[3]/text()').extract()[0]     # number of openings
            item['workLocation'] = each.xpath('./td[4]/text()').extract()[0]  # work location
            item['publishtime'] = each.xpath('./td[5]/text()').extract()[0]   # publish date
            item['pagenum'] = self.offset // 10  # used to track which page we are on
            yield item  # yield makes this method a generator that produces items one after another

        # one page has been handled, so issue a request for the next one
        if self.offset < 1680:
            self.offset += 10
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
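Once the spider is saved, it can be run from the project root with the crawl command (this assumes the spider name defined above; the JSON output is produced by the pipeline shown below):

scrapy crawl tencent_hr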
tencent/tencent/items.py  # the data fields to scrape

# -*- coding: utf-8 -*-
import scrapy


class TencentItem(scrapy.Item):
    positiontname = scrapy.Field()
    positiontlink = scrapy.Field()
    positiontype = scrapy.Field()
    peopleNum = scrapy.Field()
    workLocation = scrapy.Field()
    publishtime = scrapy.Field()
    pagenum = scrapy.Field()
tencent/tencent/settings.py  # the configuration file
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'tencent.pipelines.TencentPipeline': 300,
}
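As an aside, if all you need is the JSON file, Scrapy's built-in feed export can do the writing instead of a custom pipeline. A minimal sketch, assuming a reasonably recent Scrapy (2.1+, where the FEEDS setting exists; older versions use FEED_URI/FEED_FORMAT instead):

# in settings.py: let the built-in feed exporter write one JSON object per line
FEEDS = {
    'tencent.json': {'format': 'jsonlines', 'encoding': 'utf8'},
}

The walkthrough below keeps the hand-written pipeline, since it shows how item pipelines work.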
tencent/tencent/pipelines.py  # to use the pipeline, remember to spell the pipeline class's path correctly in settings.py above
# -*- coding: utf-8 -*-
import json


class TencentPipeline(object):
    # optional method: things to do before any data is stored
    def __init__(self):
        self.file = open('tencent.json', 'w', encoding='utf-8')

    # required method: store each item
    def process_item(self, item, spider):
        text = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.file.write(text)
        return item  # returning the item is required

    # optional method: things to do after the data is stored
    def close_spider(self, spider):
        self.file.close()
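Since the pipeline writes one JSON object per line, the result can be sanity-checked by reading the file back with plain Python (a minimal sketch; the field names are the ones defined in items.py):

import json

# read the scraped results back, one JSON object per line
with open('tencent.json', encoding='utf-8') as f:
    for line in f:
        job = json.loads(line)
        print(job['positiontname'], job['workLocation'])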