0.0.1 Scrapy crawler example

A walkthrough of the Scrapy crawl workflow.

Target: the Tencent HR job postings.

Project setup and layout:

1. Create the project: scrapy startproject tencent

2. Enter the tencent directory and generate the spider: scrapy genspider tencent_hr tencent.com
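
After these two commands the generated skeleton looks roughly like this (middlewares.py only appears in newer Scrapy releases):

tencent/
├── scrapy.cfg                # deploy configuration
└── tencent/                  # the project's Python package
    ├── __init__.py
    ├── items.py              # data fields to scrape
    ├── middlewares.py
    ├── pipelines.py          # item post-processing / storage
    ├── settings.py           # project configuration
    └── spiders/
        ├── __init__.py
        └── tencent_hr.py     # the spider generated by genspider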

tencent/tencent/spiders/tencent_hr.py    # the spider itself

# -*- coding: utf-8 -*-
import scrapy
from tencent.items import TencentItem  # import the Item defined in items.py

class TencentHrSpider(scrapy.Spider):
    name = 'tencent_hr'                 # the spider's name, used by `scrapy crawl`
    allowed_domains = ['tencent.com']   # restrict the crawl to this domain
    url = 'https://hr.tencent.com/position.php?&start='  # base URL of the listing pages
    offset = 0                          # the listing paginates in steps of 10; offset picks the page
    start_urls = [url + str(offset)]    # the first page to fetch

    def parse(self, response):          # required callback: every fetched page lands here
        # extract one job posting per table row
        for each in response.xpath('//tr[@class="even"] | //tr[@class="odd"]'):  # | means "or"
            item = TencentItem()        # a fresh item to hold this row's data
            item['positiontname'] = each.xpath('.//a/text()').extract()[0]    # job title
            item['positiontlink'] = each.xpath('.//a/@href').extract()[0]     # job URL
            item['positiontype'] = each.xpath('./td[2]/text()').extract()[0]  # job category
            item['peopleNum'] = each.xpath('./td[3]/text()').extract()[0]     # number of openings
            item['workLocation'] = each.xpath('./td[4]/text()').extract()[0]  # work location
            item['publishtime'] = each.xpath('./td[5]/text()').extract()[0]   # publish date
            item['pagenum'] = self.offset // 10                               # which page this is
            yield item  # parse() is a generator: each yielded item is handed to the pipeline
        # this page is done; schedule the next one until the last page (offset 1680)
        if self.offset < 1680:
            self.offset += 10
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
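
Before wiring up the pipeline, the XPaths are easy to sanity-check in scrapy shell; a quick session (assuming the listing page still renders jobs as even/odd table rows):

scrapy shell 'https://hr.tencent.com/position.php?&start=0'
>>> rows = response.xpath('//tr[@class="even"] | //tr[@class="odd"]')
>>> rows[0].xpath('.//a/text()').extract()[0]   # should print the first job title

Once the selectors look right, run the spider from the project root with: scrapy crawl tencent_hr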

tencent/tencent/items.py     # the data fields to scrape

# -*- coding: utf-8 -*-

import scrapy

class TencentItem(scrapy.Item):
    positiontname = scrapy.Field()   # job title
    positiontlink = scrapy.Field()   # job URL
    positiontype = scrapy.Field()    # job category
    peopleNum = scrapy.Field()       # number of openings
    workLocation = scrapy.Field()    # work location
    publishtime = scrapy.Field()     # publish date
    pagenum = scrapy.Field()         # page the row came from
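
An Item behaves like a dict, which is why the pipeline below can call dict(item) before serializing it; a minimal illustration:

>>> item = TencentItem()
>>> item['peopleNum'] = '2'
>>> dict(item)
{'peopleNum': '2'}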

tencent/tencent/settings.py       # the configuration file

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   'tencent.pipelines.TencentPipeline': 300,
}
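
One more setting worth checking: newer Scrapy projects ship with ROBOTSTXT_OBEY = True, which can silently block a crawl like this one, and many sites expect a browser-like User-Agent (the value below is only an example):

# obey robots.txt rules; disable at your own discretion
ROBOTSTXT_OBEY = False
# identify as a regular browser (example value)
USER_AGENT = 'Mozilla/5.0'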

tencent/tencent/pipelines.py    # to use this pipeline, the class path registered in settings.py above must match exactly

# -*- coding: utf-8 -*-

import json

class TencentPipeline(object):
    # optional: set things up before any item arrives
    def __init__(self):
        self.file = open('tencent.json', 'w', encoding='utf-8')

    # required: called once for every item the spider yields
    def process_item(self, item, spider):
        text = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.file.write(text)  # the file is opened as UTF-8, so no manual encode is needed
        return item  # returning the item is required so later pipelines can still see it

    # optional: clean up after the spider closes (Scrapy passes the spider instance)
    def close_spider(self, spider):
        self.file.close()
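
For a dump like this the hand-written pipeline is optional: Scrapy's built-in feed exports can write the items directly. From the project root:

scrapy crawl tencent_hr -o tencent.json

(On Scrapy versions where JSON output escapes non-ASCII characters, also set FEED_EXPORT_ENCODING = 'utf-8' in settings.py so the Chinese text stays readable.)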