Installing Scrapy has been covered elsewhere and will not be repeated here.
In the console, run scrapy startproject tencent to create a crawler project named tencent.
Then cd tencent.
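For orientation, scrapy startproject generates a project skeleton roughly like the following (the exact files vary a little between Scrapy versions):

tencent/
    scrapy.cfg            # deploy configuration
    tencent/
        __init__.py
        items.py          # item definitions (edited below)
        pipelines.py      # item pipelines (edited below)
        settings.py       # project settings (edited below)
        spiders/          # spider modules go here
            __init__.py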
Open the tencent project in PyCharm.
Next, build the item file, items.py:
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class TencentItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # position name
    positionname = scrapy.Field()
    # detail link
    positionLink = scrapy.Field()
    # position category
    positionType = scrapy.Field()
    # number of openings
    peopleNum = scrapy.Field()
    # work location
    workLocation = scrapy.Field()
    # publish time
    publishTime = scrapy.Field()
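As a quick sanity check (an illustrative snippet, not part of the project files), a TencentItem behaves much like a dict, which is exactly how the spider below fills it in and how the pipeline later converts it:

from tencent.items import TencentItem

item = TencentItem()
item["positionname"] = "Backend Engineer"  # sample value for illustration only
print(item["positionname"])   # fields read back like dict keys
print(dict(item))             # items convert to plain dicts, as the pipeline relies on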
Next, create a new file tencentPostition.py in the spiders folder. The code is below; the comments explain each step.
# -*- coding: utf-8 -*-
import scrapy
from tencent.items import TencentItem


class TencentpostitionSpider(scrapy.Spider):
    # spider name
    name = 'tencent'
    # allowed crawl domain
    allowed_domains = ['tencent.com']
    # base URL
    url = 'http://hr.tencent.com/position.php?&start='
    # page offset
    offset = 0
    # default start URL
    start_urls = [url + str(offset)]

    def parse(self, response):
        # XPath rules matching both row styles of the listing table
        for each in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
            item = TencentItem()
            # position name
            item["positionname"] = each.xpath("./td[1]/a/text()").extract()[0]
            # detail link
            item["positionLink"] = each.xpath("./td[1]/a/@href").extract()[0]
            # position category (may be missing on some rows)
            try:
                item["positionType"] = each.xpath("./td[2]/text()").extract()[0]
            except IndexError:
                item["positionType"] = '空'  # '空' means "empty"
            # number of openings
            item["peopleNum"] = each.xpath("./td[3]/text()").extract()[0]
            # work location
            item["workLocation"] = each.xpath("./td[4]/text()").extract()[0]
            # publish time
            item["publishTime"] = each.xpath("./td[5]/text()").extract()[0]
            # hand the data to the pipeline
            yield item
        # advance to the next page of results
        if self.offset < 2620:
            self.offset += 10
            # hand the new request back to the engine
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
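Before running the full spider, the XPath rules can be checked interactively in Scrapy's shell (assuming the recruitment page is still online at this URL):

scrapy shell "http://hr.tencent.com/position.php?&start=0"

# then, at the interactive prompt:
>>> rows = response.xpath("//tr[@class='even'] | //tr[@class='odd']")
>>> rows[0].xpath("./td[1]/a/text()").extract()[0]   # first position name, if the rules match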
Next, configure the pipeline file pipelines.py as follows:
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import json


class TencentPipeline(object):
    def __init__(self):
        # open the output file in the constructor
        self.fileName = open("tencent.json", "wb")

    def process_item(self, item, spider):
        # convert the item to a dict, then to JSON
        text = json.dumps(dict(item), ensure_ascii=False) + "\n"
        # write to the file, encoded as UTF-8
        self.fileName.write(text.encode("utf-8"))
        # return the item
        return item

    def close_spider(self, spider):
        # close the file when the spider closes
        self.fileName.close()
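As an aside, Scrapy ships with a JsonLinesItemExporter that produces the same one-JSON-object-per-line output. An equivalent pipeline built on it would look roughly like this (a sketch; the class name TencentExporterPipeline is just an example):

from scrapy.exporters import JsonLinesItemExporter


class TencentExporterPipeline(object):
    def open_spider(self, spider):
        # open the file and bind the exporter when the spider starts
        self.file = open("tencent.json", "wb")
        self.exporter = JsonLinesItemExporter(self.file, ensure_ascii=False)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        # the exporter handles dict conversion and UTF-8 encoding
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()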
Next, the settings.py file needs to be configured.
Do not obey the robots.txt rules:
ROBOTSTXT_OBEY = False
# download delay
DOWNLOAD_DELAY = 3
# set the default request headers
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
}
# which pipeline handles the items: package.module.ClassName
ITEM_PIPELINES = {
    'tencent.pipelines.TencentPipeline': 300,
}
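To confirm the settings are picked up, Scrapy can read them back from inside the project directory:

scrapy settings --get ROBOTSTXT_OBEY   # should print False
scrapy settings --get DOWNLOAD_DELAY   # should print 3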
Next, enter the following in the console:
scrapy crawl tencent
and the crawl will run, writing its results to tencent.json.
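Alternatively, if the custom pipeline is not needed, Scrapy's built-in feed export can write the items directly (positions.jl here is just an example filename; the .jl extension selects the JSON Lines format):

scrapy crawl tencent -o positions.jl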
Source code:
https://github.com/ingxx/scrapy_to_tencent