首先準備python3+scrapy+mysql+pycharm。。。
此次咱們選擇爬取智聯招聘網站的企業招聘信息,首先咱們有針對的查看網站的html源碼,發現其使用的是js異步加載的方式,直接從服務端調取json數據,這就意味着咱們用地址欄的網址獲取的網站內容是不全的,沒法得到想要的數據。
那麼咱們用什麼方式獲取想要的數據呢,正所謂道高一尺魔高一丈,有反爬蟲就有反反爬蟲,固然咱們不用那麼麻煩,經過分析頁面的加載有針對性的抓包獲取信息進行分析,咱們會發現每次刷新或者搜索頁面時候,除了會加載許多圖片、廣告等信息外,還加載了一個包,這個包裏就有咱們想要的全部信息,而且服務端都給咱們打包成json格式了,這樣看似複雜,實則簡化了咱們對數據進行過濾的步驟。
直接從請求頭信息中找到請求的url源頭,那麼就能直接獲取json數據了。 怎麼樣,是不是感覺到了世界的友好呢?
分析完畢,接下來就能夠編寫爬蟲程序了。。。
1、建立爬蟲項目
在命令行中進入指定的目錄,建立爬蟲項目
scrapy startproject zhilian
而後就是建立爬蟲文件
scrapy genspider zhaopin "sou.zhaopin.com"
#要把http://www去掉,由於爬蟲項目運行時會自動加上,這裏也是爲了避免不必要的錯誤
2、編寫程序
首先編寫item文件,咱們有選擇的爬取幾個關鍵數據
import scrapy


class ZhilianItem(scrapy.Item):
    """Container for the fields collected from one job posting."""

    # Job title
    jobName = scrapy.Field()
    # Company name
    companyName = scrapy.Field()
    # Work location
    workSite = scrapy.Field()
    # Date the posting was last updated
    updateDate = scrapy.Field()
    # Salary range
    salaryLevel = scrapy.Field()
    # Keywords attached to the posting
    jobKeyWord = scrapy.Field()
而後就是編寫爬蟲文件了咱們的命名爲zhaopin.py
# -*- coding: utf-8 -*-
import json

import scrapy

from zhilian.items import ZhilianItem


class zhaopinSpider(scrapy.Spider):
    """Collect job postings from the Zhaopin JSON search endpoint.

    The search page loads its results asynchronously, so the spider requests
    the JSON endpoint directly and pages through it by increasing the
    ``start`` query parameter by ``page_size`` until ``max_offset`` is reached.
    """

    name = 'javaDevelop'
    allowed_domains = ['sou.zhaopin.com']

    # Number of results requested per page; also the pagination step.
    page_size = 90
    # Pagination stops once ``offset`` reaches this value.
    max_offset = 450
    # Value of the ``start`` parameter for the page requested most recently.
    offset = 0

    url1 = "https://fe-api.zhaopin.com/c/i/sou?start="
    # ``kw`` is the percent-encoded search keyword (UTF-8 for the Chinese word
    # meaning "economy").
    url2 = "&pageSize=" + str(page_size) + "&cityId=530&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw=%E7%BB%8F%E6%B5%8E&kt=3"
    start_urls = (
        # Simplified search URL: only ``start`` changes from page to page.
        url1 + str(offset) + url2,
    )
    print(start_urls)

    def parse(self, response):
        """Yield one ``ZhilianItem`` per result, then request the next page.

        Args:
            response: Response whose body is the JSON document returned by the
                search endpoint; postings are read from ``data.results``.

        Yields:
            ``ZhilianItem`` objects, followed by at most one ``scrapy.Request``
            for the next page.
        """
        # Parse the body once and iterate over what is actually there; a page
        # may hold fewer than ``page_size`` postings.
        results = json.loads(response.text)['data']['results']

        for job in results:
            item = ZhilianItem()
            # Job title
            item['jobName'] = job['jobName']
            # Company name
            item['companyName'] = job['company']['name']
            # Work location
            item['workSite'] = job['city']['display']
            # Date the posting was last updated
            item['updateDate'] = job['updateDate']
            # Salary range
            item['salaryLevel'] = job['salary']
            # Keywords: the posting's welfare tags
            item['jobKeyWord'] = job['welfare']
            yield item

        # An empty page means there is nothing further to fetch.
        if results and self.offset < self.max_offset:
            self.offset += self.page_size
            # The API host (fe-api.zhaopin.com) is not listed in
            # allowed_domains, so dont_filter=True is required to keep
            # Scrapy's offsite filtering from dropping this request.
            yield scrapy.Request(
                self.url1 + str(self.offset) + self.url2,
                callback=self.parse,
                dont_filter=True,
            )
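#dont_filter=True這個參數至關重要:後續請求的域名 fe-api.zhaopin.com 不在 allowed_domains(sou.zhaopin.com)之內,若不加這個參數,請求會被 Scrapy 的站外過濾(OffsiteMiddleware)丟棄,你會發現爬蟲爬取完第一頁後就不會再爬取了。另外一種作法是把 fe-api.zhaopin.com 加入 allowed_domains。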
接下來就是寫管道文件了,這裏我用了兩種方式,一種是寫到數據庫中,還有一種是寫到本地txt文件中
import pymysql


class zhaoPipeline(object):
    """Insert every scraped job posting as a row of the ``zhaopin`` table."""

    def __init__(self, host='172.18.96.151', user='root', password='123456',
                 db='zhilian', charset='utf8'):
        """Open the MySQL connection and a cursor used for all inserts.

        Args:
            host: MySQL server address.
            user: MySQL user name.
            password: Password for ``user``.
            db: Name of the database that holds the ``zhaopin`` table.
            charset: Connection character set.
        """
        # NOTE(review): the defaults keep the original hard-coded credentials
        # so a no-argument construction still works; prefer supplying real
        # values from configuration rather than source code.
        self.conn = pymysql.connect(host=host,
                                    user=user,
                                    password=password,
                                    db=db,
                                    charset=charset)
        self.cur = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert one item and hand it on to the next pipeline.

        Args:
            item: Scraped item providing ``jobName``, ``companyName``,
                ``workSite``, ``updateDate``, ``salaryLevel`` and
                ``jobKeyWord``.
            spider: Spider that produced the item (unused).

        Returns:
            The unchanged ``item``.

        Raises:
            pymysql.MySQLError: If the insert or commit fails; the
                transaction is rolled back first.
        """
        data = [
            item['jobName'],        # job title
            item['companyName'],    # company name
            item['workSite'],       # work location
            item['updateDate'],     # date the posting was last updated
            item['salaryLevel'],    # salary range
            # Keywords are stored as one comma-separated string; a missing
            # (None) value is stored as an empty string.
            ','.join(item['jobKeyWord'] or []),
        ]
        print(data)
        print("======================================")
        sql = """ insert into zhaopin (jobname,companyname,worksite,updatedate,salarylevel,jobkeyword) values (%s,%s,%s,%s,%s,%s) """
        # NOTE(review): on long crawls the server may drop an idle connection;
        # calling self.conn.ping(reconnect=True) here would re-establish it.
        try:
            self.cur.execute(sql, data)
            self.conn.commit()
        except pymysql.MySQLError:
            # Do not leave a half-finished transaction on the connection.
            self.conn.rollback()
            raise
        # Later pipelines receive whatever this method returns, so the item
        # must be returned rather than implicitly returning None.
        return item

    def close_spider(self, spider):
        """Release the cursor and the connection when the spider finishes."""
        self.cur.close()
        self.conn.close()


class ZhilianPipeline(object):
    """Write every scraped job posting as one UTF-8 line of ``java.txt``."""

    def __init__(self):
        """Open (and truncate) ``java.txt`` in the current working directory."""
        # Despite its name, this attribute holds the open file object.
        self.filename = open("java.txt", 'wb')

    def process_item(self, item, spider):
        """Append one space-separated line describing ``item``.

        Args:
            item: Scraped item providing ``jobName``, ``companyName``,
                ``workSite``, ``updateDate``, ``salaryLevel`` and
                ``jobKeyWord``.
            spider: Spider that produced the item (unused).

        Returns:
            The unchanged ``item``.
        """
        fields = [
            item['jobName'],        # job title
            item['companyName'],    # company name
            item['workSite'],       # work location
            item['updateDate'],     # date the posting was last updated
            item['salaryLevel'],    # salary range
            ','.join(item['jobKeyWord'] or []),  # keywords, comma-separated
        ]
        # Build the whole line as text and encode it once at the I/O boundary.
        line = ' '.join(fields) + '\n'
        self.filename.write(line.encode('utf-8'))
        return item

    def close_spider(self, spider):
        """Close the output file when the spider finishes."""
        self.filename.close()
手把手教學,咱們附上建表語句
-- One row per scraped job posting; filled by the MySQL item pipeline.
CREATE TABLE zhaopin (
    id          INT(10)     NOT NULL PRIMARY KEY AUTO_INCREMENT,  -- surrogate key
    jobname     VARCHAR(40),                                      -- job title
    companyname VARCHAR(20),                                      -- company name
    worksite    VARCHAR(10),                                      -- work location
    updatedate  DATETIME,                                         -- date the posting was last updated
    salarylevel VARCHAR(10),                                      -- salary range
    jobkeyword  VARCHAR(40)                                       -- comma-separated keywords
);
而後就剩下最後的設置setting了,下面三個關鍵的地方要改
# Do not apply robots.txt rules to the requests this project makes.
ROBOTSTXT_OBEY = False

# Headers added to every request; a browser User-Agent is sent in place of
# Scrapy's own. The remaining template entries are left disabled.
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36',
    # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # 'Accept-Language': 'en',
}

# Enabled item pipelines. Pipelines run in ascending order of their value, so
# zhaoPipeline (200) processes each item before ZhilianPipeline (300).
ITEM_PIPELINES = {
    'zhilian.pipelines.ZhilianPipeline': 300,
    'zhilian.pipelines.zhaoPipeline': 200,
}
3、運行爬蟲
scrapy crawl javaDevelop  # 參數是爬蟲類中 name 屬性的值('javaDevelop'),而不是文件名 zhaopin
等待片刻,刷新數據表
OVER。。。