I've recently been learning a bit of data analysis, and I wanted to try it on something practical to make it more interesting. With graduation season here, I decided to scrape Lagou (拉勾网), and since I also happen to be working on a Java programming assignment, I scraped Java positions while I was at it. Below is my crawler code.
import requests
import json
import time

positions = []
urls = "https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false"  # the Ajax endpoint that actually serves the listings
header = {
    'Host': 'www.lagou.com',
    'Referer': 'https://www.lagou.com/jobs/list_Java?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=',
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Mobile Safari/537.36',
    'X-Anit-Forge-Code': '0',
    'X-Anit-Forge-Token': None,
    'X-Requested-With': 'XMLHttpRequest',
}

for x in range(1, 31):
    y = 'true' if x == 1 else 'false'           # the 'first' field is true only for page 1
    data = {'first': y, 'pn': x, 'kd': 'Java'}  # page number and search keyword
    r = requests.post(url=urls, headers=header, data=data)
    json_result = r.json()  # parse the response as JSON
    position_page = json_result['content']['positionResult']['result']  # the postings on this page
    for position in position_page:
        position_dict = {
            '崗位名稱': position['positionName'],
            '地點': position['city'],
            '公司名稱': position['companyFullName'],
            '薪水': position['salary'],
            '工作經驗': position['workYear'],
        }  # keep only the fields we need, stored as a dict
        positions.append(position_dict)
    line = json.dumps(positions, ensure_ascii=False)  # serialize everything scraped so far
    time.sleep(20)  # Lagou rate-limits frequent requests, so pause between pages
    with open('lagou.json', 'w', encoding='utf-8') as fp:
        fp.write(line)
    print("Page %d scraped" % x)
print("Scraping finished")
I'm no expert on crawlers, so please point out any mistakes. Lagou's anti-scraping measures are actually quite good. First of all, the listings are loaded asynchronously; if you don't know about asynchronous loading, you may well fail to get any content at all. I won't explain async loading in depth here. The short version is that you have to find the URL that actually serves the data we want, which is urls = "https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false", not the address you see in the browser's URL bar. Once you have found it, do you think you can scrape the real data? Note that I say "real data": Lagou is rather sly, and if you don't send a complete set of request headers it will return a pile of fake data. If you don't look closely, you'll be fooled; try it yourself if you don't believe me. That's enough about the crawler; now let's look at the analysis of the data.
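To make that "fake data" trap easier to catch, here is a small sanity check, only a sketch, that probes a single page before committing to all thirty. It assumes, which Lagou does not guarantee, that a rejected request comes back without the content → positionResult structure that genuine responses contain; probe_headers is simply whatever header set you want to test, e.g. the full header dict from the crawler above.

# Sketch: probe one page of the Ajax endpoint and check whether the response
# looks like real listing data before scraping everything.
# Assumption: a blocked/"fake" response lacks the content -> positionResult keys.
import requests

ajax_url = "https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false"
probe_headers = {  # the header set under test; swap in the full header dict from the crawler above
    'Referer': 'https://www.lagou.com/jobs/list_Java?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=',
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Mobile Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}

resp = requests.post(ajax_url, headers=probe_headers, data={'first': 'true', 'pn': 1, 'kd': 'Java'})
payload = resp.json()

content = payload.get('content') or {}
position_result = content.get('positionResult') or {}
postings = position_result.get('result')

if postings:
    print("Looks like real data: %d postings on page 1" % len(postings))
else:
    # Anything else is treated as the blocked/"fake" response described above.
    print("Suspicious response, check your request headers:", payload)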
I think the charts make everything perfectly clear. Below is the code I used to produce them; please point out anything that could be improved.
# coding: utf-8
import pandas as pd
import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = ['SimHei']  # use a Chinese font so the chart labels are not garbled

result = pd.read_json("C:/Users/JDELL/Desktop/lagou.json")

# Bar chart: distribution of required work experience
result.工作經驗.value_counts().plot(kind="bar")
plt.title("工作經驗分佈圖")

# Bar chart: distribution of job locations
plt.figure(figsize=(10, 5))
result.地點.value_counts().plot(kind="bar")
plt.title("工作地點分佈圖")

# Bar chart: salary distribution in Beijing
plt.figure(figsize=(10, 6))
result.薪水[result.地點 == "北京"].value_counts().plot(kind="bar")
plt.title("北京工資分佈")

# Pie chart: overall salary distribution (counts entered by hand from the scraped data)
plt.figure(figsize=(10, 8))
labels = ('15k-25k', '15k-30k', '10k-20k', '10k-15k', '20k-40k', '15k-20k', '8k-15k')
explode = [0.1, 0, 0, 0, 0, 0, 0]
data = [60, 46, 42, 31, 21, 21, 15]
plt.title("薪水分佈圖")
plt.pie(x=data, labels=labels, autopct='%3.1f %%', shadow=True, explode=explode)

# Pie chart: salary distribution in Shenzhen
t = result.薪水[result.地點 == "深圳"].value_counts()  # select the Shenzhen rows first
plt.figure(figsize=(8, 6))
plt.title("深圳薪水分佈餅狀圖")
plt.pie(x=t.values, labels=t.index, autopct='%3.1f %%', shadow=True)

# Pie chart: distribution of position titles
t = result.崗位名稱.value_counts()
plt.figure(figsize=(15, 20))
plt.title("崗位分佈餅狀圖")
plt.pie(x=t.values, labels=t.index, autopct='%3.1f %%', shadow=True)

plt.show()
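One optional refinement, just a sketch and not part of my original script: the counts fed to the overall salary pie chart are typed in by hand, so they can drift out of sync with lagou.json. They could be computed from the DataFrame instead; it plugs straight into the script above, and top_n = 7 is an arbitrary choice made here to match the seven bands listed.

# Sketch: derive the salary pie chart from the DataFrame instead of hard-coding the counts.
top_n = 7                                        # arbitrary: keep the 7 most common salary bands
salary_counts = result.薪水.value_counts().head(top_n)

plt.figure(figsize=(10, 8))
explode = [0.1] + [0] * (top_n - 1)              # pull out the most common band, as in the chart above
plt.title("薪水分佈圖")
plt.pie(x=salary_counts.values, labels=salary_counts.index,
        autopct='%3.1f %%', shadow=True, explode=explode)
plt.show()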