思路:
1.爬蟲已經抓取了每個職位的工作地點,因此可以用 for 循環加 if 語句,按城市篩選出對應的工資數據;
2.將數據清洗的程序,封裝爲一個函數,方便使用;
3.將直方圖和餅圖程序也封裝爲函數;
4.進行數據可視化分析;
以下爲代碼:
# Visualise Python job demand and salary distribution per city, using the
# job postings previously crawled from 51job into MySQL.
import bisect
import os
import re

# Third-party packages (pymysql, pandas, matplotlib) are imported inside the
# functions that need them, so the pure salary-parsing helpers below can be
# imported and tested without a database driver or a plotting backend.

# Directory the charts are written to (the location used by the original script).
OUTPUT_DIR = "D:/python/爬蟲/51job分析/"

# ASCII slug (used in chart titles and file names) -> spellings of the city
# name to look for in the address column.  Both Traditional and Simplified
# spellings are listed where they differ.
CITIES = {
    "beijing": ("北京",),
    "shanghai": ("上海",),
    "shenzheng": ("深圳",),
    "hangzhou": ("杭州",),
    "nanjing": ("南京",),
    "wuxi": ("無錫", "无锡"),
}

# (console heading, city slug) in the order the per-city reports are produced.
REPORTS = (
    ("北京:", "beijing"),
    ("上海", "shanghai"),
    ("深圳", "shenzheng"),
    ("南京", "nanjing"),
    ("杭州", "hangzhou"),
    ("無錫", "wuxi"),
)

# Salary text such as "1-1.5萬/月", "6-8千/月", "10-20萬/年", "2萬以上/月".
# NOTE(review): the crawled data is presumably in Simplified Chinese, so the
# unit accepts both "万" and "萬" - confirm against the table contents.
_SALARY_RE = re.compile(
    r"(?P<low>\d+(?:\.\d+)?)"
    r"(?:-(?P<high>\d+(?:\.\d+)?))?"
    r"(?P<unit>[万萬千])"
    r"(?P<bound>以上|以下)?"
    r"/(?P<period>[月年])"
)

# Multipliers that turn a parsed figure into thousands (k).
_UNIT_TO_K = {"万": 10.0, "萬": 10.0, "千": 1.0}
# Divisors that turn a per-period figure into a per-month figure.
_MONTHS_PER_PERIOD = {"月": 1.0, "年": 12.0}

# (unit is 万?, period) -> category name for plain "low-high" salaries.
_RANGE_CATEGORY = {
    (True, "月"): "wan_yue",
    (True, "年"): "wan_nian",
    (False, "月"): "qian_yue",
}

# Order in which the categories are reported and concatenated.
_CATEGORY_ORDER = ("wan_yue", "wan_yis", "wan_nian", "qian_yue")

# Upper edges (exclusive) of the salary bands shown in the pie chart, in k/month.
_BAND_EDGES = (5.0, 10.0, 20.0, 30.0, 40.0)
_BAND_LABELS = ['under-5k', '5k-10k', '10k-20k', '20k-30k', '30k-40k', 'over-40k']


def load_records():
    """Read the crawled postings and return them as (address, salary) pairs.

    Returns:
        A list of ``(adress, money)`` tuples, one per row of ``python_1``.
        Missing values are passed through untouched; callers filter them.
    """
    import pandas as pda
    import pymysql

    # SECURITY: the original script hard-coded the credentials.  They are kept
    # as fallbacks for backward compatibility, but should be supplied through
    # the environment instead of living in source control.
    conn = pymysql.connect(
        host=os.environ.get("JOB_DB_HOST", "127.0.0.1"),
        user=os.environ.get("JOB_DB_USER", "root"),
        passwd=os.environ.get("JOB_DB_PASSWORD", "lxw19961230"),
        db=os.environ.get("JOB_DB_NAME", "51job"),
    )
    try:
        data = pda.read_sql("select * from python_1", conn)
    finally:
        # Always release the connection, even if the query fails.
        conn.close()
    # Pair the two columns row by row so address and salary can never drift
    # out of alignment (dropping nulls per column would misalign them).
    return list(zip(data["adress"], data["money"]))


def group_by_city(records, cities=None):
    """Count postings and collect salary strings for every configured city.

    Args:
        records: iterable of ``(address, salary)`` pairs; either item may be
            missing (``None``/NaN).
        cities: mapping ``slug -> tuple of name spellings``; defaults to
            ``CITIES``.

    Returns:
        ``(counts, salaries)`` - two dicts keyed by slug.  ``counts`` is the
        number of postings whose address mentions the city; ``salaries`` holds
        the salary strings of those postings that have one.
    """
    if cities is None:
        cities = CITIES
    counts = {slug: 0 for slug in cities}
    salaries = {slug: [] for slug in cities}
    for address, money in records:
        if not isinstance(address, str):
            continue
        for slug, spellings in cities.items():
            if any(name in address for name in spellings):
                counts[slug] += 1
                if isinstance(money, str):
                    salaries[slug].append(money)
    return counts, salaries


def _classify(text):
    """Parse one salary string.

    Returns:
        ``(category, low_k, high_k)`` with both bounds in k/month, or ``None``
        when the text is missing or not in a supported format.
    """
    if not isinstance(text, str):
        return None
    match = _SALARY_RE.fullmatch(text.strip())
    if match is None:
        return None

    low, high = match.group("low"), match.group("high")
    unit, bound, period = match.group("unit"), match.group("bound"), match.group("period")
    is_wan = unit != "千"

    if bound is None:
        category = _RANGE_CATEGORY.get((is_wan, period))
    elif bound == "以上" and high is None and is_wan and period == "月":
        category = "wan_yis"
    else:
        # "以下" (open-ended upper bound) and other combinations are skipped,
        # as the original intended for "千以下".
        category = None
    if category is None:
        return None

    if high is None:
        # Single figure (e.g. "2萬以上/月"): use it for both bounds.
        high = low
    factor = _UNIT_TO_K[unit]
    months = _MONTHS_PER_PERIOD[period]
    return category, float(low) * factor / months, float(high) * factor / months


def _print_average(label, values):
    """Print ``label`` with the mean of ``values``, or a placeholder if empty."""
    if values:
        print(label, sum(values) / len(values))
    else:
        print(label, "無數據")


def change_wages_data(data_money):
    """Normalise raw salary strings to a list of average salaries in k/month.

    Supported formats: ``a-b萬/月``, ``a萬以上/月``, ``a-b萬/年`` and
    ``a-b千/月`` (``万`` is accepted as well as ``萬``).  Anything else -
    including missing values and "以下" entries - is ignored.  Per-category
    averages are printed as a side effect.

    Args:
        data_money: iterable of salary strings (non-strings are skipped).

    Returns:
        A list of floats: the midpoint of every recognised salary in k/month,
        grouped in the order 萬/月, 萬以上/月, 萬/年, 千/月.
    """
    groups = {name: [] for name in _CATEGORY_ORDER}
    for text in data_money:
        parsed = _classify(text)
        if parsed is not None:
            category, low, high = parsed
            groups[category].append((low, high))

    wan_yue = groups["wan_yue"]
    _print_average("最低工資平均值(篩選的萬/月,單位k):", [low for low, _ in wan_yue])
    _print_average("最高工資平均值(篩選萬/月,單位k):", [high for _, high in wan_yue])

    wan_yis = [low for low, _ in groups["wan_yis"]]
    if wan_yis:
        # Values are reported in k like every other figure.
        print("萬以上/月的平均工資爲(單位k):", sum(wan_yis) / len(wan_yis))
    else:
        print("沒有萬以上/月的")

    wan_nian = groups["wan_nian"]
    _print_average("萬/年的最低工資平均值(轉化爲萬/月,單位k):", [low for low, _ in wan_nian])
    _print_average("萬/年的最高工資平均值(轉化爲萬/月,單位k):", [high for _, high in wan_nian])

    qian_yue = groups["qian_yue"]
    _print_average("最低工資平均值(篩選千/月,單位k):", [low for low, _ in qian_yue])
    _print_average("最高工資平均值(篩選千/月,單位k):", [high for _, high in qian_yue])

    avg = []
    for name in _CATEGORY_ORDER:
        avg.extend((low + high) / 2 for low, high in groups[name])
    return avg


def count_wage_bands(avg):
    """Count how many salaries (k/month) fall into each pie-chart band.

    Returns:
        A list of six counts matching ``_BAND_LABELS``; each band includes its
        lower edge and excludes its upper edge.
    """
    counts = [0] * (len(_BAND_EDGES) + 1)
    for value in avg:
        counts[bisect.bisect_right(_BAND_EDGES, float(value))] += 1
    return counts


def plot_city_counts(counts, output_dir=OUTPUT_DIR):
    """Draw and save a dot plot of the number of postings per city.

    Args:
        counts: mapping ``slug -> number of postings``.
        output_dir: directory for ``city-wages.png``.
    """
    import matplotlib.pyplot as plt

    os.makedirs(output_dir, exist_ok=True)
    plt.figure()
    plt.title("city_jobs")
    plt.xlabel("city")
    # The plotted quantity is the number of postings, not a wage.
    plt.ylabel("count")
    plt.plot(list(counts), list(counts.values()), 'o')
    plt.savefig(os.path.join(output_dir, "city-wages.png"))
    plt.show()
    plt.close()


def hist(avg, city, output_dir=OUTPUT_DIR):
    """Draw and save a 50-bin histogram of the salaries of one city.

    Args:
        avg: salaries in k/month; nothing is drawn when it is empty.
        city: ASCII city name used in the title and file name.
        output_dir: directory for ``<city>-wages.jpg``.
    """
    if len(avg) == 0:
        return
    import matplotlib.pyplot as plt

    os.makedirs(output_dir, exist_ok=True)
    plt.figure()
    plt.title(city + "wages--count--assortment")
    plt.xlabel("wages")
    plt.ylabel("number")
    # Let matplotlib derive the edges: this keeps the maximum inside the last
    # bin and copes with all-equal values, unlike arange(min, max, step).
    plt.hist(avg, bins=50)
    plt.savefig(os.path.join(output_dir, city + "-wages.jpg"))
    plt.show()
    plt.close()


def pie(avg, city, output_dir=OUTPUT_DIR):
    """Draw and save a pie chart of the salary-band shares of one city.

    Args:
        avg: salaries in k/month; nothing is drawn when it is empty.
        city: ASCII city name used in the title and file name.
        output_dir: directory for ``<city>-wages.png``.
    """
    if len(avg) == 0:
        return
    import matplotlib.pyplot as plt

    count = count_wage_bands(avg)
    # Pull the 10k-20k slice out of the pie to highlight it.
    expl = [0, 0, 0.1, 0, 0, 0]

    os.makedirs(output_dir, exist_ok=True)
    plt.figure(figsize=(6, 6))
    # autopct: percentage format; shadow: draw a shadow;
    # labeldistance: distance of the labels from the centre.
    plt.pie(count, explode=expl, labels=_BAND_LABELS, autopct='%1.2f%%',
            shadow=True, labeldistance=1.1)
    plt.title(city + "wages-count")
    plt.savefig(os.path.join(output_dir, city + "-wages.png"))
    plt.show()
    plt.close()


def main():
    """Load the data, plot demand per city, then report each city's salaries."""
    counts, salaries = group_by_city(load_records())
    plot_city_counts(counts)

    for index, (heading, slug) in enumerate(REPORTS):
        if index:
            print("\n")
        print(heading)
        city_avg = change_wages_data(salaries[slug])
        if not city_avg:
            print("沒有可解析的工資數據")
            continue
        hist(city_avg, slug)
        pie(city_avg, slug)


if __name__ == "__main__":
    main()
此程序是之前程序的整合,不多做解釋。
以下爲可視化圖:
北京:
直方圖:
餅圖:
上海:
直方圖:
餅圖:
深圳:
直方圖:
餅圖:
南京:
直方圖:
餅圖:
杭州:
直方圖:
餅圖:
無錫:
直方圖:
餅圖: