Python學習筆記,51job進一步分析

分析北京,上海,深圳,杭州,南京,無錫的工資情況

思路:
1.在爬蟲中,爬取了地名,我們可以通過for循環+if語句來篩選各地的工資;
2.將數據清洗的程序,封裝爲一個函數,方便使用;
3.將直方圖和餅圖程序也封裝爲函數;
4.進行數據可視化分析;

以下爲代碼:

import pymysql
import pandas as pda
import numpy as npy
import re
import matplotlib.pylab as pyl
import matplotlib.mlab as mlab  
import matplotlib.pyplot as plt  
# Connect to the database and load every scraped posting into a DataFrame.
# NOTE(review): credentials are hard-coded; consider reading them from config.
conn = pymysql.connect(host="127.0.0.1",user="root",passwd="lxw19961230",db="51job")
try:
    sql = "select * from python_1"
    data = pda.read_sql(sql,conn)
finally:
    # Release the connection even if the query fails.
    conn.close()

# Filter postings for: Beijing, Shanghai, Shenzhen, Hangzhou, Nanjing, Wuxi.
# Rows missing either field are dropped together and renumbered from 0, so a
# wage and its address always sit at the same position. (Dropping NaN from
# each column separately would leave gaps in the index and misalign the two.)
_postings = data[['money','adress']].dropna().reset_index(drop=True)
data_money = _postings['money']
data_adress = _postings['adress']

# One pass over the postings; a posting belongs to a city when the city name
# occurs anywhere in its address string.
_wages_by_city = {
    "北京": [],
    "上海": [],
    "深圳": [],
    "杭州": [],
    "南京": [],
    "無錫": [],
}
for _wage, _address in zip(data_money, data_adress):
    for _city_name, _bucket in _wages_by_city.items():
        if _city_name in _address:
            _bucket.append(_wage)

# Keep the original public names: <city>_money holds the raw wage strings,
# count_N the number of postings found for that city.
beijing_money = _wages_by_city["北京"]
count_1 = len(beijing_money)
shanghai_money = _wages_by_city["上海"]
count_2 = len(shanghai_money)
shenzheng_money = _wages_by_city["深圳"]
count_3 = len(shenzheng_money)
hangzhou_money = _wages_by_city["杭州"]
count_4 = len(hangzhou_money)
nanjing_money = _wages_by_city["南京"]
count_5 = len(nanjing_money)
wuxi_money = _wages_by_city["無錫"]
count_6 = len(wuxi_money)
'''--------------------------------整理數據----------------------------------------'''
# Scatter plot comparing how many Python postings each city has.
a = ["beijing","shanghai","shenzheng","hangzhou","nanjing","wuxi"]
b = [count_1,count_2,count_3,count_4,count_5,count_6]
# The plotted values are posting counts, not wages, so label them as such.
pyl.title("city_job_count")
pyl.xlabel("city")
pyl.ylabel("count")
pyl.plot(a,b,'o')
# Output path is unchanged so existing references to the image keep working.
pyl.savefig("D:/python/爬蟲/51job分析/city-wages.png")
pyl.show()
'''------------------------------繪製散點圖初步分析--------------------------------'''


#整理各城市工資數據,整理爲 k/月,函數
def _mean(values):
    """Return the arithmetic mean of ``values``, or ``None`` when it is empty."""
    if not values:
        return None
    return sum(values) / len(values)


def _report(label, values):
    """Print ``label`` followed by the mean of ``values`` or a no-data marker."""
    average = _mean(values)
    print(label, "無數據" if average is None else average)


def _parse_range(text, unit):
    """Parse ``"LOW-HIGH<unit>/<period>"`` into ``(low, high)`` floats.

    Returns ``None`` when the text does not have that shape.
    """
    parts = re.split('[-/]', text)
    try:
        return float(parts[0]), float(parts[1].strip(unit))
    except (IndexError, ValueError):
        return None


def _wan_to_k(value):
    """Convert an amount in 萬 (10,000) to k (1,000)."""
    # Rounding removes binary floating-point noise such as 1.1 * 10.
    return round(value * 10, 6)


def change_wages_data(data_money):
    """Normalise raw wage strings to mid-point wages in k per month.

    Recognised formats (anything else is skipped):
      * ``"1-1.5萬/月"``  monthly range in 萬
      * ``"2萬以上/月"``  open-ended monthly minimum in 萬
      * ``"12-24萬/年"``  yearly range in 萬
      * ``"4-6千/月"``    monthly range in 千 (already k)

    Entries containing ``以下`` (an upper bound only), non-string values and
    malformed strings are ignored instead of raising.

    Per-category averages are printed; a category with no entries prints a
    no-data marker. The 萬以上/月 average is printed in 萬, the others in k.

    Args:
        data_money: iterable of raw wage strings.

    Returns:
        list of float: one wage per usable entry, in k/month, ordered by
        category: 萬/月 ranges, 萬以上/月, 萬/年 ranges, 千/月 ranges.
        Ranges contribute their mid-point, open-ended entries their minimum.
    """
    monthly_wan = []    # (low, high) in 萬/month
    open_ended = []     # minimum in 萬/month
    yearly_wan = []     # (low, high) in 萬/year
    monthly_qian = []   # (low, high) in k/month

    for entry in data_money:
        if not isinstance(entry, str):
            continue
        if '以上' in entry and '月' in entry:
            try:
                open_ended.append(float(entry.replace("萬以上/月","")))
            except ValueError:
                pass  # e.g. a 千以上 entry: not a 萬 amount, skip it
            continue
        if '以下' in entry:
            continue
        if "年" in entry and "萬" in entry and "以上" not in entry:
            target, unit = yearly_wan, '萬'
        elif '月' in entry and '萬' in entry:
            target, unit = monthly_wan, '萬'
        elif '月' in entry and '千' in entry:
            target, unit = monthly_qian, '千'
        else:
            continue
        bounds = _parse_range(entry, unit)
        if bounds is not None:
            target.append(bounds)

    # 萬/month ranges, converted to k/month.
    wan_yue_min = [_wan_to_k(low) for low, _ in monthly_wan]
    wan_yue_max = [_wan_to_k(high) for _, high in monthly_wan]
    _report("最低工資平均值(篩選的萬/月,單位k):", wan_yue_min)
    _report("最高工資平均值(篩選萬/月,單位k):", wan_yue_max)

    # Open-ended "N萬以上/月" entries (average reported in 萬, as before).
    if open_ended:
        print("萬以上/月的平均工資爲:", _mean(open_ended))
    else:
        print("沒有萬以上/月的")

    # 萬/year ranges: x 萬/year == x * 10 / 12 k/month.
    wan_nian_min = [low * 10 / 12 for low, _ in yearly_wan]
    wan_nian_max = [high * 10 / 12 for _, high in yearly_wan]
    _report("萬/年的最低工資平均值(轉化爲萬/月,單位k):", wan_nian_min)
    _report("萬/年的最高工資平均值(轉化爲萬/月,單位k):", wan_nian_max)

    # 千/month ranges are already in k/month.
    qian_yue_min = [low for low, _ in monthly_qian]
    qian_yue_max = [high for _, high in monthly_qian]
    _report("最低工資平均值(篩選千/月,單位k):", qian_yue_min)
    _report("最高工資平均值(篩選千/月,單位k):", qian_yue_max)

    # Merge everything into one list of k/month wages.
    avg = [(low + high) / 2 for low, high in zip(wan_yue_min, wan_yue_max)]
    avg.extend(_wan_to_k(value) for value in open_ended)
    avg.extend((low + high) / 2 for low, high in zip(wan_nian_min, wan_nian_max))
    avg.extend((low + high) / 2 for low, high in zip(qian_yue_min, qian_yue_max))
    return avg
'''----------------------整理數據的函數--------------------------------'''

#繪製直方圖的函數
def hist(avg,city):
    """Draw, save and show a 50-bin histogram of the wages in ``avg``.

    Args:
        avg: sequence of wages (numbers) to plot.
        city: city name; used in the plot title and the output file name.

    Returns:
        None. When ``avg`` is empty a message is printed and nothing is drawn.
    """
    if len(avg) == 0:
        print(city + ": no wage data, histogram skipped")
        return

    avg_max = max(avg)
    avg_min = min(avg)
    if avg_max > avg_min:
        # 51 evenly spaced edges give 50 bins covering [min, max] inclusive,
        # so the highest wages land in the last bin instead of being dropped.
        avg_sty = npy.linspace(avg_min,avg_max,51)
    else:
        # All values are identical: explicit edges would have zero width,
        # so let matplotlib choose the range for 50 bins.
        avg_sty = 50

    pyl.title(city + "wages--count--assortment")
    pyl.xlabel("wages")
    pyl.ylabel("number")
    pyl.hist(avg,avg_sty)
    pyl.savefig("D:/python/爬蟲/51job分析/"+city+"-wages.jpg")
    pyl.show()


#繪製餅圖的函數
def pie(avg,city):
    """Draw, save and show a pie chart of the share of wages per wage band.

    Bands (k/month): <5, 5-10, 10-20, 20-30, 30-40, >=40. The lower bound of
    each band is inclusive.

    Args:
        avg: sequence of wages (anything ``float()`` accepts).
        city: city name; used in the plot title and the output file name.

    Returns:
        None. When no wage falls into any band (e.g. ``avg`` is empty) a
        message is printed and nothing is drawn, because a pie of all-zero
        counts cannot be normalised.
    """
    count1 = count2 = count3 = count4 = count5 = count6 = 0
    for wage in avg:
        value = float(wage)
        if value < 5.0:
            count1 += 1
        elif value < 10.0:
            count2 += 1
        elif value < 20.0:
            count3 += 1
        elif value < 30.0:
            count4 += 1
        elif value < 40.0:
            count5 += 1
        elif value >= 40.0:
            # Explicit test so NaN is counted nowhere.
            count6 += 1

    # Wedge labels and their sizes, in matching order.
    labels = ['under-5k','5k-10k','10k-20k','20k-30k','30k-40k','over-40k']
    count = [count1,count2,count3,count4,count5,count6]
    if sum(count) == 0:
        print(city + ": no wage data, pie chart skipped")
        return

    # Pull the 10k-20k wedge out slightly to highlight it.
    expl = [0,0,0.1,0,0,0]
    plt.figure(1, figsize=(6,6))
    # autopct: percentage format; shadow: drop shadow;
    # labeldistance: label distance from the centre, in radii.
    plt.pie(count,explode=expl,labels=labels,autopct='%1.2f%%',shadow=True,labeldistance=1.1)
    plt.title(city+"wages-count")
    plt.savefig("D:/python/爬蟲/51job分析/"+city+"-wages.png")
    plt.show()
# Run the whole pipeline (clean wages, histogram, pie chart) for each city.
# Each entry: (heading printed first, raw wage strings, name used in plots).
_city_jobs = (
    ("北京:", beijing_money, "beijing"),
    ("上海", shanghai_money, "shanghai"),
    ("深圳", shenzheng_money, "shenzheng"),
    ("南京", nanjing_money, "nanjing"),
    ("杭州", hangzhou_money, "hangzhou"),
    ("無錫", wuxi_money, "wuxi"),
)
_city_results = {}
for _position, (_heading, _wages, _city) in enumerate(_city_jobs):
    print(_heading)
    _city_results[_city] = change_wages_data(_wages)
    hist(_city_results[_city], _city)
    pie(_city_results[_city], _city)
    # A blank separator follows every city except the last one.
    if _position < len(_city_jobs) - 1:
        print("\n")

# Expose the cleaned wages under their per-city names.
beijing_avg = _city_results["beijing"]
shanghai_avg = _city_results["shanghai"]
shenzheng_avg = _city_results["shenzheng"]
nanjing_avg = _city_results["nanjing"]
hangzhou_avg = _city_results["hangzhou"]
wuxi_avg = _city_results["wuxi"]

此程序是之前程序的整合,不多做解釋。
以下爲可視化圖:
北京:
直方圖:
在這裏插入圖片描述
餅圖:
在這裏插入圖片描述
上海:
直方圖:
在這裏插入圖片描述
餅圖:
在這裏插入圖片描述
深圳:
直方圖:
在這裏插入圖片描述
餅圖:
在這裏插入圖片描述
南京:
直方圖:
在這裏插入圖片描述
餅圖:
在這裏插入圖片描述
杭州:
直方圖:
在這裏插入圖片描述
餅圖:
在這裏插入圖片描述
無錫:
直方圖:
在這裏插入圖片描述
餅圖:
在這裏插入圖片描述