使用 Python 進行數據處理的實例(數據爲某公司 HR 部門關於員工信息的部分摘錄,來自 Kaggle 上的一次賽題)
https://www.kaggle.com/c/kfru-dbm-hr-analytics
該實例根據其他所給屬性預測員工是否會離職,代碼實現如下所示:
# HR analytics example: clean and encode the HR data set, then fit a batch of
# classifiers (will the employee leave?) and one ridge regression.
#
# scikit-learn is imported inside the functions that use it, so importing this
# module only requires pandas.
import pandas as pd

# Default data location; same runtime value the script has always used.
DEFAULT_CSV_PATH = r'C:\Users\Administrator\Desktop\network\HR.csv'

# Ordinal encoding of the salary column.
d = {'low': 0, 'medium': 1, 'high': 2}


def map_salary(s):
    """Return the ordinal code for salary level *s* (0 for unknown values)."""
    return d.get(s, 0)


def hr_preprocessing(sl=False, le=False, npr=False, amh=False, tsc=False,
                     wa=False, pl5=False, dp=False, slr=False,
                     lower_id=False, ld_n=1, csv_path=DEFAULT_CSV_PATH):
    """Load the HR csv, clean it and return ``(features, label)``.

    Args:
        sl, le, npr, amh, tsc, wa, pl5: per-column scaling switches for
            satisfaction_level, last_evaluation, number_project,
            average_monthly_hours, time_spend_company, Work_accident and
            promotion_last_5years. ``False`` applies min-max scaling,
            ``True`` applies standardization.
        dp, slr: encoding switches for the ``department`` and ``salary``
            columns. ``False`` encodes the column as integers and min-max
            scales it, ``True`` one-hot encodes it.
        lower_id: when true, reduce the features with PCA.
        ld_n: number of PCA components used when ``lower_id`` is true.
        csv_path: location of the csv file to read.

    Returns:
        ``(features, label)`` where ``label`` is the ``left`` column.
        ``features`` is a DataFrame, or a NumPy array when ``lower_id``
        is true.
    """
    from sklearn.decomposition import PCA
    from sklearn.preprocessing import (
        LabelEncoder,
        MinMaxScaler,
        StandardScaler,
    )

    df = pd.read_csv(csv_path)

    # 1. Cleaning: drop rows with missing key values and the abnormal rows
    #    found during exploratory analysis. Both conditions go into a single
    #    mask; chaining df[...][...] would apply a full-length mask to an
    #    already-filtered frame.
    df = df.dropna(subset=['satisfaction_level', 'last_evaluation'])
    keep = (df['satisfaction_level'] <= 1) & (df['salary'] != 'nme')
    df = df[keep]

    # 2. Separate the label.
    label = df['left']
    df = df.drop('left', axis=1)

    # 3. Feature selection: skipped, the data set has few features.

    # 4. Feature processing (normalization / standardization / reduction).
    numeric_columns = (
        ('satisfaction_level', sl),
        ('last_evaluation', le),
        ('number_project', npr),
        ('average_monthly_hours', amh),
        ('time_spend_company', tsc),
        ('Work_accident', wa),
        ('promotion_last_5years', pl5),
    )
    for column, standardize in numeric_columns:
        scaler = StandardScaler() if standardize else MinMaxScaler()
        column_2d = df[column].values.reshape(-1, 1)
        df[column] = scaler.fit_transform(column_2d).ravel()

    # Discrete columns: ordinal/label encoding followed by min-max scaling,
    # or one-hot encoding when the switch is set.
    for column, one_hot in (('salary', slr), ('department', dp)):
        if one_hot:
            df = pd.get_dummies(df, columns=[column])
            continue
        if column == 'salary':
            df[column] = [map_salary(s) for s in df[column].values]
        else:
            df[column] = LabelEncoder().fit_transform(df[column])
        column_2d = df[column].values.reshape(-1, 1)
        df[column] = MinMaxScaler().fit_transform(column_2d).ravel()

    if lower_id:
        return PCA(n_components=ld_n).fit_transform(df.values), label
    return df, label


def hr_modeling_nn(features, label):
    """Fit several classifiers and print ACC / recall / F1 for each split.

    The data is split 60/20/20 into train / validation / test. For every
    model the split index is printed (0 = train, 1 = validation, 2 = test)
    followed by the three scores.

    Args:
        features: DataFrame or array of features.
        label: Series or array of binary labels.
    """
    from sklearn.ensemble import (
        AdaBoostClassifier,
        GradientBoostingClassifier,
        RandomForestClassifier,
    )
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score, f1_score, recall_score
    from sklearn.model_selection import train_test_split
    from sklearn.naive_bayes import BernoulliNB, GaussianNB
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.svm import SVC
    from sklearn.tree import DecisionTreeClassifier

    # Accept pandas objects as well as plain arrays (hr_preprocessing returns
    # an array when PCA is enabled).
    f_v = getattr(features, 'values', features)
    l_v = getattr(label, 'values', label)

    # train_test_split only produces two parts, so split twice:
    # 20% validation first, then 25% of the remaining 80% as test.
    X_tt, X_validation, Y_tt, Y_validation = train_test_split(
        f_v, l_v, test_size=0.2)
    X_train, X_test, Y_train, Y_test = train_test_split(
        X_tt, Y_tt, test_size=0.25)

    models = [
        ('KNN', KNeighborsClassifier(n_neighbors=3)),
        ('GaussianNB', GaussianNB()),
        ('BernoulliNB', BernoulliNB()),
        ('DecisionTreeGini', DecisionTreeClassifier()),
        ('DecisionTreeEntropy', DecisionTreeClassifier(criterion='entropy')),
        ('SVM:', SVC(C=1000)),
        ('OriginalRandomForest', RandomForestClassifier()),
        ('RandomForest',
         RandomForestClassifier(n_estimators=11, max_features=None)),
        ('Adaboost', AdaBoostClassifier(n_estimators=100)),
        ('LogisticRegression',
         LogisticRegression(C=1000, tol=1e-10, solver='sag', max_iter=10000)),
        ('GBDT', GradientBoostingClassifier(max_depth=6, n_estimators=100)),
    ]
    xy_list = [
        (X_train, Y_train),
        (X_validation, Y_validation),
        (X_test, Y_test),
    ]

    for clf_name, clf in models:
        clf.fit(X_train, Y_train)
        for i, (X_part, Y_part) in enumerate(xy_list):
            Y_pred = clf.predict(X_part)
            print(i)
            print(clf_name, '-ACC', accuracy_score(Y_part, Y_pred))
            print(clf_name, '-REC:', recall_score(Y_part, Y_pred))
            # sklearn metrics take (y_true, y_pred) in that order.
            print(clf_name, '-F1:', f1_score(Y_part, Y_pred))


def regr_t(features, label):
    """Fit a ridge regression on the given data and print its metrics.

    Prints the inputs, the fitted coefficients and the MSE / MAE / R2 scores
    computed on the same data the model was fitted on.

    Args:
        features: DataFrame of regressors.
        label: Series with the regression target.
    """
    from sklearn.linear_model import Ridge
    from sklearn.metrics import (
        mean_absolute_error,
        mean_squared_error,
        r2_score,
    )

    print('X', features)
    print('Y', label)

    regr = Ridge(alpha=1)
    regr.fit(features.values, label.values)
    Y_pred = regr.predict(features.values)

    print('Coef:', regr.coef_)
    print('MSE:', mean_squared_error(label.values, Y_pred))
    print('MAE:', mean_absolute_error(label.values, Y_pred))
    print('R2:', r2_score(label.values, Y_pred))


def main():
    """Run preprocessing, the classifier comparison and the regression demo."""
    features, label = hr_preprocessing()
    hr_modeling_nn(features, label)
    regr_t(features[['number_project', 'average_monthly_hours']],
           features['last_evaluation'])


if __name__ == '__main__':
    main()
來源:https://blog.csdn.net/weixin_39667003/article/details/85632885