Using the questionnaire survey data, we select several groups of variables to predict respondents' self-reported happiness.
Link:
https://tianchi.aliyun.com/competition/entrance/231702/information
Downloads:
train_set: happiness_train_complete.csv
test_set: happiness_test_complete.csv
index: maps each variable to its questionnaire item and explains the meaning of its values
survey: the original questionnaire behind the data, provided as a supplement to help understand the problem background
Use matplotlib.pyplot to draw a scatter plot of id against each of the other columns in turn, as sketched below.
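A minimal sketch of that plotting loop, assuming the CSV is loaded the same way as in the preprocessing block below; the loop is restricted to numeric columns, since free-text fields such as edu_other would otherwise break the scatter call:

import pandas as pd
import matplotlib.pyplot as plt

train_data = pd.read_csv('happiness_train_complete.csv', encoding='gbk')

# One scatter plot per numeric column, with id on the x-axis
for col in train_data.select_dtypes(include='number').columns.drop('id'):
    plt.scatter(train_data['id'], train_data[col], s=2)
    plt.xlabel('id')
    plt.ylabel(col)
    plt.show()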
Based on what the plots show, the data is then cleaned and prepared:
# Run in a Jupyter notebook
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the training and test sets
# encoding='gbk'; utf-8 fails on these files
train_data = pd.read_csv('happiness_train_complete.csv', encoding='gbk')
test_data = pd.read_csv('happiness_test_complete.csv', encoding='gbk')

# Training set: 8000 samples, 140 features each
# Test set: 2968 samples, 139 features each
train_data.shape
test_data.shape

# Drop rows whose label is the invalid value -8
train_data = train_data[train_data.happiness > 0]
train_data.shape

# Training labels
y = train_data.happiness

ind1 = ['id', 'happiness', 'survey_time', 'edu_other', 'join_party', 'property_other', 'invest_other']
# Drop the specified columns from the training set
X = train_data.drop(ind1, axis=1)
# Drop the specified columns from the test set
ind2 = ['id', 'survey_time', 'edu_other', 'join_party', 'property_other', 'invest_other']
X_test_data = test_data.drop(ind2, axis=1)

# Convert the DataFrames to np.array
y = np.array(y, dtype=int)
X = np.array(X, dtype=float)
X_test_data = np.array(X_test_data, dtype=float)

# Map all negative values to -8
X[X < 0] = -8
X_test_data[X_test_data < 0] = -8

from sklearn.impute import SimpleImputer
# Fill missing values with -8 (strategy='constant' is required for fill_value to take effect)
X = SimpleImputer(strategy='constant', fill_value=-8).fit_transform(X)
X_test_data = SimpleImputer(strategy='constant', fill_value=-8).fit_transform(X_test_data)

from sklearn.model_selection import train_test_split
# The competition test set is unlabeled, so hold out part of the training set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)

# Standardize features to zero mean and unit variance
from sklearn.preprocessing import StandardScaler
std = StandardScaler().fit(X_train)
X_train_std = std.transform(X_train)
X_test_std = std.transform(X_test)
# A second scaler fitted on the full training set, for the final model and the real test set
std_1 = StandardScaler().fit(X)
X_std = std_1.transform(X)
X_test_data = std_1.transform(X_test_data)
This is a classification problem, so KNN was chosen as the initial modeling algorithm.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

param_grid = [
    {
        'weights': ['uniform'],
        'n_neighbors': [i for i in range(1, 11)]
    },
    {
        'weights': ['distance'],
        'n_neighbors': [i for i in range(1, 11)],
        'p': [i for i in range(1, 6)]
    }
]

# Grid search over the hyperparameters
knn_clf_grid = KNeighborsClassifier()
grid_search = GridSearchCV(knn_clf_grid, param_grid, n_jobs=-1, verbose=2).fit(X_train_std, y_train)

# Best hyperparameters: {'n_neighbors': 10, 'p': 1, 'weights': 'distance'}
grid_search.best_estimator_
grid_search.best_params_
grid_search.best_score_

# Refit on the full standardized training data and predict the real test set
knn = KNeighborsClassifier(n_neighbors=10, p=1, weights='distance').fit(X_std, y)
y_pre = knn.predict(X_test_data)

# Write the predictions to the submission file
df = pd.DataFrame({'id': test_data.id, 'happiness': y_pre})
df.to_csv('forecast_3.csv', index=None)
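For reference, the held-out split from the preprocessing block can give a rough local estimate before a submission is made; judging from the squared-error sum computed in the logistic-regression block further below, the leaderboard metric behaves like mean squared error (lower is better). A minimal sketch, reusing X_train_std, X_test_std, y_train and y_test from above:

import numpy as np
from sklearn.neighbors import KNeighborsClassifier

# Fit the tuned model on the local training split only
knn_local = KNeighborsClassifier(n_neighbors=10, p=1, weights='distance').fit(X_train_std, y_train)
y_val_pre = knn_local.predict(X_test_std)

# Local mean squared error as a rough proxy for the leaderboard score
print('local MSE:', np.mean((y_val_pre - y_test) ** 2))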
The predictions were submitted to Tianchi for evaluation; the resulting score was 0.6814.
In total, four submissions were made:
First: score = 1.3260
Second: standardized features, score = 0.9629
Third: standardized features + grid-searched hyperparameters, score = 0.6814
Fourth: standardized features + PCA + multinomial logistic regression, score = 0.6099
import numpy as np
import pandas as pd

# Load train_set and test_set; encoding='gbk', utf-8 fails on these files
train_set = pd.read_csv('happiness_train_complete.csv', encoding='gbk')
test_set = pd.read_csv('happiness_test_complete.csv', encoding='gbk')

# Drop rows whose label is the invalid value -8
train_set = train_set[train_set.happiness > 0]
y_label = train_set.happiness

ind1 = ['id', 'happiness', 'survey_time', 'edu_other', 'join_party', 'property_other', 'invest_other']
X_train_set = train_set.drop(ind1, axis=1)
ind2 = ['id', 'survey_time', 'edu_other', 'join_party', 'property_other', 'invest_other']
X_test_set = test_set.drop(ind2, axis=1)

y_label = np.array(y_label, dtype=int)
X_train_set = np.array(X_train_set, dtype=float)
X_test_set = np.array(X_test_set, dtype=float)

from sklearn.impute import SimpleImputer
# Fill missing values with -1 (strategy='constant' is required for fill_value to take effect)
X_train_set = SimpleImputer(strategy='constant', fill_value=-1).fit_transform(X_train_set)
X_test_set = SimpleImputer(strategy='constant', fill_value=-1).fit_transform(X_test_set)

# Map all negative values to -1
X_train_set[X_train_set < 0] = -1
X_test_set[X_test_set < 0] = -1

from sklearn.preprocessing import StandardScaler
# Standardize features to zero mean and unit variance
std = StandardScaler().fit(X_train_set)
X_train_std = std.transform(X_train_set)
X_test_std = std.transform(X_test_set)

# PCA: keep enough components to retain 95% of the variance
from sklearn.decomposition import PCA
pca = PCA(0.95)
pca.fit(X_train_std)
X_train_pca = pca.transform(X_train_std)
X_test_pca = pca.transform(X_test_std)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_train_pca, y_label, random_state=666)

from sklearn.linear_model import LogisticRegression
# Search for the regularization strength C that minimizes the hold-out MSE
best_c = 0.
best_score = 0.
best_sum = 10.
for c in np.arange(0.001, 0.3, 0.001):
    log_reg2 = LogisticRegression(C=c, multi_class='multinomial', solver='newton-cg').fit(X_train, y_train)
    y_pre = log_reg2.predict(X_test)
    s = sum((y_pre - y_test) ** 2 / len(y_test))  # mean squared error on the hold-out split
    score = log_reg2.score(X_test, y_test)
    if best_sum > s:
        best_sum = s
        best_c = c
        best_score = score
print('c:', best_c)
print('score:', best_score)
print('sum:', best_sum)

# Final model with C=0.01, then predict the real test set
log_reg = LogisticRegression(C=0.01, multi_class='multinomial', solver='newton-cg').fit(X_train, y_train)
y_pre2 = log_reg.predict(X_test_pca)
df = pd.DataFrame({'id': test_set.id, 'happiness': y_pre2})
df.to_csv('log_reg_pca.csv', index=None)
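As a follow-up, it can be worth checking how much PCA(0.95) actually compressed the feature space; a short sketch reading attributes of the fitted pca object above:

import numpy as np
import matplotlib.pyplot as plt

# Number of components kept and the variance they jointly explain
print('components kept:', pca.n_components_)
print('variance explained:', pca.explained_variance_ratio_.sum())

# Cumulative explained-variance curve
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.show()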