#!/usr/bin/env python
# coding: utf-8

# In[1]:

# 1. Define the problem
# 2. Load the data

# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler

get_ipython().run_line_magic('matplotlib', 'inline')

import warnings
warnings.filterwarnings('ignore')

# Show every column when a DataFrame is displayed
pd.set_option('display.max_columns', None)

# Load the data
train_data = pd.read_csv('../data/train.csv')
test_data = pd.read_csv('../data/test.csv')


# In[2]:

# 3. Understand the data
# Column names, dtypes and non-null counts
train_data.info()


# In[3]:

# Dimensions of the data
train_data.shape


# In[4]:

# First 5 rows
train_data.head(5)


# In[5]:

# Descriptive statistics (one row per column)
train_data.describe().T


# In[6]:

# 4. Visualize the data
# Inspect SalePrice. The summary is printed explicitly because a bare
# expression that is not the last statement of a cell is silently discarded.
print(train_data['SalePrice'].describe())
# NOTE(review): seaborn documents distplot as deprecated in favour of
# histplot/displot; kept here so the figure is unchanged on older installs.
sns.distplot(train_data['SalePrice'])
plt.show()


# In[7]:

# Correlation matrix, computed once and reused below. Restricting to numeric
# columns keeps corr() working on pandas versions that no longer drop
# non-numeric columns automatically.
corr = train_data.select_dtypes(include='number').corr()
f, ax = plt.subplots(figsize=(20, 9))
sns.heatmap(corr, vmax=1, vmin=-1, square=True)


# In[8]:

# Feature selection
# Keep the rows whose correlation with SalePrice exceeds 0.5 in absolute value
# (the original analysis found ten such features, listed in the next cell).
# The absolute value must be taken of the correlations, not of the threshold,
# otherwise strongly negative correlations are never selected.
corr[corr['SalePrice'].abs() > 0.5]


# In[9]:

# Selected features, with the target SalePrice as the last column
cols = ['OverallQual', 'YearBuilt', 'YearRemodAdd', 'TotalBsmtSF', '1stFlrSF',
        'GrLivArea', 'FullBath', 'TotRmsAbvGrd', 'GarageCars', 'GarageArea',
        'SalePrice']
train_data = train_data[cols]
train_data.info()


# In[10]:

# 5. Build the model
# Split into features (every column of cols except the last) and target
X = train_data[cols[:-1]].values
Y = train_data['SalePrice'].values
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.33, random_state=42)

# Fit the model
model = LinearRegression()
model.fit(X_train, Y_train)

# Predict on the held-out split and report the mean absolute error
y_pred = model.predict(X_test)
print('cost:' + str(np.sum(np.abs(y_pred - Y_test) / len(y_pred))))


# In[11]:

# The cost on the raw data is large, so the next cell standardizes the data
# (the original author observed the error became even larger afterwards and
# did not know why).
# Standardize the features and the target, then refit the linear model.
# The scaler objects are kept so predictions can be mapped back to prices.
# NOTE(review): both scalers are fitted on the full data set before the split,
# so statistics of the held-out rows leak into training; fitting them on the
# training split only would be cleaner.
x_scaler = StandardScaler()
y_scaler = StandardScaler()
X_scaled = x_scaler.fit_transform(X)
Y_scaled = y_scaler.fit_transform(Y.reshape(-1, 1))
X_scaled_train, X_scaled_test, Y_scaled_train, Y_scaled_test = train_test_split(
    X_scaled, Y_scaled, test_size=0.33, random_state=42)

model_scaled = LinearRegression()
model_scaled.fit(X_scaled_train, Y_scaled_train)

# Predict with the model that was trained on the scaled data. Using the
# unscaled `model` here feeds standardized inputs to coefficients fitted on raw
# units, and its 1-D output broadcasts against the (n, 1) target into an
# (n, n) matrix; both effects inflate the reported cost.
y_pred = model_scaled.predict(X_scaled_test)
print('cost:' + str(np.sum(np.abs(y_pred - Y_scaled_test) / len(y_pred))))

# The cost above is in standardized target units. Map both sides back to the
# original scale so the figure can be compared with the unscaled model's cost.
y_pred_price = y_scaler.inverse_transform(y_pred)
y_true_price = y_scaler.inverse_transform(Y_scaled_test)
print('cost (original units):'
      + str(np.sum(np.abs(y_pred_price - y_true_price) / len(y_pred_price))))


# In[12]:

# Keep only the feature columns (every entry of cols except the trailing
# SalePrice). The explicit copy makes test_data an independent frame, so the
# fills below are guaranteed to land in it rather than in a temporary slice.
test_data = test_data[cols[:-1]].copy()

# Fill missing values with each column's median
for col in ('TotalBsmtSF', 'GarageCars', 'GarageArea'):
    test_data[col] = test_data[col].fillna(test_data[col].median())


# In[13]:

# Predict sale prices for the test set with the model trained on raw features,
# then append them as the last column so the layout matches cols.
X = test_data[cols[:-1]].values
y_test_pre = model.predict(X)
test_data['SalePrice'] = y_test_pre
test_data.head(10)