機器學習入門之房價預測(線性迴歸)

#!/usr/bin/env python
# coding: utf-8

# In[1]:


# 1.定義問題

# 2.導入數據

# 導入類庫
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
# Enable inline plots when running under IPython/Jupyter. In a plain Python
# interpreter `get_ipython` is not defined, so skip the magic instead of
# crashing with a NameError.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

import warnings
# Suppress all warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Load the training and test data sets.
train_data = pd.read_csv('../data/train.csv')
test_data = pd.read_csv('../data/test.csv')


# In[2]:


# 3. Understand the data
# Summary information about the training frame
train_data.info()


# In[3]:


# Dimensions of the data
train_data.shape


# In[4]:


# First five rows (five is the default for head)
train_data.head()


# In[5]:


# Descriptive statistics, transposed
train_data.describe().transpose()


# In[6]:


# 4. Data visualisation

# Analyse SalePrice
# The summary is printed explicitly: a bare expression that is not the last
# statement of a notebook cell is evaluated and then silently discarded.
print(train_data['SalePrice'].describe())
# NOTE(review): distplot is deprecated in newer seaborn releases in favour of
# histplot/displot; it is kept here so the script still runs on older seaborn.
sns.distplot(train_data['SalePrice'])
plt.show()


# In[7]:


# Correlation matrix
# Correlate numeric columns only: pandas >= 2.0 raises if DataFrame.corr() is
# given non-numeric columns, while older versions dropped them silently.
# NOTE(review): assumes the CSV may contain text columns — confirm.
corr = train_data.select_dtypes(include=[np.number]).corr()
f, ax = plt.subplots(figsize=(20, 9))
# Draw on the axes created above rather than relying on the implicit current axes.
sns.heatmap(corr, vmax=1, vmin=-1, square=True, ax=ax)


# In[8]:


# Feature selection
# Keep the rows of the correlation matrix whose correlation with SalePrice has
# an absolute value above 0.5 (ten features in total, per the original note).
# The absolute value is taken of the correlation itself, so strong negative
# correlations are kept as well; the matrix is computed once, on numeric
# columns only (pandas >= 2.0 raises on non-numeric columns).
price_corr = train_data.select_dtypes(include=[np.number]).corr()
price_corr[price_corr['SalePrice'].abs() > 0.5]


# In[9]:


# Columns kept for modelling; SalePrice is listed last.
cols = [
    'OverallQual',
    'YearBuilt',
    'YearRemodAdd',
    'TotalBsmtSF',
    '1stFlrSF',
    'GrLivArea',
    'FullBath',
    'TotRmsAbvGrd',
    'GarageCars',
    'GarageArea',
    'SalePrice',
]
train_data = train_data.loc[:, cols]
train_data.info()


# In[10]:


# 5. Build the model

# Split the data: the first ten columns are the inputs, column 10 is the target
X, Y = train_data.values[:, :10], train_data.values[:, 10]
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.33, random_state=42
)
# Create and fit the model
model = LinearRegression()
model.fit(X_train, Y_train)
# Predict on the held-out split and report the mean absolute error
y_pred = model.predict(X_test)
abs_errors = np.abs(y_pred - Y_test)
print('cost:' + str(np.sum(abs_errors / len(y_pred))))


# In[11]:


# The cost on the raw data looked large, so features and target are
# standardised here and a second model is fitted on the scaled data.
# NOTE: this cost is measured on the standardised target, so it is not
# directly comparable with the cost printed for the unscaled model.
# NOTE(review): both scalers are fitted on the full data before the split, so
# the held-out rows influence the scaling — consider fitting on the training
# split only.
X_scaled = StandardScaler().fit_transform(X)
Y_scaled = StandardScaler().fit_transform(Y.reshape(-1, 1))
X_scaled_train, X_scaled_test, Y_scaled_train, Y_scaled_test = train_test_split(
    X_scaled, Y_scaled, test_size=0.33, random_state=42
)
model_scaled = LinearRegression()
model_scaled.fit(X_scaled_train, Y_scaled_train)
# Predict with the model fitted on the scaled data. The unscaled `model` must
# not be used here: it expects raw inputs, and its 1-D output would broadcast
# against the (n, 1) scaled target into an (n, n) matrix, inflating the cost.
y_pred = model_scaled.predict(X_scaled_test)
print('cost:' + str(np.sum(abs(y_pred - Y_scaled_test) / len(y_pred))))


# In[12]:


# Give the test frame the same column layout as the training frame; SalePrice
# is added as an empty placeholder. The explicit copy makes the selection an
# independent frame, so the assignments below modify it directly.
test_data['SalePrice'] = None
test_data = test_data[cols].copy()
# Fill missing values with the column median. The result is assigned back
# instead of calling fillna(inplace=True) on a selected column, which is not
# guaranteed to update the parent frame (and never does under Copy-on-Write).
for column in ('TotalBsmtSF', 'GarageCars', 'GarageArea'):
    test_data[column] = test_data[column].fillna(test_data[column].median())


# In[13]:


# Predict SalePrice for the test set with the model fitted on the unscaled
# training data. The features are selected by name (every entry of `cols`
# except the trailing SalePrice) and stored under their own name, so the
# training matrix X is not overwritten if earlier cells are re-run.
X_new = test_data[cols[:-1]].values
y_test_pre = model.predict(X_new)
test_data['SalePrice'] = y_test_pre
test_data.head(10)
相關文章
相關標籤/搜索