[TOC] 更新、更全的《機器學習》的更新網站,更有python、go、數據結構與算法、爬蟲、人工智能教學等着你:http://www.javashuo.com/article/p-vozphyqp-cm.html
import pandas as pd import matplotlib.pyplot as plt from matplotlib.font_manager import FontProperties from sklearn.model_selection import train_test_split from sklearn.linear_model import LinearRegression %matplotlib inline font = FontProperties(fname='/Library/Fonts/Heiti.ttc')
# Load the housing data: whitespace-separated columns with a header row.
# Use a raw string for the regex separator — a plain '\s+' is an invalid
# escape sequence and raises a SyntaxWarning on modern Python.
df = pd.read_csv('housing-data.txt', sep=r'\s+', header=0)

# All columns except the last are features; MEDV (median house value,
# presumably in $1000s — standard for this dataset, TODO confirm) is the target.
X = df.iloc[:, :-1].values
y = df['MEDV'].values

# Split into a training set (70%) and a test set (30%).
# random_state pins the split so the MSE values reported later in the
# article are reproducible; the original call was nondeterministic.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1)
# Fit an ordinary least-squares model on the training split.
lr = LinearRegression()
lr.fit(X_train, y_train)

# Predictions on both splits; used below for the residual plot and the MSEs.
y_train_predict = lr.predict(X_train)
y_test_predict = lr.predict(X_test)
# Residual plot: predicted value on x, (prediction - truth) on y, per split.
plt.scatter(y_train_predict, y_train_predict - y_train,
            c='r', marker='s', edgecolor='white', label='訓練數據')
plt.scatter(y_test_predict, y_test_predict - y_test,
            c='g', marker='o', edgecolor='white', label='測試數據')
plt.xlabel('預測值', fontproperties=font)
plt.ylabel('偏差值', fontproperties=font)
# Horizontal reference line at y=0, i.e. zero residual.
plt.hlines(y=0, xmin=-10, xmax=50, color='k')
plt.xlim(-10, 50)
plt.legend(prop=font)
plt.show()
![png](http://www.chenyoude.com/ml/02-07 多元線性迴歸(波士頓房價預測)_8_0.png?x-oss-process=style/watermark)
from sklearn.metrics import mean_squared_error

# Mean squared error on each split; a test MSE far above the training MSE
# would indicate overfitting.
train_mse = mean_squared_error(y_train, y_train_predict)
test_mse = mean_squared_error(y_test, y_test_predict)
print('訓練集的均方偏差:{}'.format(train_mse))
print('測試集的均方偏差:{}'.format(test_mse))
訓練集的均方偏差:23.049177061822277 測試集的均方偏差:19.901828312902534
訓練集的均方偏差是23.0,而測試集的均方偏差是19.9,測試集的偏差並沒有比訓練集大,說明模型沒有明顯的過擬合現象。