# -*- coding:utf-8 -*-
#基於波士頓房屋租賃數據進行房屋租賃價格預測模型構建,使用lasso迴歸算法作特徵選擇後,分別使用線性迴歸,
#Lasso迴歸, Ridge迴歸, ElasticNet四類迴歸算法構建模型(分別測試1,2,3階)
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import warnings
import sklearn
from sklearn.linear_model import LinearRegression,LassoCV,RidgeCV,ElasticNetCV
from sklearn.preprocessing import PolynomialFeatures #多項式特徵
from sklearn.pipeline import Pipeline
from sklearn.linear_model.coordinate_descent import ConvergenceWarning #攔截異常的
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.grid_search import GridSearchCV #從sklearn.grid_search中導入網格搜索模塊GridSearchCV。
from sklearn import metrics #評價指標
def notEmpty(s):
    """Return True when ``s`` is not the empty string, False otherwise.

    Usable as a ``filter`` predicate to drop the empty tokens that
    ``str.split(' ')`` yields for runs of consecutive spaces.
    """
    return s != ''
# Configure matplotlib so Chinese text is not garbled: use the SimHei font
# and disable the Unicode minus sign.
mpl.rcParams.update({
    'font.sans-serif': [u'simHei'],
    'axes.unicode_minus': False,
})
# Suppress scikit-learn ConvergenceWarning messages.
warnings.filterwarnings('ignore', category=ConvergenceWarning)
# Load the data.
# Names of the 13 feature columns. The file's 14th column is the prediction
# target (presumably the house price -- confirm against the data file).
names = ['CRIM','ZN', 'INDUS','CHAS','NOX','RM','AGE','DIS','RAD','TAX','PTRATIO','B','LSTAT']
path = "datas/boston_housing.data"
# The data file is not consistently formatted, so every line is first read as
# one single string field and then tokenised by hand below.
fd = pd.read_csv(path, header=None)
# One row per sample: the feature columns plus one target column.
data = np.empty((len(fd), len(names) + 1))
for i, d in enumerate(fd.values):
    # split() without an argument splits on any run of whitespace and never
    # yields empty tokens, so spaces and tabs are both handled.
    data[i] = [float(token) for token in d[0].split()]
# Separate the feature columns from the trailing target column.
x, y = np.split(data, (len(names),), axis=1)
# np.split leaves y two-dimensional (n, 1); flatten it to one dimension.
y = y.ravel()
ly = len(y)  # number of samples
print('樣本數據量:%d,特徵個數:%d '%x.shape)
print('target樣本數據量:%d'%y.shape[0])
# One pipeline per regression algorithm. Every pipeline standardises the
# features ('ss'), expands them into polynomial terms ('poly') and fits a
# single regressor as its final 'linear' step. The cross-validated estimators
# pick alpha from 20 log-spaced candidates between 10**-3 and 10**1.
models = [
    Pipeline([
        ('ss', StandardScaler()),
        ('poly', PolynomialFeatures()),
        ('linear', regressor),
    ])
    for regressor in (
        RidgeCV(alphas=np.logspace(-3, 1, 20)),
        LassoCV(alphas=np.logspace(-3, 1, 20)),
        LinearRegression(),
        ElasticNetCV(alphas=np.logspace(-3, 1, 20)),
    )
]
# Parameter grid: each key is "<pipeline step>__<attribute>" and each value is
# the list of candidate settings for that attribute.
parameters = {
    # Polynomial degrees to try.
    "poly__degree": [3, 2, 1],
    # True keeps only interaction terms (e.g. 1, x, y, x*y); the default
    # False also generates pure powers such as x*x and y*y.
    "poly__interaction_only": [True, False],
    # Whether to emit the zero-degree feature, which acts as the intercept
    # in a linear model (defaults to True).
    "poly__include_bias": [True, False],
    "linear__fit_intercept": [True, False],
}
# rf = PolynomialFeatures(2,interaction_only=True)
# a = pd.DataFrame({
# 'name':[1,2,3,4,5],
# 'score':[2,3,4,4,5]
# })
# b=rf.fit_transform(a)
# print(b)
# Hold out 20% of the samples as the test set; the fixed seed keeps the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=0,
    test_size=0.2,
)
# Display names for the model comparison chart, in the same order as the
# entries of `models`.
titles = ['Ridge', 'Lasso', 'LinearRegression', 'ElasticNet']
colors = ['g-','b-',