Chapter 2 - Housing
Main Steps
1. Look at the big picture
2. Get the data
3. Discover and visualize the data to gain insights
4. Prepare the data for Machine Learning algorithms
5. Select a model and train it
6. Fine-tune the model
7. Present the solution
8. Launch, monitor, and maintain the system
Note: automate as much as possible so you can easily get fresh data.
# Fetch the data
import os
import tarfile
from six.moves import urllib
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = "datasets/housing"
HOUSING_URL = DOWNLOAD_ROOT + HOUSING_PATH + "/housing.tgz"
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()

fetch_housing_data()
# Load the data and take a first look at it
import pandas as pd
def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

housing = load_housing_data()
housing.head()
housing.info()
housing["ocean_proximity"].value_counts()
housing.describe()
%matplotlib inline
# Jupyter notebook magic command: embed generated figures in the notebook
import matplotlib.pyplot as plt
housing.hist(bins=50, figsize=(20,15))
plt.show()
%magic
Notice: avoid data snooping bias by creating a test set before exploring the data further.
import numpy as np
import numpy.random as rnd
rnd.seed(42)  # to make this notebook's output identical at every run; using the same seed yields the same random sequence
# without a fixed seed, a different sequence is generated on every run
def split_train_test(data, test_ratio):
    shuffled_indices = rnd.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]
train_set, test_set = split_train_test(housing, 0.2)
print(len(train_set), len(test_set))
Although seed(number) yields the same pseudo-random sequence on every run, both solutions break the next time you fetch an updated dataset. A more robust approach is to use each instance's identifier to decide whether or not it should go into the test set. The housing data has no identifier column, so either use housing.reset_index() to add a row index as the id, or build a unique identifier from stable features.
import hashlib
def test_set_check(identifier, test_ratio, hash):
    # take the last byte of the hash as an integer and compare it with 256 * test_ratio;
    # instances below the threshold go into the test set
    return hash(np.int64(identifier)).digest()[-1] < 256 * test_ratio
def split_train_test_by_id(data, test_ratio, id_column, hash=hashlib.md5):
    ids = data[id_column]
    # http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.apply.html#pandas.DataFrame.apply
    in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio, hash))
    # http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.loc.html#pandas.DataFrame.loc
    return data.loc[~in_test_set], data.loc[in_test_set]
# Option 1: use the row index as the id and hash it
housing_with_id_1 = housing.reset_index() #http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.reset_index.html
train_set_1, test_set_1 = split_train_test_by_id(housing_with_id_1, 0.2, "index")
print(len(train_set_1),len(test_set_1))
# Option 2: combine longitude and latitude into an id and hash it
housing_with_id = housing.copy()
housing_with_id["id"] = housing["longitude"]*1000 + housing["latitude"]
train_set_2, test_set_2 = split_train_test_by_id(housing_with_id, 0.2, "id")
print(len(train_set_2),len(test_set_2))
# Option 3: simply use Scikit-Learn's train_test_split
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
print(len(train_set),len(test_set))
# Stratified sampling: the test set should be representative of the whole dataset, and each stratum needs a sufficient number of instances
housing["income_cat"] = np.ceil(housing["median_income"]/1.5)
housing["income_cat"].where(housing["income_cat"]<5, 5.0, inplace=True) #小於5的保留,大於的納入5。http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.where.html#pandas.DataFrame.where
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42) #http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ShuffleSplit.html
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]
housing["income_cat"].value_counts()/len(housing)
strat_train_set["income_cat"].value_counts()/len(strat_train_set)
for set_ in (strat_test_set, strat_train_set):
    set_.drop(["income_cat"], axis=1, inplace=True)  # remove income_cat to restore the data to its original state
Note: try to get insights from a field expert for these steps.
For each attribute, note:
Name
Type (categorical, int/float, bounded/unbounded, text, structured, etc.)
% of missing values
Noisiness and type of noise (stochastic, outliers, rounding errors, etc.)
Possible usefulness for the task
Type of distribution (Gaussian, uniform, logarithmic, etc.)
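A minimal sketch of how one might tabulate some of these per-attribute statistics with pandas (the helper name summarize_attributes is an illustrative assumption, not from the book):
# Hypothetical helper: summarize type, missing values, and cardinality of each attribute
def summarize_attributes(df):
    return pd.DataFrame({
        "dtype": df.dtypes.astype(str),           # attribute type
        "missing_pct": df.isnull().mean() * 100,  # % of missing values
        "n_unique": df.nunique(),                 # low counts hint at categorical attributes
    })
summarize_attributes(housing)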
Visualizing Geographical Data
# create a copy
housing = strat_train_set.copy()
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=.1)
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4, s=housing["population"]/100, label="population", c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True)
plt.legend()
Looking for Correlations: compute the standard correlation coefficient
corr_matrix = housing.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)
# another way to check for correlation between attributes is to use Pandas' scatter_matrix
from pandas.plotting import scatter_matrix
attributes = ["median_house_value", "median_income", "total_rooms", "housing_median_age"]
scatter_matrix(housing[attributes],figsize=(24,16))
#zoom in
housing.plot(kind="scatter", x="median_income", y="median_house_value", alpha=0.1)
Experimenting with Attribute Combinations: check how the correlation coefficients change compared with the original attributes
#try out various attribute combinations
housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"] = housing["population"]/housing["households"]
#correlation matrix
corr_matrix = housing.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)
Notes:
Requirement
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()
# housing.dropna(subset=["total_bedrooms"])    # option 1: drop the districts with missing values
# housing.drop("total_bedrooms", axis=1)       # option 2: drop the whole attribute
# median = housing["total_bedrooms"].median()  # option 3: fill in a value (the median)
# housing["total_bedrooms"].fillna(median)     # option 3
#option 3 use scikit-learn
from sklearn.preprocessing import Imputer
imputer = Imputer(strategy = "median")
housing_num = housing.drop("ocean_proximity", axis=1)
imputer.fit(housing_num)
print(imputer.statistics_)
print(housing_num.median().values)
X = imputer.transform(housing_num)
housing_tr = pd.DataFrame(X, columns = housing_num.columns)
Scikit-Learn's API Design
fit(): learns parameters from the training data
transform(): applies the learned transformation (transformers)
predict(): makes predictions on new data (predictors)
score(): measures the quality of the predictions
Hyperparameters are accessible as public instance variables (e.g. imputer.strategy), and learned parameters get an underscore suffix (e.g. imputer.statistics_).
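A small illustration of these conventions, using StandardScaler as an example transformer (a sketch added here for clarity; any Scikit-Learn transformer follows the same pattern):
from sklearn.preprocessing import StandardScaler
X = np.array([[1.0], [2.0], [3.0]])
scaler = StandardScaler()            # hyperparameters are set in the constructor
scaler.fit(X)                        # fit() learns parameters from the data
print(scaler.mean_, scaler.scale_)   # learned parameters end with an underscore
print(scaler.transform(X))           # transform() applies the learned transformation
print(scaler.fit_transform(X))       # fit_transform() is equivalent to fit() followed by transform()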
Handling Text and Categorical Attributes
# convert these text labels to number
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
housing_cat = housing["ocean_proximity"]
housing_cat_encoded = encoder.fit_transform(housing_cat)
housing_cat_encoded
print(encoder.classes_)
# use OneHotEncoder encoder to convert integer categorical values into one-hot vector
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
# fit_transform expects a 2D array, so reshape the 1D array first
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1, 1))
housing_cat_1hot
housing_cat_1hot.toarray()
# Use LabelBinarizer to do both steps (LabelEncoder + OneHotEncoder) in one shot
from sklearn.preprocessing import LabelBinarizer
encoder = LabelBinarizer()
housing_cat_1hot = encoder.fit_transform(housing_cat)  # pass sparse_output=True to the constructor to get a SciPy sparse matrix
housing_cat_1hot
# custom transformer
from sklearn.base import BaseEstimator, TransformerMixin
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kwargs
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)
housing.values
housing_extra_attribs = pd.DataFrame(housing_extra_attribs, columns=list(housing.columns)+["rooms_per_household", "population_per_household"])
housing_extra_attribs.head()
Feature Scaling: two common approaches are min-max scaling (MinMaxScaler) and standardization (StandardScaler).
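A minimal sketch contrasting the two scalers on one numerical column (illustrative only; the pipeline below uses StandardScaler):
from sklearn.preprocessing import MinMaxScaler, StandardScaler
income = housing_num[["median_income"]].values
income_minmax = MinMaxScaler().fit_transform(income)  # rescales values into the [0, 1] range
income_std = StandardScaler().fit_transform(income)   # zero mean, unit variance; less affected by outliers
print(income_minmax.min(), income_minmax.max())       # ~0.0 and 1.0
print(income_std.mean(), income_std.std())            # ~0.0 and 1.0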
Transformation Pipelines: run a sequence of transformations
# Use a Pipeline to call fit_transform() on a sequence of estimators
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
num_pipeline = Pipeline([
    ('imputer', Imputer(strategy='median')),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])
# three steps, each passing its output to the next: fill missing values -> combine attributes -> standardize
# housing_num contains only the numerical attributes (ocean_proximity dropped)
housing_num_tr = num_pipeline.fit_transform(housing_num)
housing_num_tr[0:5]
# Merge the transformed numerical columns with the categorical (text) column
from sklearn.pipeline import FeatureUnion
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values
# apply the LabelBinarizer on the categorical values
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', Imputer(strategy='median')),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('label_binarizer', LabelBinarizer()),
])
full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])
# The same FeatureUnion again, under the name preparation_pipeline (reused by the pipelines further below)
preparation_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])
housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared
housing_prepared.shape
Notes: If the data is huge, you may want to sample smaller training sets so you can train many different models in a reasonable time (be aware that this penalizes complex models such as large neural nets or Random Forests). Once again, try to automate these steps as much as possible.
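For example (a hedged illustration, not in the original notes; the variable names below are hypothetical), one could subsample the prepared training set before comparing many models:
# Hypothetical subsampling step: experiment on a random 20% of the prepared training data
sample_idx = np.random.RandomState(42).permutation(len(housing_prepared))[:len(housing_prepared) // 5]
housing_prepared_small = housing_prepared[sample_idx]
housing_labels_small = housing_labels.iloc[sample_idx]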
# Linear Regression
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)
#prediction
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
print("Prediation:\t", lin_reg.predict(some_data_prepared))
print("Labels:\t\t", list(some_labels))
#RMSE
from sklearn.metrics import mean_squared_error
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse
# Decision Tree
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse # overfitting
# cross validation
from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)
def display_scores(scores):
    print("Scores:\t", scores)
    print("Mean:\t", scores.mean())
    print("Standard deviation:", scores.std())

display_scores(tree_rmse_scores)
scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-scores)
display_scores(lin_rmse_scores)
The Decision Tree is overfitting, while the Linear Regression is underfitting.
# Random Forests work by training many Decision Trees on random subsets of the features
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(housing_prepared, housing_labels)
forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)
from sklearn.svm import SVR
svm_reg = SVR(kernel="linear")
svm_reg.fit(housing_prepared, housing_labels)
housing_predictions = svm_reg.predict(housing_prepared)
svm_mse = mean_squared_error(housing_labels, housing_predictions)
svm_rmse = np.sqrt(svm_mse)
svm_rmse
Notes: You will want to use as much data as possible for this step, especially as you move toward the end of fine-tuning. As always automate what you can.
Tell it which hyperparameters you want to experiment with and what values to try out, and it will evaluate all the possible combinations of hyperparameter values using cross-validation.
# Grid search
from sklearn.model_selection import GridSearchCV
param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]
forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(housing_prepared, housing_labels)
grid_search.best_params_
grid_search.best_estimator_
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)
pd.DataFrame(grid_search.cv_results_)
# Randomized search
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
param_distribs = {
    'n_estimators': randint(low=1, high=200),
    'max_features': randint(low=1, high=8),
}
forest_reg = RandomForestRegressor()
rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
n_iter=10, cv=5, scoring='neg_mean_squared_error')
rnd_search.fit(housing_prepared, housing_labels)
cvres = rnd_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)
Analyze the Best Models and Their Errors
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances
extra_attribs = ["rooms_per_household", "population_per_household", "bedrooms_per_room"]
cat_one_hot_attribs = list(encoder.classes_)
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
sorted(zip(feature_importances, attributes), reverse=True)
Evaluate Your System on the Test Set
final_model = grid_search.best_estimator_
X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()
X_test_transformed = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_transformed)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse
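Optionally (an added sketch, not part of the original notes), one can estimate a 95% confidence interval for this generalization RMSE from the per-instance squared errors:
from scipy import stats
confidence = 0.95
squared_errors = (final_predictions - y_test) ** 2
interval = np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,
                                    loc=squared_errors.mean(),
                                    scale=stats.sem(squared_errors)))
print(interval)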
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
param_grid = [{"kernel" : ["linear"], "C" : [10., 50.]},
{"kernel" : ['rbf'], "C" : [300., 600.], 'gamma' : [.001]}]
svr_reg = SVR()
svr_search = GridSearchCV(svr_reg, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=4, verbose=2)
svr_search.fit(housing_prepared, housing_labels)
svres = svr_search.cv_results_
for mean_score, params in zip(svres["mean_test_score"], svres["params"]):
    print(np.sqrt(-mean_score), params)
svr_reg.get_params()
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import expon, reciprocal
# see https://docs.scipy.org/doc/scipy-0.19.0/reference/stats.html
# for `expon()` and `reciprocal()` documentation and more probability distribution functions.
# Note: gamma is ignored when kernel is "linear"
param_distribs = {
    'kernel': ['linear', 'rbf'],
    'C': reciprocal(20, 200),  # handson-ml answers 20000
    'gamma': expon(scale=1.0),
}
svm_reg = SVR()
rnd_search = RandomizedSearchCV(svm_reg, param_distributions=param_distribs,
n_iter=10, cv=5, scoring='neg_mean_squared_error', verbose=2, n_jobs=4)
rnd_search.fit(housing_prepared, housing_labels)
negative_mse = rnd_search.best_score_
rmse = np.sqrt(-negative_mse)
rmse
rnd_search.best_params_
expon_distrib = expon(scale=1.)
samples = expon_distrib.rvs(10000)
plt.figure(figsize=(10, 4))
plt.subplot(121)
plt.title("Exponential distribution (scale=1.0)")
plt.hist(samples, bins=50)
plt.subplot(122)
plt.title("Log of this distribution")
plt.hist(np.log(samples), bins=50)
plt.show()
Feature selection: assumes you have already computed the feature importances.
from sklearn.base import BaseEstimator, TransformerMixin
def indices_of_top_k(arr, k):
    return np.sort(np.argpartition(np.array(arr), -k)[-k:])

class TopFeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, feature_importances, k):
        self.feature_importances = feature_importances
        self.k = k
    def fit(self, X, y=None):
        self.feature_indices_ = indices_of_top_k(self.feature_importances, self.k)
        return self
    def transform(self, X, y=None):
        return X[:, self.feature_indices_]
#define k
k = 5
# look at the selected features
top_k_feature_indices = indices_of_top_k(feature_importances, k)
print(top_k_feature_indices)
print(np.array(attributes)[top_k_feature_indices])
sorted(zip(feature_importances, attributes), reverse=True)[:k]
#pipeline
preparation_and_feature_selection_pipeline = Pipeline([
    ('preparation', full_pipeline),
    ('feature_selection', TopFeatureSelector(feature_importances, k))
])
#fit_transform
housing_prepared_top_k_features = preparation_and_feature_selection_pipeline.fit_transform(housing)
housing_prepared_top_k_features
Note: be sure to replace LabelBinarizer with a supervision-friendly version!
class SupervisionFriendlyLabelBinarizer(LabelBinarizer):
    def fit_transform(self, X, y=None):
        return super(SupervisionFriendlyLabelBinarizer, self).fit_transform(X)
# Replace the LabelBinarizer with a SupervisionFriendlyLabelBinarizer
cat_pipeline.steps[1] = ("label_binarizer", SupervisionFriendlyLabelBinarizer())
# Now you can create a full pipeline with a supervised predictor at the end.
full_pipeline_with_predictor = Pipeline([
    ("preparation", preparation_pipeline),
    ("linear", LinearRegression())
])
full_pipeline_with_predictor.fit(housing, housing_labels)
full_pipeline_with_predictor.predict(some_data)
prepare_select_and_predict_pipeline = Pipeline([
    ('preparation', preparation_pipeline),
    ('feature_selection', TopFeatureSelector(feature_importances, k)),
    ('svr_reg', SVR(C=122659.12862707644, gamma=0.22653313890837068, kernel='rbf')),
])
prepare_select_and_predict_pipeline.fit(housing, housing_labels)
Finally found the cause of the error: the LabelBinarizer had not been replaced with the supervision-friendly version!
param_grid = [
    {'preparation__num_pipeline__imputer__strategy': ['mean', 'median', 'most_frequent'],
     'feature_selection__k': [3, 4, 5, 6, 7]}
]
grid_search_prep = GridSearchCV(prepare_select_and_predict_pipeline, param_grid, cv=5,
scoring='neg_mean_squared_error', verbose=2, n_jobs=4)
grid_search_prep.fit(housing, housing_labels)
grid_search_prep.best_params_
housing.shape