1 A simple sklearn example
from sklearn import svm

X = [[2, 0], [1, 1], [2, 3]]
y = [0, 0, 1]
clf = svm.SVC(kernel='linear')
clf.fit(X, y)

print(clf)
# get support vectors
print(clf.support_vectors_)
# get indices of support vectors
print(clf.support_)
# get number of support vectors for each class
print(clf.n_support_)
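Once fitted, the classifier can also be queried on new points; a minimal follow-up sketch (the test point [2, 2] is an illustrative choice, not part of the original example):

# predict the class of a new, unseen point (predict expects a 2-D array-like)
print(clf.predict([[2, 2]]))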
2 Plotting the decision boundary with sklearn
print(__doc__)
import numpy as np
import pylab as pl
from sklearn import svm
# we create 40 separable points
np.random.seed(0)
X = np.r_[np.random.randn(20, 2) - [2, 2], np.random.randn(20, 2) + [2, 2]]
Y = [0] * 20 + [1] * 20
# fit the model
clf = svm.SVC(kernel='linear')
clf.fit(X, Y)
# get the separating hyperplane
w = clf.coef_[0]
a = -w[0] / w[1]
xx = np.linspace(-5, 5)
yy = a * xx - (clf.intercept_[0]) / w[1]
# plot the parallels to the separating hyperplane that pass through the
# support vectors
b = clf.support_vectors_[0]
yy_down = a * xx + (b[1] - a * b[0])
b = clf.support_vectors_[-1]
yy_up = a * xx + (b[1] - a * b[0])
print "w: ", w
print "a: ", a
# print " xx: ", xx
# print " yy: ", yy
print "support_vectors_: ", clf.support_vectors_
print "clf.coef_: ", clf.coef_
# In scikit-learn coef_ attribute holds the vectors of the separating hyperplanes for linear models. It has shape (n_classes, n_features) if n_classes > 1 (multi-class one-vs-all) and (1, n_features) for binary classification.
#
# In this toy binary classification example, n_features == 2, hence w = coef_[0] is the vector orthogonal to the hyperplane (the hyperplane is fully defined by it + the intercept).
#
# To plot this hyperplane in the 2D case (any hyperplane of a 2D plane is a 1D line), we want to find a f as in y = f(x) = a.x + b. In this case a is the slope of the line and can be computed by a = -w[0] / w[1].
# plot the line, the points, and the nearest vectors to the plane
pl.plot(xx, yy, 'k-')
pl.plot(xx, yy_down, 'k--')
pl.plot(xx, yy_up, 'k--')
pl.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1],
s=80, facecolors='none')
pl.scatter(X[:, 0], X[:, 1], c=Y, cmap=pl.cm.Paired)
pl.axis('tight')
pl.show()
5.2 Support Vector Machine (SVM) Algorithm (Part 2)
1. Properties of the SVM algorithm:
1.1 The complexity of a trained model is determined by the number of support vectors, not by the dimensionality of the data, so SVMs are not prone to overfitting.
1.2 The trained model depends entirely on the support vectors: even if all non-support-vector points were removed from the training set and training were repeated, exactly the same model would be obtained.
1.3 If an SVM ends up with a small number of support vectors, the trained model tends to generalize well.
2. The linearly inseparable case
2.1 The vectors corresponding to the data set cannot be separated by a hyperplane in the original space.
2.2 Two steps are used to deal with this:
2.2.1 use a nonlinear mapping to transform the vectors of the original data set into a higher-dimensional space;
2.2.2 find a linear separating hyperplane in that higher-dimensional space and proceed as in the linearly separable case.
2.3 How is the original data mapped into a higher dimension with a nonlinear mapping?
2.3.1 Example (the standard textbook construction):
a 3-dimensional input vector X = (x1, x2, x3) is mapped into a 6-dimensional space Z via
z1 = x1, z2 = x2, z3 = x3, z4 = (x1)^2, z5 = x1*x2, z6 = x1*x3.
The new decision hyperplane is d(Z) = WZ + b, where W and Z are vectors; this hyperplane is linear in Z.
After solving for W and b and substituting back into the original variables:
d(Z) = w1*x1 + w2*x2 + w3*x3 + w4*(x1)^2 + w5*x1*x2 + w6*x1*x3 + b
2.3.2 Two questions to think about:
2.3.2.1 how do we choose a reasonable nonlinear transformation into the higher-dimensional space?
2.3.2.2 how do we deal with the very high computational cost of the inner products involved?
2.3.3 Use the kernel trick.
3. The kernel trick
3.1 Motivation
When the linear SVM is turned into an optimization problem, the formulas to be solved involve the training vectors only through inner products (dot products) of the form Φ(Xi) · Φ(Xj), where Φ is the nonlinear mapping that sends the training vectors into the higher-dimensional space. Computing the inner products of the mapped vectors is very expensive, so a kernel function is used in place of the inner product of the nonlinear mapping:
K(Xi, Xj) = Φ(Xi) · Φ(Xj)
3.2 Commonly used kernel functions
Polynomial kernel of degree h: K(Xi, Xj) = (Xi · Xj + 1)^h
Gaussian radial basis function kernel: K(Xi, Xj) = exp(-||Xi - Xj||^2 / (2σ^2))
Sigmoid function kernel: K(Xi, Xj) = tanh(κ Xi · Xj - δ)
How do we choose which kernel to use?
Use prior knowledge: for image classification, for example, RBF is the usual choice, whereas it is usually not used for text.
Try different kernels and decide based on the resulting accuracy.
3.3 A kernel example:
define two vectors: x = (x1, x2, x3); y = (y1, y2, y3)
define the mapping: f(x) = (x1x1, x1x2, x1x3, x2x1, x2x2, x2x3, x3x1, x3x2, x3x3)
and the kernel K(x, y) = (<x, y>)^2
Let x = (1, 2, 3) and y = (4, 5, 6). Then
f(x) = (1, 2, 3, 2, 4, 6, 3, 6, 9)
f(y) = (16, 20, 24, 20, 25, 30, 24, 30, 36)
<f(x), f(y)> = 16 + 40 + 72 + 40 + 100 + 180 + 72 + 180 + 324 = 1024
K(x, y) = (4 + 10 + 18)^2 = 32^2 = 1024
The result is the same, but computing it through the kernel is much cheaper; a numeric check follows.
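The equality above is easy to verify numerically. A small sketch with numpy (the explicit mapping f and the squared dot-product kernel follow the definitions above):

import numpy as np

x = np.array([1, 2, 3])
y = np.array([4, 5, 6])

# explicit mapping f(x) = (x1x1, x1x2, ..., x3x3): a 9-dimensional feature vector
fx = np.outer(x, x).ravel()
fy = np.outer(y, y).ravel()

print(np.dot(fx, fy))     # inner product in the 9-dimensional space -> 1024
print(np.dot(x, y) ** 2)  # kernel K(x, y) = (<x, y>)^2 computed in 3 dimensions -> 1024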
4. Extending SVMs to multi-class classification
For each class, train a binary classifier of that class against all the other classes (one-vs-rest).
5.3 Support Vector Machine (SVM) Algorithm (Part 2): Application
Face recognition with SVM:
from __future__ import print_function
from time import time
import logging
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_lfw_people
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
from sklearn.svm import SVC
print(__doc__)
# Display progress logs on stdout
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
###############################################################################
# Download the data, if not already on disk and load it as numpy arrays
lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)
# introspect the images arrays to find the shapes (for plotting)
n_samples, h, w = lfw_people.images.shape
# for machine learning we use the 2 data directly (as relative pixel
# positions info is ignored by this model)
X = lfw_people.data
n_features = X.shape[1]
# the label to predict is the id of the person
y = lfw_people.target
target_names = lfw_people.target_names
n_classes = target_names.shape[0]
print("Total dataset size:")
print("n_samples: %d" % n_samples)
print("n_features: %d" % n_features)
print("n_classes: %d" % n_classes)
###############################################################################
# Split into a training set and a test set using a stratified k fold
# split into a training and testing set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25)
###############################################################################
# Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled
# dataset): unsupervised feature extraction / dimensionality reduction
n_components = 150
print("Extracting the top %d eigenfaces from %d faces"
% (n_components, X_train.shape[0]))
t0 = time()
pca = PCA(n_components=n_components, whiten=True, svd_solver='randomized').fit(X_train)
print("done in %0.3fs" % (time() - t0))
eigenfaces = pca.components_.reshape((n_components, h, w))
print("Projecting the input data on the eigenfaces orthonormal basis")
t0 = time()
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print("done in %0.3fs" % (time() - t0))
###############################################################################
# Train a SVM classification model
print("Fitting the classifier to the training set")
t0 = time()
param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
              'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
clf = clf.fit(X_train_pca, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator_)
###############################################################################
# Quantitative evaluation of the model quality on the test set
print("Predicting people's names on the test set")
t0 = time()
y_pred = clf.predict(X_test_pca)
print("done in %0.3fs" % (time() - t0))
print(classification_report(y_test, y_pred, target_names=target_names))
print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))
###############################################################################
# Qualitative evaluation of the predictions using matplotlib
def plot_gallery(images, titles, h, w, n_row=3, n_col=4):
    """Helper function to plot a gallery of portraits"""
    plt.figure(figsize=(1.8 * n_col, 2.4 * n_row))
    plt.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35)
    for i in range(n_row * n_col):
        plt.subplot(n_row, n_col, i + 1)
        plt.imshow(images[i].reshape((h, w)), cmap=plt.cm.gray)
        plt.title(titles[i], size=12)
        plt.xticks(())
        plt.yticks(())
# plot the result of the prediction on a portion of the test set
def title(y_pred, y_test, target_names, i):
    pred_name = target_names[y_pred[i]].rsplit(' ', 1)[-1]
    true_name = target_names[y_test[i]].rsplit(' ', 1)[-1]
    return 'predicted: %s\ntrue: %s' % (pred_name, true_name)

prediction_titles = [title(y_pred, y_test, target_names, i)
                     for i in range(y_pred.shape[0])]
plot_gallery(X_test, prediction_titles, h, w)
# plot the gallery of the most significative eigenfaces
eigenface_titles = ["eigenface %d" % i for i in range(eigenfaces.shape[0])]
plot_gallery(eigenfaces, eigenface_titles, h, w)
plt.show()
6.1 Neural Network Algorithms (Neural Networks) (Part 1)
1. Background:
1.1 Inspired by the neural networks of the human brain; many different versions have appeared over the years.
1.2 The best-known algorithm is backpropagation, from the 1980s.
2. Multilayer feed-forward neural networks
2.1 Backpropagation is used on multilayer feed-forward neural networks.
2.2 A multilayer feed-forward neural network consists of the following parts:
an input layer, hidden layers, and an output layer.
2.3 Each layer is made up of units.
2.4 The input layer is fed the feature vectors of the training instances.
2.5 Values are passed to the next layer through the weights on the connections; the output of one layer is the input of the next.
2.6 The number of hidden layers can be arbitrary; there is one input layer and one output layer.
2.7 Each unit can also be called a neural node, after its biological origin.
2.8 The network above is called a 2-layer neural network (the input layer is not counted).
2.9 Within a layer, a weighted sum is taken and then transformed by a nonlinear function into the layer's output.
2.10 In theory, a multilayer feed-forward network with enough hidden layers and a large enough training set can approximate any function.
3. Designing the network structure
3.1 Before a neural network is trained, the number of layers and the number of units per layer must be decided.
3.2 The feature vectors are usually normalized to values between 0 and 1 before being fed into the input layer (to speed up learning).
3.3 A discrete variable can be encoded with one input unit per possible value of the feature (see the encoding sketch at the end of this section).
For example, if feature A can take three values (a0, a1, a2), three input units can be used to represent A.
If A = a0, the unit representing a0 is set to 1 and the others to 0;
if A = a1, the unit representing a1 is set to 1 and the others to 0; and so on.
3.4 Neural networks can be used for classification problems as well as regression problems.
3.4.1 For classification with 2 classes, one output unit is enough (0 and 1 represent the two classes);
with more than 2 classes, one output unit is used per class,
so the number of output units usually equals the number of classes.
3.4.2 There is no definite rule for how many hidden layers are best;
3.4.2.1 this is determined experimentally from the test error and accuracy.
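A minimal sketch of the encoding described in 3.3, done by hand with numpy (the helper encode and the value names a0/a1/a2 are illustrative, not from the original notes):

import numpy as np

# the three possible values of feature A, each mapped to one input unit
values = ['a0', 'a1', 'a2']

def encode(v):
    # the unit for the observed value is set to 1, all others to 0
    return np.array([1 if v == u else 0 for u in values])

print(encode('a0'))  # [1 0 0]
print(encode('a1'))  # [0 1 0]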
4. Cross-validation
K-fold cross-validation (a usage sketch follows below)
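A minimal K-fold cross-validation sketch with scikit-learn (the iris data and the logistic-regression classifier are placeholders, not part of the original notes):

import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
clf = LogisticRegression(max_iter=1000)
# 5-fold cross-validation: train on 4 folds, evaluate on the held-out fold, repeat 5 times
scores = cross_val_score(clf, X, y, cv=5)
print(scores, scores.mean())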
5. The Backpropagation algorithm
5.1 Processes the instances of the training set iteratively.
5.2 Compares the predicted value produced at the output layer with the true (target) value.
5.3 Works backwards (output layer => hidden layers => input layer) and updates the weight of each connection so as to minimize the error.
5.4 The algorithm in detail
Input: D, the data set; l, the learning rate; a multilayer feed-forward network
Output: a trained neural network
5.4.1 Initialize the weights and biases: random values between -1 and 1 (or between -0.5 and 0.5); every unit has its own bias.
5.4.2 For each training instance X:
5.4.2.1 propagate the inputs forward: I_j = Σ_i w_ij O_i + θ_j, O_j = 1 / (1 + e^(-I_j))
5.4.2.2 backpropagate the error:
for the output layer: Err_j = O_j (1 - O_j)(T_j - O_j)
for a hidden layer: Err_j = O_j (1 - O_j) Σ_k Err_k w_jk
weight update: Δw_ij = (l) Err_j O_i, w_ij = w_ij + Δw_ij
bias update: Δθ_j = (l) Err_j, θ_j = θ_j + Δθ_j
5.4.3 Termination conditions
5.4.3.1 the weight updates fall below some threshold, or
5.4.3.2 the prediction error rate falls below some threshold, or
5.4.3.3 a preset number of iterations has been reached
6. A worked Backpropagation example
The output-layer error, hidden-layer error, weight updates, and bias updates are computed exactly as in the formulas of 5.4.2.2 above; a small numeric sketch follows.
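Since the figure with the worked numbers did not survive, here is a small numeric sketch of one Backpropagation step that follows the formulas in 5.4.2.2 (the tiny 2-1-1 network, its initial weights, and the input are made up for illustration):

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

l = 0.9                      # learning rate
x = np.array([1.0, 0.0])     # input instance
t = 1.0                      # target value

w_h = np.array([0.2, -0.3])  # weights input -> hidden unit
b_h = -0.4                   # hidden bias
w_o = 0.5                    # weight hidden -> output unit
b_o = 0.1                    # output bias

# forward pass: I_j = sum_i w_ij O_i + theta_j, O_j = sigmoid(I_j)
o_h = sigmoid(np.dot(w_h, x) + b_h)
o_o = sigmoid(w_o * o_h + b_o)

# backward pass
err_o = o_o * (1 - o_o) * (t - o_o)    # output-layer error
err_h = o_h * (1 - o_h) * err_o * w_o  # hidden-layer error

# weight and bias updates
w_o += l * err_o * o_h
b_o += l * err_o
w_h += l * err_h * x
b_h += l * err_h
print(w_h, b_h, w_o, b_o)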
6.2 Neural Network Algorithms (Neural Networks): Application (Part 1)
1. The nonlinear transformation function
A sigmoid-shaped function (S curve) is used as the activation function, either
1.1 the hyperbolic tangent (tanh), or
1.2 the logistic function
2. Implementing a simple neural network
import numpy as np

def tanh(x):
    return np.tanh(x)

def tanh_deriv(x):
    return 1.0 - np.tanh(x)*np.tanh(x)

def logistic(x):
    return 1/(1 + np.exp(-x))

def logistic_derivative(x):
    return logistic(x)*(1-logistic(x))

class NeuralNetwork:
    def __init__(self, layers, activation='tanh'):
        """
        :param layers: A list containing the number of units in each layer.
        Should be at least two values
        :param activation: The activation function to be used. Can be
        "logistic" or "tanh"
        """
        if activation == 'logistic':
            self.activation = logistic
            self.activation_deriv = logistic_derivative
        elif activation == 'tanh':
            self.activation = tanh
            self.activation_deriv = tanh_deriv

        self.weights = []
        for i in range(1, len(layers) - 1):
            self.weights.append((2*np.random.random((layers[i - 1] + 1, layers[i] + 1))-1)*0.25)
            self.weights.append((2*np.random.random((layers[i] + 1, layers[i + 1]))-1)*0.25)

    def fit(self, X, y, learning_rate=0.2, epochs=10000):
        X = np.atleast_2d(X)
        temp = np.ones([X.shape[0], X.shape[1]+1])
        temp[:, 0:-1] = X  # adding the bias unit to the input layer
        X = temp
        y = np.array(y)

        for k in range(epochs):
            i = np.random.randint(X.shape[0])
            a = [X[i]]

            for l in range(len(self.weights)):  # going forward through the network, layer by layer
                a.append(self.activation(np.dot(a[l], self.weights[l])))  # compute the node values (O_i) for each layer using the activation function
            error = y[i] - a[-1]  # compute the error at the top layer
            deltas = [error * self.activation_deriv(a[-1])]  # output-layer Err calculation (delta is the updated error)

            # starting backpropagation
            for l in range(len(a) - 2, 0, -1):  # we need to begin at the second to last layer
                # compute the updated error (i.e. deltas) for each node going from the top layer to the input layer
                deltas.append(deltas[-1].dot(self.weights[l].T)*self.activation_deriv(a[l]))
            deltas.reverse()
            for i in range(len(self.weights)):
                layer = np.atleast_2d(a[i])
                delta = np.atleast_2d(deltas[i])
                self.weights[i] += learning_rate * layer.T.dot(delta)

    def predict(self, x):
        x = np.array(x)
        temp = np.ones(x.shape[0]+1)
        temp[0:-1] = x
        a = temp
        for l in range(0, len(self.weights)):
            a = self.activation(np.dot(a, self.weights[l]))
        return a
6.3 Neural Network Algorithms (Neural Networks): Application (Part 2)
1. Test on a simple nonlinear data set (XOR):
X1 X2 | Y
0  0  | 0
0  1  | 1
1  0  | 1
1  1  | 0
Code:
from NeuralNetwork import NeuralNetwork
import numpy as np

nn = NeuralNetwork([2, 2, 1], 'tanh')
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.array([0, 1, 1, 0])
nn.fit(X, y)
for i in [[0, 0], [0, 1], [1, 0], [1, 1]]:
    print(i, nn.predict(i))
2. Handwritten digit recognition:
each image is 8x8 pixels
digits to recognize: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
Code:
import numpy as np
from sklearn.datasets import load_digits
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelBinarizer
from NeuralNetwork import NeuralNetwork
from sklearn.model_selection import train_test_split

digits = load_digits()
X = digits.data
y = digits.target
X -= X.min()  # normalize the values to bring them into the range 0-1
X /= X.max()

nn = NeuralNetwork([64, 100, 10], 'logistic')
X_train, X_test, y_train, y_test = train_test_split(X, y)
labels_train = LabelBinarizer().fit_transform(y_train)
labels_test = LabelBinarizer().fit_transform(y_test)
print("start fitting")
nn.fit(X_train, labels_train, epochs=3000)
predictions = []
for i in range(X_test.shape[0]):
    o = nn.predict(X_test[i])
    predictions.append(np.argmax(o))
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
7.1 Simple Linear Regression (Part 1)
0. Preliminaries:
Why do we need statistics?
Statistics describe the characteristics of data.
0.1 Measures of central tendency
0.1.1 Mean (average): for {6, 2, 9, 1, 2},
(6 + 2 + 9 + 1 + 2) / 5 = 20 / 5 = 4
0.1.2 Median: sort the values by size and take the one in the middle position
0.1.2.1 sort the data: 1, 2, 2, 6, 9
0.1.2.2 take the value in the middle position: 2
when n is odd, take the middle value directly;
when n is even, take the average of the two middle values
0.1.3 Mode: the value that appears most often in the data
0.2 Measures of dispersion
0.2.1 Variance: for {6, 2, 9, 1, 2},
(1) (6 - 4)^2 + (2 - 4)^2 + (9 - 4)^2 + (1 - 4)^2 + (2 - 4)^2 = 4 + 4 + 25 + 9 + 4 = 46
(2) n - 1 = 5 - 1 = 4
(3) 46 / 4 = 11.5
0.2.2 Standard deviation: s = sqrt(11.5) = 3.39
1. Introduction: in regression, the Y variable is a continuous numerical variable,
e.g. house price, number of people, rainfall;
in classification, the Y variable is categorical,
e.g. color, computer brand, creditworthy or not
2. Simple linear regression
2.1 Many decision-making processes are based on the relationship between two or more variables.
2.2 Regression analysis is used to build an equation that models how two or more variables are related.
2.3 The variable being predicted is called the dependent variable (y), the output.
2.4 The variables used to make the prediction are called independent variables (x), the inputs.
3. Introduction to simple linear regression
3.1 Simple linear regression involves one independent variable (x) and one dependent variable (y).
3.2 The relationship between the two variables is modeled by a straight line.
3.3 With two or more independent variables, the analysis is called multiple regression.
4. The simple linear regression model
4.1 The equation that describes how the dependent variable (y) is related to the independent variable (x) and an error term is called the regression model.
4.2 The simple linear regression model is y = β0 + β1 x + ε,
where β0 and β1 are parameters and ε is the error term.
5. The simple linear regression equation
E(y) = β0 + β1 x
The graph of this equation is a straight line, called the regression line, where
β0 is the intercept of the regression line,
β1 is the slope of the regression line, and
E(y) is the expected value (mean) of y for a given value of x.
6. Positive linear relationship
7. Negative linear relationship
8. No relationship
9. The estimated simple linear regression equation
ŷ = b0 + b1 x
This equation is called the estimated regression line, where
b0 is the intercept of the estimated line,
b1 is the slope of the estimated line, and
ŷ is the estimated value of y for a given value of the independent variable x.
10. Workflow of a linear regression analysis: estimate b0 and b1 from sample data and use ŷ = b0 + b1 x for prediction.
11. Assumptions about the error term ε
11.1 ε is a random variable with mean 0;
11.2 the variance of ε is the same for all values of the independent variable x;
11.3 the values of ε are independent;
11.4 ε follows a normal distribution.
7.2 Simple Linear Regression (Part 2)
1. A simple linear regression example:
number of TV ads run by a car dealer (x) versus number of cars sold (y),
with x = 1, 3, 2, 1, 3 and y = 14, 24, 18, 17, 27 (the data used in the code below).
1.1 How do we find the regression line that best fits the data?
Minimize the sum of squared residuals; the slope and intercept are
b1 = Σ(xi - x̄)(yi - ȳ) / Σ(xi - x̄)^2 and b0 = ȳ - b1 x̄.
1.1.2 Calculation (with x̄ = 2 and ȳ = 20):
numerator = (1-2)(14-20) + (3-2)(24-20) + (2-2)(18-20) + (1-2)(17-20) + (3-2)(27-20)
          = 6 + 4 + 0 + 3 + 7
          = 20
denominator = (1-2)^2 + (3-2)^2 + (2-2)^2 + (1-2)^2 + (3-2)^2
            = 1 + 1 + 0 + 1 + 1
            = 4
b1 = 20 / 4 = 5
b0 = 20 - 5 * 2 = 20 - 10 = 10
1.2 Prediction:
if the number of ads in a week is 6, what is the predicted number of cars sold?
x_given = 6
ŷ = 5 * 6 + 10 = 40
1.3 Python implementation:
import numpy as np

def fitSLR(x, y):
    n = len(x)
    denominator = 0
    numerator = 0
    for i in range(0, n):
        numerator += (x[i] - np.mean(x)) * (y[i] - np.mean(y))
        denominator += (x[i] - np.mean(x)) ** 2
    b1 = numerator / float(denominator)
    b0 = np.mean(y) - b1 * np.mean(x)
    return b0, b1

def predict(x, b0, b1):
    return b0 + x * b1

x = [1, 3, 2, 1, 3]
y = [14, 24, 18, 17, 27]
b0, b1 = fitSLR(x, y)
print("intercept:", b0, " slope:", b1)

x_test = 6
y_test = predict(x_test, b0, b1)
print("y_test:", y_test)
7.3 Multiple Regression Analysis
1. Difference from simple linear regression:
several independent variables (x)
2. The multiple regression model
y = β0 + β1 x1 + β2 x2 + ... + βp xp + ε
where β0, β1, β2, ..., βp are parameters and ε is the error term
3. The multiple regression equation
E(y) = β0 + β1 x1 + β2 x2 + ... + βp xp
4. The estimated multiple regression equation:
ŷ = b0 + b1 x1 + b2 x2 + ... + bp xp
A sample is used to compute b0, b1, b2, ..., bp as point estimates of β0, β1, β2, ..., βp.
5. Estimation procedure (analogous to simple linear regression)
6. Estimation method
Minimize the sum of squared residuals; the computation is analogous to simple linear regression but involves linear algebra and matrix operations.
7. Example
A delivery company: X1 = miles traveled, X2 = number of deliveries, Y = total travel time
Driving Assignment | X1 = Miles Traveled | X2 = Number of Deliveries | Y = Travel Time (Hours)
1  | 100 | 4 | 9.3
2  |  50 | 3 | 4.8
3  | 100 | 4 | 8.9
4  | 100 | 2 | 6.5
5  |  50 | 2 | 4.2
6  |  80 | 2 | 6.2
7  |  75 | 3 | 7.4
8  |  65 | 4 | 6.0
9  |  90 | 3 | 7.6
10 |  90 | 2 | 6.1
Time = b0 + b1 * Miles + b2 * Deliveries
Time = -0.869 + 0.0611 * Miles + 0.923 * Deliveries
8. Interpreting the parameters
b1: each additional mile traveled adds, on average, 0.0611 hours to the travel time.
b2: each additional delivery adds, on average, 0.923 hours to the travel time.
9. Prediction
If an assignment is to drive 102 miles and make 6 deliveries, how many hours will it take?
Time = -0.869 + 0.0611 * 102 + 0.923 * 6
     = 10.9 (hours)
10. What if one of the independent variables is categorical? (See the dummy-variable sketch after the table below.)
Miles | Deliveries | Car Type | Time
100 | 4 | 1 | 9.3
 50 | 3 | 0 | 4.8
100 | 4 | 1 | 8.9
100 | 2 | 2 | 6.5
 50 | 2 | 2 | 4.2
 80 | 2 | 1 | 6.2
 75 | 3 | 1 | 7.4
 65 | 4 | 0 | 6
 90 | 3 | 0 | 7.6
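One common way to handle such a categorical column (a sketch, not from the original notes) is to expand it into dummy/one-hot variables before fitting the regression, for example with pandas and scikit-learn; the values below are taken from the table above:

import pandas as pd
from sklearn.linear_model import LinearRegression

data = pd.DataFrame({
    'Miles':      [100, 50, 100, 100, 50, 80, 75, 65, 90],
    'Deliveries': [4, 3, 4, 2, 2, 2, 3, 4, 3],
    'CarType':    [1, 0, 1, 2, 2, 1, 1, 0, 0],   # categorical, coded 0/1/2
    'Time':       [9.3, 4.8, 8.9, 6.5, 4.2, 6.2, 7.4, 6.0, 7.6],
})
# expand the categorical column into 0/1 dummy columns
X = pd.get_dummies(data[['Miles', 'Deliveries', 'CarType']], columns=['CarType'])
regr = LinearRegression().fit(X, data['Time'])
print(regr.intercept_, regr.coef_)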
11. The distribution of the error term
The error ε is a random variable with mean 0;
the variance of ε is the same for all values of the independent variables;
the values of ε are independent;
ε follows a normal distribution, and the expected value of y is given by E(y) = β0 + β1 x1 + β2 x2 + ... + βp xp.
7.4 Multiple Regression Analysis: Application
1. Example
A delivery company: X1 = miles traveled, X2 = number of deliveries, Y = total travel time
Driving Assignment | X1 = Miles Traveled | X2 = Number of Deliveries | Y = Travel Time (Hours)
1  | 100 | 4 | 9.3
2  |  50 | 3 | 4.8
3  | 100 | 4 | 8.9
4  | 100 | 2 | 6.5
5  |  50 | 2 | 4.2
6  |  80 | 2 | 6.2
7  |  75 | 3 | 7.4
8  |  65 | 4 | 6.0
9  |  90 | 3 | 7.6
10 |  90 | 2 | 6.1
Goal: find b0, b1, ..., bp in the estimated equation
ŷ = b0 + b1 x1 + b2 x2 + ... + bp xp
2. Python code:
from numpy import genfromtxt
import numpy as np
from sklearn import datasets, linear_model

dataPath = r"D:\MaiziEdu\DeepLearningBasics_MachineLearning\Datasets\Delivery.csv"
deliveryData = genfromtxt(dataPath, delimiter=',')

print("data")
print(deliveryData)

X = deliveryData[:, :-1]
Y = deliveryData[:, -1]
print("X:")
print(X)
print("Y: ")
print(Y)

regr = linear_model.LinearRegression()
regr.fit(X, Y)
print("coefficients")
print(regr.coef_)
print("intercept: ")
print(regr.intercept_)

xPred = [[102, 6]]
yPred = regr.predict(xPred)
print("predicted y: ")
print(yPred)
7.5 Nonlinear Regression: Logistic Regression
1. Probability:
1.1 Definition: probability (P) measures how likely an event is to occur.
1.2 Range: 0 <= P <= 1
1.3 Ways of assigning it:
1.3.1 from personal belief
1.3.2 from historical data
1.3.3 from simulated data
1.4 Conditional probability: P(A | B) = P(AB) / P(B)
2. Logistic Regression
2.1 Example: classify by thresholding the hypothesis h(x), e.g. predict the positive class when h(x) > 0.5, or use a different threshold such as h(x) > 0.2.
2.2 The basic model
The test data is X(x0, x1, x2, ..., xn);
the parameters to be learned are Θ(θ0, θ1, θ2, ..., θn).
In vector form: z = Θ^T X = θ0 x0 + θ1 x1 + ... + θn xn
To handle binary outcomes, the Sigmoid function g(z) = 1 / (1 + e^(-z)) is introduced to smooth the curve.
Prediction function: hθ(X) = g(Θ^T X) = 1 / (1 + e^(-Θ^T X))
Interpreted as a probability:
positive example (y = 1): P(y = 1 | X; Θ) = hθ(X)
negative example (y = 0): P(y = 0 | X; Θ) = 1 - hθ(X)
2.3 The cost function
Linear regression: J(Θ) = (1 / (2m)) Σ_{i=1..m} (hθ(x^(i)) - y^(i))^2
find θ0, θ1 that minimize this expression.
Logistic regression:
cost function: J(Θ) = -(1/m) Σ_{i=1..m} [ y^(i) log(hθ(x^(i))) + (1 - y^(i)) log(1 - hθ(x^(i))) ]
goal: find Θ that minimizes this expression.
2.4 Solution: gradient descent
Update rule: θ_j := θ_j - α (1/m) Σ_{i=1..m} (hθ(x^(i)) - y^(i)) x_j^(i)
α is the learning rate;
all θ_j are updated simultaneously;
the updates are repeated until convergence (a coded sketch follows).
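A minimal sketch of batch gradient descent for logistic regression following the update rule above (the toy data are placeholders; note that the course code in the next section demonstrates gradient descent on a linear hypothesis rather than the logistic one):

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

# toy data: x0 = 1 is the bias feature
X = np.array([[1.0, 0.5], [1.0, 1.5], [1.0, 2.5], [1.0, 3.5]])
y = np.array([0, 0, 1, 1])

theta = np.zeros(X.shape[1])
alpha = 0.1                              # learning rate
for _ in range(5000):
    h = sigmoid(X.dot(theta))            # h_theta(x) for every example
    gradient = X.T.dot(h - y) / len(y)   # (1/m) * sum (h - y) * x_j
    theta = theta - alpha * gradient     # simultaneous update of all theta_j
print(theta)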
7.6 Nonlinear Regression Application: Logistic Regression Application
Python implementation:
import numpy as np
import random

# m denotes the number of examples here, not the number of features
def gradientDescent(x, y, theta, alpha, m, numIterations):
    xTrans = x.transpose()
    for i in range(0, numIterations):
        hypothesis = np.dot(x, theta)
        loss = hypothesis - y
        # avg cost per example (the 2 in 2*m doesn't really matter here.
        # But to be consistent with the gradient, I include it)
        cost = np.sum(loss ** 2) / (2 * m)
        print("Iteration %d | Cost: %f" % (i, cost))
        # avg gradient per example
        gradient = np.dot(xTrans, loss) / m
        # update
        theta = theta - alpha * gradient
    return theta

def genData(numPoints, bias, variance):
    x = np.zeros(shape=(numPoints, 2))
    y = np.zeros(shape=numPoints)
    # basically a straight line
    for i in range(0, numPoints):
        # bias feature
        x[i][0] = 1
        x[i][1] = i
        # our target variable
        y[i] = (i + bias) + random.uniform(0, 1) * variance
    return x, y

# gen 100 points with a bias of 25 and 10 variance as a bit of noise
x, y = genData(100, 25, 10)
m, n = np.shape(x)
numIterations = 100000
alpha = 0.0005
theta = np.ones(n)
theta = gradientDescent(x, y, theta, alpha, m, numIterations)
print(theta)
7.7 Correlation and the R-Squared Value in Regression
1. Pearson correlation coefficient:
1.1 measures the strength of the linear relationship between two variables
1.2 range [-1, 1]:
positive correlation: > 0, negative correlation: < 0, no correlation: = 0
1.3 r = Σ(xi - x̄)(yi - ȳ) / sqrt( Σ(xi - x̄)^2 · Σ(yi - ȳ)^2 )
2. A worked example:
X | Y
1 | 10
3 | 12
8 | 24
7 | 21
9 | 34
3. Other examples.
4. The R-squared value:
4.1 Definition: the coefficient of determination; the proportion of the total variation of the dependent variable that can be explained by the regression relationship with the independent variable(s).
4.2 Interpretation: an R-squared of 0.8 means the regression relationship explains 80% of the variation in the dependent variable; in other words, if we could hold the independent variables fixed, the variation of the dependent variable would be reduced by 80%.
4.3 Simple linear regression: R^2 = r * r
Multiple linear regression: R^2 = SSR / SST = 1 - SSE / SST
5. R-squared also has limitations: it grows as more independent variables are added, and it depends on the sample size, so it is usually corrected. One common correction is the adjusted R-squared: R^2_adj = 1 - (1 - R^2)(n - 1) / (n - p - 1).
7.8 Correlation and R-Squared: Application
Python implementation:
import numpy as np
import math

def computeCorrelation(X, Y):
    xBar = np.mean(X)
    yBar = np.mean(Y)
    SSR = 0
    varX = 0
    varY = 0
    for i in range(0, len(X)):
        diffXXBar = X[i] - xBar
        diffYYBar = Y[i] - yBar
        SSR += (diffXXBar * diffYYBar)
        varX += diffXXBar ** 2
        varY += diffYYBar ** 2
    SST = math.sqrt(varX * varY)
    return SSR / SST

testX = [1, 3, 8, 7, 9]
testY = [10, 12, 24, 21, 34]
print(computeCorrelation(testX, testY))
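For simple linear regression R^2 = r * r (section 4.3); as a hedged extension of the code above, R^2 can also be computed from a fitted line with np.polyfit (this helper is an addition, not part of the original):

import numpy as np

def polyfit_r2(x, y, degree=1):
    # fit y = b1*x + b0 and compute R^2 = SSR / SST
    coeffs = np.polyfit(x, y, degree)
    y_hat = np.polyval(coeffs, x)
    y_mean = np.mean(y)
    ssr = np.sum((y_hat - y_mean) ** 2)
    sst = np.sum((np.array(y) - y_mean) ** 2)
    return ssr / sst

testX = [1, 3, 8, 7, 9]
testY = [10, 12, 24, 21, 34]
print(polyfit_r2(testX, testY))  # should equal computeCorrelation(testX, testY) ** 2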
8.1 Clustering: the K-means Algorithm
1. Category:
clustering belongs to unsupervised learning;
the data carry no class labels.
2. Example: grouping unlabeled points into clusters.
3. The K-means algorithm:
3.1 A classic clustering algorithm, one of the ten classic data mining algorithms.
3.2 The algorithm takes a parameter k and partitions the n data objects given as input into k clusters, so that similarity within a cluster is high while similarity between different clusters is low.
3.3 Idea:
cluster around k center points in space, assigning each object to the nearest center; iteratively update the value of each cluster center until the best clustering is obtained.
3.4 Description:
(1) choose suitable initial centers for the c classes;
(2) in the k-th iteration, compute the distance from every sample to each of the c centers and assign the sample to the class of the nearest center;
(3) update the center of each class, for example as the mean of its members;
(4) if all c cluster centers are unchanged after the updates of (2) and (3), the iteration ends; otherwise continue iterating.
3.5 Procedure:
input: k, data[n];
(1) choose k initial center points, e.g. c[0] = data[0], ..., c[k-1] = data[k-1];
(2) compare each of data[0] ... data[n] with c[0] ... c[k-1]; if it is closest to c[i], mark it i;
(3) for all points marked i, recompute c[i] = {sum of all data[j] marked i} / number of points marked i;
(4) repeat (2) and (3) until the change of every c[i] is below a given threshold.
4. Worked example: assign points, update centers, and stop when the assignments no longer change.
Advantages: fast and simple.
Disadvantages: the final result depends on the choice of the initial points, the algorithm can get stuck in a local optimum, and k must be known in advance. (A scikit-learn usage sketch follows.)
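For comparison with the from-scratch implementation in the next section, the same kind of clustering can be done with scikit-learn's KMeans; a usage sketch (the four 2-D test points match the ones used in the next section):

import numpy as np
from sklearn.cluster import KMeans

X = np.array([[1, 1], [2, 1], [4, 3], [5, 4]])
km = KMeans(n_clusters=2, n_init=10).fit(X)
print(km.labels_)           # cluster index of each point
print(km.cluster_centers_)  # the two centroids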
8.2 Clustering: K-means Application
import numpy as np

# Function: K Means
# -------------
# K-Means is an algorithm that takes in a dataset and a constant
# k and returns k centroids (which define clusters of data in the
# dataset which are similar to one another).
def kmeans(X, k, maxIt):
    numPoints, numDim = X.shape
    dataSet = np.zeros((numPoints, numDim + 1))
    dataSet[:, :-1] = X

    # Initialize centroids randomly
    centroids = dataSet[np.random.randint(numPoints, size=k), :]
    centroids = dataSet[0:2, :]
    # Randomly assign labels to initial centroids
    centroids[:, -1] = range(1, k + 1)

    # Initialize book keeping vars.
    iterations = 0
    oldCentroids = None

    # Run the main k-means algorithm
    while not shouldStop(oldCentroids, centroids, iterations, maxIt):
        print("iteration: \n", iterations)
        print("dataSet: \n", dataSet)
        print("centroids: \n", centroids)
        # Save old centroids for convergence test. Book keeping.
        oldCentroids = np.copy(centroids)
        iterations += 1

        # Assign labels to each datapoint based on centroids
        updateLabels(dataSet, centroids)

        # Assign centroids based on datapoint labels
        centroids = getCentroids(dataSet, k)

    # We can get the labels too by calling getLabels(dataSet, centroids)
    return dataSet

# Function: Should Stop
# -------------
# Returns True or False if k-means is done. K-means terminates either
# because it has run a maximum number of iterations OR the centroids
# stop changing.
def shouldStop(oldCentroids, centroids, iterations, maxIt):
    if iterations > maxIt:
        return True
    return np.array_equal(oldCentroids, centroids)

# Function: Get Labels
# -------------
# Update a label for each piece of data in the dataset.
def updateLabels(dataSet, centroids):
    # For each element in the dataset, chose the closest centroid.
    # Make that centroid the element's label.
    numPoints, numDim = dataSet.shape
    for i in range(0, numPoints):
        dataSet[i, -1] = getLabelFromClosestCentroid(dataSet[i, :-1], centroids)

def getLabelFromClosestCentroid(dataSetRow, centroids):
    label = centroids[0, -1]
    minDist = np.linalg.norm(dataSetRow - centroids[0, :-1])
    for i in range(1, centroids.shape[0]):
        dist = np.linalg.norm(dataSetRow - centroids[i, :-1])
        if dist < minDist:
            minDist = dist
            label = centroids[i, -1]
    print("minDist:", minDist)
    return label

# Function: Get Centroids
# -------------
# Returns k random centroids, each of dimension n.
def getCentroids(dataSet, k):
    # Each centroid is the geometric mean of the points that
    # have that centroid's label. Important: If a centroid is empty (no points have
    # that centroid's label) you should randomly re-initialize it.
    result = np.zeros((k, dataSet.shape[1]))
    for i in range(1, k + 1):
        oneCluster = dataSet[dataSet[:, -1] == i, :-1]
        result[i - 1, :-1] = np.mean(oneCluster, axis=0)
        result[i - 1, -1] = i
    return result

x1 = np.array([1, 1])
x2 = np.array([2, 1])
x3 = np.array([4, 3])
x4 = np.array([5, 4])
testX = np.vstack((x1, x2, x3, x4))
result = kmeans(testX, 2, 10)
print("final result:")
print(result)
8.3 Clustering: Hierarchical Clustering
Given N samples to cluster, hierarchical (agglomerative) clustering proceeds as follows:
1. (initialization) treat each sample as its own cluster and compute the distance between every pair of clusters, i.e. the similarity between the samples;
2. find the two closest clusters and merge them into one (so the total number of clusters drops by one);
3. recompute the similarity between the newly created cluster and each of the old clusters;
4. repeat steps 2 and 3 until all sample points belong to a single cluster, then stop.
The whole clustering process builds a tree. While the tree is being built, a threshold can be set in step 2: when the distance between the two closest clusters exceeds the threshold, the iteration can be terminated. The other key step is step 3: there are quite a few ways to define the similarity between two clusters. Three of them are described here (a scipy usage sketch follows the descriptions):
Single linkage, also called nearest-neighbor: the distance between two clusters is the distance between their two closest samples, so the closer the nearest pair of samples, the more similar the two clusters. This easily produces the so-called chaining effect: two clusters that are clearly far apart "overall" get merged because a few individual points happen to be close, and after such a merge the chaining effect grows further, finally yielding rather loose clusters.
Complete linkage is the exact opposite extreme of single linkage: the distance between two clusters is the distance between their two farthest points. Its behavior is also exactly opposite: it is extremely restrictive; even if two clusters are already very close, a single badly matched pair of points prevents them from ever being merged, which is not a great approach either. The common problem of these two definitions is that they only consider a few extreme data points and ignore the overall character of the data inside the clusters.
Average linkage: take all pairwise distances between the points of the two clusters and average them, which tends to give more reasonable results.
A variant of average linkage uses the median of the pairwise distances instead of the mean, which is less affected by a few outlying samples.
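All three linkage criteria above are available in scipy.cluster.hierarchy; a minimal usage sketch (the five 2-D points are made up for illustration):

import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster

X = np.array([[1, 1], [1.5, 1], [5, 5], [5.5, 5], [9, 9]])
# method can be 'single', 'complete' or 'average', matching the three criteria above
Z = linkage(X, method='average')
# cut the tree at distance 2: points merged below that distance end up in the same cluster
labels = fcluster(Z, t=2, criterion='distance')
print(labels)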
8.4 Clustering: Hierarchical Clustering Application
from numpy import *
"""
Code for hierarchical clustering, modified from
Programming Collective Intelligence by Toby Segaran
(O'Reilly Media 2007, page 33).
"""
class cluster_node:
    def __init__(self, vec, left=None, right=None, distance=0.0, id=None, count=1):
        self.left = left
        self.right = right
        self.vec = vec
        self.id = id
        self.distance = distance
        self.count = count  # only used for weighted average
def L2dist(v1, v2):
    return sqrt(sum((v1 - v2) ** 2))

def L1dist(v1, v2):
    return sum(abs(v1 - v2))

# def Chi2dist(v1, v2):
#     return sqrt(sum((v1 - v2) ** 2))
def hcluster(features, distance=L2dist):
    # cluster the rows of the "features" matrix
    distances = {}
    currentclustid = -1

    # clusters are initially just the individual rows
    clust = [cluster_node(array(features[i]), id=i) for i in range(len(features))]

    while len(clust) > 1:
        lowestpair = (0, 1)
        closest = distance(clust[0].vec, clust[1].vec)

        # loop through every pair looking for the smallest distance
        for i in range(len(clust)):
            for j in range(i + 1, len(clust)):
                # distances is the cache of distance calculations
                if (clust[i].id, clust[j].id) not in distances:
                    distances[(clust[i].id, clust[j].id)] = distance(clust[i].vec, clust[j].vec)
                d = distances[(clust[i].id, clust[j].id)]
                if d < closest:
                    closest = d
                    lowestpair = (i, j)

        # calculate the average of the two clusters
        mergevec = [(clust[lowestpair[0]].vec[i] + clust[lowestpair[1]].vec[i]) / 2.0
                    for i in range(len(clust[0].vec))]

        # create the new cluster
        newcluster = cluster_node(array(mergevec), left=clust[lowestpair[0]],
                                  right=clust[lowestpair[1]],
                                  distance=closest, id=currentclustid)

        # cluster ids that weren't in the original set are negative
        currentclustid -= 1
        del clust[lowestpair[1]]
        del clust[lowestpair[0]]
        clust.append(newcluster)

    return clust[0]
def extract_clusters(clust, dist):
    # extract list of sub-tree clusters from hcluster tree with distance < dist
    if clust.distance < dist:
        # we have found a cluster subtree
        return [clust]
    else:
        # check the right and left branches
        cl = []
        cr = []
        if clust.left != None:
            cl = extract_clusters(clust.left, dist=dist)
        if clust.right != None:
            cr = extract_clusters(clust.right, dist=dist)
        return cl + cr

def get_cluster_elements(clust):
    # return ids for elements in a cluster sub-tree
    if clust.id >= 0:
        # positive id means that this is a leaf
        return [clust.id]
    else:
        # check the right and left branches
        cl = []
        cr = []
        if clust.left != None:
            cl = get_cluster_elements(clust.left)
        if clust.right != None:
            cr = get_cluster_elements(clust.right)
        return cl + cr
def printclust(clust, labels=None, n=0):
    # indent to make a hierarchy layout
    for i in range(n):
        print(' ', end='')
    if clust.id < 0:
        # negative id means that this is a branch
        print('-')
    else:
        # positive id means that this is an endpoint
        if labels == None:
            print(clust.id)
        else:
            print(labels[clust.id])
    # now print the right and left branches
    if clust.left != None:
        printclust(clust.left, labels=labels, n=n + 1)
    if clust.right != None:
        printclust(clust.right, labels=labels, n=n + 1)
def getheight(clust):
    # Is this an endpoint? Then the height is just 1
    if clust.left == None and clust.right == None:
        return 1
    # Otherwise the height is the sum of the heights of
    # each branch
    return getheight(clust.left) + getheight(clust.right)

def getdepth(clust):
    # The distance of an endpoint is 0.0
    if clust.left == None and clust.right == None:
        return 0
    # The distance of a branch is the greater of its two sides
    # plus its own distance
    return max(getdepth(clust.left), getdepth(clust.right)) + clust.distance
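The file above only defines the tree-building utilities; a short usage sketch appended to the same file (the tiny feature matrix is made up for illustration):

from numpy import array

features = array([[1.0, 1.0],
                  [1.5, 1.0],
                  [5.0, 5.0],
                  [5.5, 5.0]])
tree = hcluster(features)                  # build the full hierarchy
printclust(tree)                           # text dump of the tree
clusters = extract_clusters(tree, dist=2)  # sub-trees whose merge distance is below 2
print([get_cluster_elements(c) for c in clusters])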