Alias Method: an O(1)-time discrete sampling method
Python 3 implementation of alias sampling:

```python
import random
from typing import List


class Solution:
    def __init__(self, w: List[int]):
        N = len(w)
        sum_ = sum(w)
        prob = [p / sum_ for p in w]
        alias = [0] * N
        # scale probabilities so the average bucket height is 1
        alias_prob = [p * N for p in prob]
        small_q = []  # buckets with height < 1
        large_q = []  # buckets with height >= 1
        for i, p in enumerate(alias_prob):
            if p < 1:
                small_q.append(i)
            else:
                large_q.append(i)
        # pair each small bucket with a large one that tops it up to height 1
        while small_q and large_q:
            small = small_q.pop(0)
            large = large_q.pop(0)
            alias[small] = large
            alias_prob[large] -= (1 - alias_prob[small])
            if alias_prob[large] < 1:
                small_q.append(large)
            else:
                large_q.append(large)
        self.alias = alias
        self.N = N
        self.alias_prob = alias_prob

    def pickIndex(self) -> int:
        # pick a bucket uniformly, then return the bucket itself or its alias
        ix = random.randint(0, self.N - 1)
        p = random.random()
        if p < self.alias_prob[ix]:
            return ix
        else:
            return self.alias[ix]
```
Some other algorithms related to random numbers:
Linear congruential generator (LCG)

$RandSeed = (A \times RandSeed + B) \bmod M$

The maximum period of an LCG is M, but in most cases the actual period is shorter. To reach the full period M, the parameters should satisfy the following conditions (the Hull–Dobell theorem): B and M are coprime; A − 1 is divisible by every prime factor of M; and if M is divisible by 4, then A − 1 is also divisible by 4.
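A minimal LCG sketch (the constants below are my own pick, one common textbook choice with $M=2^{32}$; they satisfy the full-period conditions above):

```python
class LCG:
    def __init__(self, seed=42, A=1664525, B=1013904223, M=2 ** 32):
        self.state = seed
        self.A, self.B, self.M = A, B, M

    def next_int(self):
        # RandSeed = (A * RandSeed + B) % M
        self.state = (self.A * self.state + self.B) % self.M
        return self.state

    def next_float(self):
        # map to [0, 1)
        return self.next_int() / self.M


rng = LCG(seed=2023)
print([round(rng.next_float(), 4) for _ in range(5)])
```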
Inverse transform method
In general, for a probability distribution with CDF $y=F(x)$, y ranges over (0, 1). Find the inverse function $G=F^{-1}$, feed uniform random numbers on (0, 1) into it, and the output follows the target distribution:

$y = G(x)$
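As a concrete illustration (the exponential distribution here is my own example, not from the original): for $F(x)=1-e^{-\lambda x}$ the inverse is $G(u)=-\ln(1-u)/\lambda$, so feeding uniforms into $G$ yields exponential samples.

```python
import numpy as np
import pylab as plt

lam = 1.5
u = np.random.rand(10000)     # uniform input on [0, 1)
x = -np.log(1 - u) / lam      # G(u) = -ln(1 - u) / lam follows Exp(lam)

plt.hist(x, bins=50)
plt.show()
```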
Central limit theorem
Generate a large number of batches; in each batch, draw 12 random numbers on $[0,1]$ and sum them. The sum has variance $1/12\times 12=1$ and mean $1/2\times 12=6$, so subtract 6. With enough batches, the values approximately follow $\mathcal{N}(0,1)$.
```python
import numpy as np
import pylab as plt

n = 12
N = 5000
x = np.zeros([N])
for j in range(N):
    a = np.random.rand(n)
    u = sum(a)
    x[j] = u - n * 0.5
plt.hist(x)
plt.show()
```
Box-Muller: two independent uniform samples $x_1, x_2$ on $(0,1)$ give two independent standard normals via $y_1=\sqrt{-2\ln x_1}\cos(2\pi x_2)$ and $y_2=\sqrt{-2\ln x_1}\sin(2\pi x_2)$.
```python
import numpy as np
import pylab as plt

N = 1000
x1 = np.random.rand(1, N)
x2 = np.random.rand(1, N)
y1 = np.sqrt(-2 * np.log(x1)) * np.cos(2 * np.pi * x2)
y2 = np.sqrt(-2 * np.log(x1)) * np.sin(2 * np.pi * x2)
y = np.hstack([y1, y2]).ravel()  # flatten so plt.hist sees one 1-D dataset
plt.hist(y)
plt.show()
```
Inverse sampling and rejection sampling, explained
```python
from math import sqrt
from random import random


def sample_pi(max_cnt=100000):
    # the fraction of uniform points in the unit square that fall inside
    # the quarter circle of radius 1 approximates pi / 4
    acc_cnt = 0
    for i in range(max_cnt):
        x = random()
        y = random()
        if sqrt(x ** 2 + y ** 2) < 1:
            acc_cnt += 1
    print("pi =", acc_cnt / max_cnt * 4)


sample_pi()
```
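The π estimate above uses the same accept/reject idea. For completeness, a minimal rejection-sampling sketch that actually draws from a target density (the target $p(x)\propto \sin^2(x)$ on $[0,\pi]$ is my own illustrative choice, not from the original):

```python
import numpy as np
import pylab as plt


def p_unnorm(x):
    # unnormalized target density, proportional to sin(x)^2 on [0, pi]
    return np.sin(x) ** 2


M = 1.0  # an upper bound on p_unnorm over [0, pi]
samples = []
while len(samples) < 10000:
    x = np.random.rand() * np.pi  # proposal: uniform on [0, pi]
    u = np.random.rand()
    if u < p_unnorm(x) / M:       # accept with probability p_unnorm(x) / M
        samples.append(x)

plt.hist(samples, bins=50)
plt.show()
```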
Given two independent standard normals $z_0$ and $z_1$, $\frac{1}{\pi}\arctan\left(\frac{z_0}{z_1}\right)+0.5$ is uniformly distributed on $(0,1)$.

To generate a 2-D standard normal, simply take two independent standard normal variables X and Y and pair them as (X, Y).

The 2-D standard normal is isotropic in Cartesian coordinates, i.e. the angle between the point (X, Y) and the X axis (or any fixed direction) is uniformly distributed.

This is easy to see from $\tan(\theta)=\frac{y}{x}$.

In other words, this is Box-Muller applied in reverse.
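A minimal sketch of this reverse direction (variable names are mine): the angle of the isotropic pair maps back to a uniform sample.

```python
import numpy as np
import pylab as plt

N = 10000
z0 = np.random.randn(N)
z1 = np.random.randn(N)

# z0 / z1 is standard Cauchy, so arctan(z0 / z1) is uniform on (-pi/2, pi/2);
# rescaling gives a Uniform(0, 1) sample
u = np.arctan(z0 / z1) / np.pi + 0.5

plt.hist(u, bins=20)
plt.show()
```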
Computing AUC
Sort the (label, sample) pairs by predicted prob. If the model ranks well, the sorted labels should be $[0,0,1,1,1]$, with $M=3$ positives and $N=2$ negatives. Taking only the ranks of the positive samples gives rankList $=[3,4,5]$, which sums to 12. By the rank formula

$AUC=\frac{\sum_{i\in positives}rank_i-\frac{M(M+1)}{2}}{M\times N}$

the subtracted term is $\frac{M(M+1)}{2}=6$, so the numerator is $12-6=6$, the denominator is $M\times N=6$, and the result is 1, consistent with the definition.
```python
def calAUC(prob, labels):
    f = list(zip(prob, labels))
    # sort by predicted probability (ascending) and keep only the labels
    rank = [values2 for values1, values2 in sorted(f, key=lambda x: x[0])]
    # 1-based ranks of the positive samples
    rankList = [i + 1 for i in range(len(rank)) if rank[i] == 1]
    posNum = 0
    negNum = 0
    for i in range(len(labels)):
        if labels[i] == 1:
            posNum += 1
        else:
            negNum += 1
    auc = (sum(rankList) - (posNum * (posNum + 1)) / 2) / (posNum * negNum)
    print(auc)
    return auc
```

Reservoir sampling
Sample $K$ of $N$ numbers uniformly without replacement in $O(N)$ time.

Used when the size of the data is not known in advance, while guaranteeing that every sample is selected with equal probability.

Suppose the data stream has size $n$ and we need to sample $k$ elements.

First build an array that can hold $k$ elements and fill it with the first $k$ elements of the stream.

Then, starting from the $(k+1)$-th element, decide with probability $k/n$ (where $n$ is the number of elements seen so far) whether the new element replaces one of the array slots (each slot is equally likely to be replaced). After the whole stream has been traversed, the elements left in the array are the desired sample. A generic sketch is shown below; the linked-list solution that follows is the $k=1$ special case.
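A minimal generic sketch of this procedure (the `reservoir_sample` helper and its names are mine):

```python
import random


def reservoir_sample(stream, k):
    res = []
    for i, x in enumerate(stream):
        if i < k:
            # fill the reservoir with the first k elements
            res.append(x)
        else:
            # index in [0, i]; it falls in [0, k-1] with probability k / (i + 1)
            j = random.randint(0, i)
            if j < k:
                res[j] = x
    return res


print(reservoir_sample(range(100), 5))
```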
```python
import random


class Solution:
    def __init__(self, head: ListNode):
        """
        @param head The linked list's head.
        Note that the head is guaranteed to be not null, so it contains at least one node.
        """
        self.res = []
        self.K = 1
        self.head = head

    def getRandom(self) -> int:
        """
        Returns a random node's value.
        """
        # fill the reservoir with the first K node values
        k = self.K
        p = self.head
        while k and p:
            self.res.append(p.val)
            k -= 1
            p = p.next
        # reservoir sampling over the whole list: the i-th node (0-based)
        # replaces a random reservoir slot with probability K / (i + 1)
        p = self.head
        i = 0
        while p:
            ix = random.randint(0, i)
            if ix < self.K:
                self.res[ix] = p.val
            i += 1
            p = p.next
        return self.res[0]
```
This problem is also reservoir sampling.
The shuffle algorithm is essentially reservoir sampling too, with the replacement step changed to a swap.
Full shuffle and sampling m elements via shuffle:
```python
from random import randint


def shuffle(nums):
    # Fisher-Yates style: element i swaps with a random index in [0, i]
    n = len(nums)
    for i in range(n):
        ri = randint(0, i)
        nums[i], nums[ri] = nums[ri], nums[i]
    return nums


def shuffle_m(nums, m):
    # reservoir-style partial shuffle: the first m slots act as the reservoir,
    # element i enters it with probability m / (i + 1)
    n = len(nums)
    for i in range(n):
        ri = randint(0, i)
        if ri < m:
            nums[i], nums[ri] = nums[ri], nums[i]
    return nums[:m]


print(shuffle(list(range(10))))
print(shuffle_m(list(range(10)), 3))
```

1 Hand-deriving formulas and implementing simple models
Mainly memorize these pieces of code.
$L=-\frac{1}{N}\Sigma[y\cdot \log(\hat{y})+(1-y)\cdot \log(1-\hat{y})]$
Essentially the same shape as the entropy formula $-p\log p$, so it is not hard to remember.
```python
loss = -(y @ np.log(y_hat) + (1 - y) @ np.log(1 - y_hat)) / n
```
The prediction is just the sigmoid, $\sigma(XW)$:
```python
return 1 / (1 + np.exp(-X @ self.w))
```
$\frac{\partial{L}}{\partial{W}}=(\hat{y}-y)X$
```python
dw = (y_hat - y) @ X
dw /= n
```
```python
import numpy as np


class LR:
    def __init__(self, lr=0.1, max_iter=1000, eps=1e-5):
        self.eps = eps
        self.max_iter = max_iter
        self.lr = lr

    def predict_proba(self, X):
        return 1 / (1 + np.exp(-X @ self.w))

    def fit(self, X: np.ndarray, y: np.ndarray):
        n, m = X.shape
        X = np.hstack([X, np.ones([n, 1])])  # append a bias column
        m += 1
        self.w = np.zeros([m])
        pre_loss = 0
        for i in range(self.max_iter):
            y_hat = self.predict_proba(X)
            # cross-entropy loss
            loss = -(y @ np.log(y_hat) + (1 - y) @ np.log(1 - y_hat)) / n
            if abs(loss - pre_loss) < self.eps:
                print('loss is no longer decreasing')
                break
            pre_loss = loss
            print(loss)
            # gradient
            dw = (y_hat - y) @ X
            dw /= n
            # gradient descent step
            self.w -= dw * self.lr
        print(self.w)
```
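A quick usage sketch on made-up toy data (not from the original); note that `predict_proba` expects the bias column that `fit` appends internally:

```python
import numpy as np

np.random.seed(0)
# two Gaussian blobs as a toy binary classification problem
X = np.vstack([np.random.randn(50, 2) - 2, np.random.randn(50, 2) + 2])
y = np.hstack([np.zeros(50), np.ones(50)])

model = LR(lr=0.5, max_iter=200)
model.fit(X, y)

# append the bias column before calling predict_proba directly
X_aug = np.hstack([X, np.ones([X.shape[0], 1])])
acc = ((model.predict_proba(X_aug) > 0.5) == y).mean()
print('train accuracy:', acc)
```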
Code: https://github.com/auto-flow/numpy-machine-learning/
| Symbol | Meaning |
|---|---|
| $Z$ | output of the linear transform |
| $A$ | output of the activation layer |
| $L$ | loss value |
| $\mathcal{L}$ | loss function |
Forward pass:

$Z=XW+b$

$A=\sigma(Z)$

$L=\mathcal{L}(Y,A)$

Backward pass:

Gradient at the neurons:

$\frac{\partial{L}}{\partial{Z}} = \frac{\partial{L}}{\partial{A}} \frac{\partial{A}}{\partial{Z}}$

Gradient of the weights:

$\frac{\partial{L}}{\partial{W}} = \frac{\partial{L}}{\partial{A}} \frac{\partial{A}}{\partial{Z}} \frac{\partial{Z}}{\partial{W}}$
Once the concepts above are clear, we can write a simple neural network with PyTorch-style interfaces:
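The layer classes below all inherit from a `Module` base class that is not reproduced in this excerpt (it lives in the repository linked above). A minimal sketch of what it plausibly provides, inferred from how the layers use it:

```python
class Module:
    """Sketch of the assumed base class; the real definition is in the linked repo."""

    def __init__(self):
        self.input = None
        self.output = None

    def __call__(self, x):
        return self.forward(x)

    def forward(self, x):
        raise NotImplementedError

    def backward(self, grad):
        raise NotImplementedError


# Linear's own __init__ (not shown in this excerpt) would additionally create
# self.weight, self.bias and zero-initialized self.wgrad, self.bgrad arrays.
```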
Forward and backward pass of the Linear layer:
The forward pass is simply $Z=XW+b$:
```python
class Linear(Module):
    def forward(self, x):
        self.input = x
        # Z = XW + b
        self.output = np.dot(self.input, self.weight) + self.bias
        return self.output
```
For the backward pass, first compute the gradients of $W$ and $b$, then the gradient on the neurons.

Gradient of $b$:

$\frac{\partial{L}}{\partial{b}} = \frac{\partial{L}}{\partial{Z}} \frac{\partial{Z}}{\partial{b}}$, where $\frac{\partial{Z}}{\partial{b}} = 1$, so the gradient of $b$ is simply the gradient on the output neurons, $\frac{\partial{L}}{\partial{Z}}$, i.e. `grad` in the code.

Gradient of $W$:

$\frac{\partial{L}}{\partial{W}} = \frac{\partial{L}}{\partial{Z}} \frac{\partial{Z}}{\partial{W}}$, where $\frac{\partial{Z}}{\partial{W}} = X$.

One point to note, with an example: if the current layer maps 4 inputs to 9 outputs, then $grad=[B, 9]$, $W=[4, 9]$, $b=[9]$.

For `bgrad`, simply take the mean over `axis=0`, giving $bgrad=[9]$.

For `wgrad`, compute $X\times grad=[B,4]\times[B,9]=[B,4,9]$, then take the mean over `axis=0` to get $wgrad=[4, 9]$.

For the gradient passed back to the previous layer, $\frac{\partial{L}}{\partial{X}} = \frac{\partial{L}}{\partial{Z}} \frac{\partial{Z}}{\partial{X}} = \frac{\partial{L}}{\partial{Z}} W$, where $\frac{\partial{L}}{\partial{Z}}$ is the `grad` coming from the next layer.
```python
def backward(self, grad):
    # dL/db = dL/dZ, averaged over the batch
    self.bgrad = np.mean(grad, axis=0)
    # dL/dW = dL/dZ * dZ/dW = X * dL/dZ
    self.wgrad += np.mean(mul(self.input, grad), axis=0)
    # dL/dX = dL/dZ * W
    # i.e. transform the gradient with the weights W to pass it back
    # to the previous layer's neurons
    grad = np.dot(grad, self.weight.T)
    return grad
```
Note that the numpy operation $[B,4]\times[B,9]=[B,4,9]$ is computed like this:
```python
def mul(A: np.ndarray, B: np.ndarray):
    # A: 100x9  B: 100x1
    # mul(A, B): 100x9x1
    na = A.shape[1]
    nb = B.shape[1]
    A = np.repeat(A[:, :, np.newaxis], nb, 2)
    B = np.repeat(B[:, np.newaxis, :], na, 1)
    return A * B
```
ReLU forward pass: set the entries with input < 0 to 0. This introduces sparsity; for inputs following a standard normal distribution, about half of the neurons become 0.
ReLU backward pass: using the recorded indices where input < 0, set the gradients of those neurons to 0.
```python
class Relu(Module):
    def __init__(self):
        super(Relu, self).__init__()
        self.input = None
        self.output = None

    def forward(self, x):
        self.input = x
        x[self.input <= 0] *= 0    # zero out the negative part
        self.output = x
        return self.output

    def backward(self, grad):
        grad[self.input > 0] *= 1  # multiplying by 1 is a no-op, kept only for symmetry
        grad[self.input <= 0] *= 0
        return grad
```
$\sigma(x)=\frac{1}{1+exp(-x)}$

$\sigma^{\prime}(x)=\sigma(x)\cdot (1-\sigma(x))$
So `output` in the code records the value of $\sigma(x)$; the backward pass is essentially multiplying `grad` by $\sigma(x)\cdot (1-\sigma(x))$.
```python
class Sigmoid(Module):
    def __init__(self):
        super(Sigmoid, self).__init__()
        self.input = None
        self.output = None

    def forward(self, x):
        self.input = x
        self.output = 1 / (1 + np.exp(-self.input))
        return self.output

    def backward(self, grad):
        grad *= self.output * (1 - self.output)
        return grad
```
$tanh(x)=\frac{exp(x)-exp(-x)}{exp(x)+exp(-x)}$
Mnemonic: minus on top, plus on the bottom; positive exponent on the left, negative exponent on the right.
$tanh^{\prime}(x)=1-tanh^2(x)$
```python
class Tanh(Module):
    def __init__(self):
        super(Tanh, self).__init__()
        self.input = None
        self.output = None

    def forward(self, x):
        self.input = x
        self.output = (np.exp(x) - np.exp(-x)) / (np.exp(x) + np.exp(-x))
        return self.output

    def backward(self, grad):
        grad *= (1 - np.square(self.output))
        return grad
```
$L=\mathcal{L}(y,\hat{y})=\frac{1}{2N}\Sigma(y-\hat{y})^2$

$\frac{\partial{L}}{\partial{\hat{y}}}=\hat{y}-y$
Note that $\hat{y}$ comes first, because the inner derivative of $(y-\hat{y})$ contributes the minus sign.
```python
class MSE(object):
    def __init__(self):
        self.label = None
        self.pred = None
        self.grad = None
        self.loss = None

    def __call__(self, pred, label):
        return self.forward(pred, label)

    def forward(self, pred, label):
        self.pred, self.label = pred, label
        self.loss = np.mean(0.5 * np.square(self.pred - self.label))
        return self.loss

    def backward(self, grad=None):
        self.grad = (self.pred - self.label)
        return self.grad
```

2 Probability and statistics