吳恩達Deep Learning課程練習題參考答案——R語言版

前言

吳恩達有一套課程Deep Learning,對機器學習的基礎理論作了很是好的介紹,上課視頻質量很是好,並且練習題都設計得頗有水平,並提供了Matlab的答案。本文針對這些練習題,提供了一份R語言版的答案。php

練習2-線性迴歸

題目與數據請點這裏html

# read data: x is age, y is height
x <- read.csv("ex2x.dat", header=F)
y <- read.csv("ex2y.dat", header=F)
colnames(x) <- c("age")
colnames(y) <- c("height")

# scatter plot
input <- data.frame(age=x$age,height=y$height)
library(ggplot2)
p <- ggplot(aes(x=age, y=height), data=input)
p + geom_point(size=3,shape=3) + theme_bw() + xlab("Age in years") + ylab("Height in meters")

# gradient descend algorithm implementation
x <- cbind(c(rep(1, nrow(x))), x$age)
y <- as.matrix(y)
theta <- c(rep(0,ncol(x)))
alpha <- 0.07
MAX_ITR <- 300
for(i in 1:MAX_ITR){
    grad <- 1/nrow(x) * t(x) %*% (x %*% theta - y) 
    theta <- theta - alpha * grad
}
print(theta)

# add fitted line to plot
p + geom_point(size=3,shape=4) + geom_abline(intercept=theta[1], slope=theta[2], size=1) + geom_smooth(method="lm", size=1) + theme_bw()+ xlab("Age in years") + ylab("Height in meters")

擬合效果以下圖,藍色線爲最小二乘法獲得的直線,黑色爲最大迭代次數MAX_ITR設爲300時的直線。當MAX_ITR設爲1500時,黑色直線將於藍色直線很是接近。機器學習

clipboard.png

練習3-多元線性迴歸

題目與數據請點這裏ide

# read raw data
x <- read.table("ex3x.dat", header=F)
y <- read.table("ex3y.dat", header=F)
colnames(x) <- c("area", "room_num")
colnames(y) <- c("price")

# scatter plot
# input <- data.frame(area=x$area, room_num=x$room_num, price=y$price)
# plot(input)

# normalization
x <- cbind(rep(1, nrow(x)), x$area, x$room_num)
x[, 2:3] <- scale(x[, 2:3])
y <- as.matrix(y)

# learning rate experiment
library(ggplot2)
iter_num <- 50
alpha <- c(0.01, 0.03, 0.1, 0.3, 1, 1.3)
plot_data <- NULL
plot_color <- c('black', 'red', 'green', 'blue', 'orange', 'purple')
for(j in 1:length(alpha)){ 
  theta <- c(rep(0, ncol(x)))  
  Jtheta <- c(rep(0, iter_num))  
  for(i in 1:iter_num){    
    Jtheta[i] <- (1/(2*nrow(x))) * t(x %*% theta - y) %*% (x %*% theta - y)
    grad <- 1/nrow(x) * t(x) %*% (x %*% theta -y)   
    theta <- theta -alpha[j] * grad   
  }

  if(is.null(plot_data)){
    plot_data <- data.frame(iter=c(1:iter_num), cost=Jtheta, rate=alpha[j])
  }
  else {
    plot_data <- rbind(plot_data,data.frame(iter=c(1:iter_num), cost=Jtheta, rate=alpha[j])
  }
}
p <- ggplot(plot_data)
p <- p + geom_line(aes(x=iter, y=cost, colour=as.factor(rate), group=rate))
p + theme_bw() + scale_colour_hue(name="learning rate")

# We find best convergency occurs when learning rate is 1
alpha <- 1.0
iter_num <- 100
theta <- c(rep(0, ncol(x)))  
for(i in 1:iter_num){    
  grad <- 1/nrow(x) * t(x) %*% (x %*% theta -y)   
  theta <- theta -alpha * grad   
}

# prediction for price of an apartment with area=1650 and room_num=3
x <- read.table("ex3x.dat", header=F)
y <- read.table("ex3y.dat", header=F)
colnames(x) <- c("area", "room_num")
colnames(y) <- c("price")
price_pred <- t(theta) %*% c(1, (1650-mean(x$area))/sd(x$area), (3-mean(x$room_num))/sd(x$room_num))
print(price_pred)

其中學習率選擇的曲線以下,能夠看到設定爲1時收斂最快。學習

clipboard.png

練習4-邏輯迴歸

題目和數據請點這裏ui

# read raw data
x <- read.table("ex4Data/ex4x.dat", header=F)
y <- read.table("ex4Data/ex4y.dat", header=F)
colnames(x) <- c("t1_mark", "t2_mark")
colnames(y) <- c("admit")

# plot the data
input <- data.frame(t1_mark=x$t1_mark, t2_mark=x$t2_mark, admit=y$admit)
library(ggplot2)
p <- ggplot(data=input, aes(x=t1_mark, y= t2_mark, colour=as.factor(admit)))

p + geom_point() + theme_bw() + scale_colour_hue(name="Admit result",breaks=c(0,1),labels=c("Not admitted","Admitted")) + xlab("Test 1 score") + ylab("Test 2 score")

# Newton-Raphson's method implementation
sigmoid_value <- function(z){
  return (1/(1+exp(-z)))
}
x <- cbind(rep(1,nrow(x)), x$t1_mark, x$t2_mark)
y <- as.matrix(y) 
iter_num <- 7
theta <- rep(0, ncol(x))
Jtheta <- rep(0, iter_num)
for(i in 1:iter_num){
  z <- x %*% theta
  h <- sigmoid_value(z)
  # get gradient, Hession and cost function
  grad <- 1/nrow(x) * t(x)%*%(h-y)
  H <- (1/nrow(x)) * t(x) %*% diag(as.vector(h)) %*% diag(1-as.vector(h)) %*% x
  Jtheta[i] <- 1/nrow(x) * sum(-y*log(h)-(1-y)*log(1-h)) 
  theta <- theta - solve(H)%*%grad
}
curve <- data.frame(iter=c(1:iter_num), cost=Jtheta)
ggplot(data=curve,aes(x=iter,y=cost)) + geom_line() + geom_point() + theme_bw()
theta

# Predict the not admit probability of a student with t1_score=20 and t2_score=80
prob <- 1 - sigmoid_value(c(1,20,80)%*%theta)

# Plot the decision boundary line
plot_x <- c(min(x[,2])-2, max(x[,2])+2)
plot_y <- (-1/theta[3])*(theta[2]*plot_x+theta[1])
p + geom_point() + theme_bw() + geom_segment(x=plot_x[1],xend=plot_x[2],y=plot_y[1],yend=plot_y[2], colour="black") + scale_colour_hue(name="Admit result",breaks=c(0,1),labels=c("Not admitted","Admitted")) + xlab("Test 1 score") + ylab("Test 2 score")

# 此處嘗試用梯度降低法求解邏輯迴歸,發現好久不收斂,不知道是否是寫錯了
# Gradient dscend algorithm implementation
iter_num <- 10000
#alpha <- c(0.01, 0.03, 0.1, 0.3, 1, 1.3)
alpha <- c(0.00001,0.00003,0.0001,0.0003,0.001,0.0013,0.0014)
plot_data <- NULL
for(j in 1:length(alpha)){
  Jtheta <- rep(0, iter_num)
  theta <- rep(0, ncol(x))
  for(i in 1:iter_num){
    z <- x%*%theta
    h <- sigmoid_value(z)
    Jtheta[i] <- 1/nrow(x) * sum(-y*log(h)-(1-y)*log(1-h)) 
    grad <- 1/nrow(x) * t(x)%*%(h-y)
    theta <- theta - alpha[j] * grad
  }

  if(is.null(plot_data)){
    plot_data <- data.frame(iter=c(1:iter_num), cost=Jtheta, rate=alpha[j])
  }
  else {
    plot_data <- rbind(plot_data,data.frame(iter=c(1:iter_num), cost=Jtheta, rate=alpha[j]))
  }  
}
p <- ggplot(plot_data)
p <- p + geom_line(aes(x=iter, y=cost, colour=as.factor(rate), group=rate))
p + theme_bw() + scale_colour_hue(name="learning rate")

下圖顯示了牛頓法的收斂過程,4步就cost收斂到較低水平了。spa

clipboard.png

下圖是使用邏輯迴歸分類的效果和決策邊界:ssr

clipboard.png

下圖是梯度降低的收斂過程,久久不收斂,估計是代碼哪裏寫錯了,暫時沒搞明白。設計

clipboard.png

練習5-正則化

題目和數據請點這裏code

該題目分爲線性迴歸與邏輯迴歸兩個部分。

線性迴歸

# read raw data
x <- read.table("ex5Linx.dat",header=F)
y <- read.table("ex5Liny.dat",header=F)

# plot the data
library(ggplot2)
input <- cbind(x,y)
colnames(input) <- c("x","y")
p <- ggplot() + theme_bw()
p <- p + geom_point(data=input,aes(x=x,y=y),colour=rgb(213/255,26/255,33/255)) 
p

#normal equation method
f1 <- function(x, theta){
  return (as.vector((matrix(c(rep(1,length(x)),x,x^2,x^3,x^4,x^5),nrow=length(x), ncol=6, byrow=FALSE) %*% theta)))
}

lamda <- c(0,1,10)
x <- as.matrix(cbind(rep(1,nrow(x)),x,x^2,x^3,x^4,x^5))
y <- as.matrix(y)
rm <- diag(c(0, rep(1,ncol(x)-1)))

curve <- NULL
for(i in 1:length(lamda)){
  theta <- solve((t(x)%*%x+lamda[i]*rm))%*%t(x)%*%y
  xx <- seq(-1,1,0.001)
  yy <- f1(xx, theta)
  if(is.null(curve)){
    curve <- data.frame(x=xx,y=yy,lamda=rep(lamda[i],length(xx)))
  }
  else {
    curve <- rbind(curve, data.frame(x=xx,y=yy,lamda=rep(lamda[i],length(xx))))
  }
  print(theta)
}

p <- p + geom_line(data=curve, aes(x=x,y=y,group=lamda,colour=as.factor(lamda)))

p

下圖是擬合效果。能夠看到當正則項的懲罰係數越大時,擬合效果越差;當懲罰係數太小時,又存在過擬合問題。

clipboard.png

邏輯迴歸

x <- read.table("ex5Logx.dat",sep=",", header=F)
y <- read.table("ex5Logy.dat", header=F)
colnames(x) <- c("u","v")
colnames(y) <- c("type")

#plot the data
input <- data.frame(u=x$u,v=x$v,type=as.factor(y$type))
p <- ggplot() + theme_bw()
p <- p + geom_point(data=input,aes(x=u,y=v,shape=type, size=3))
#p <- p + scale_colour_manual(values=c('blue','red')) + labs(colour='Type')
p <- p + scale_shape_manual(values=c('o','+')) + labs(shape='Type')
p <- p + scale_size(guide="none")
p

#utility function
map_feature <- function(feat1, feat2){
  degree <- 6
  out = rep(1,length(feat1))
  for(i in 1:degree){
    for(j in 0:i){
      out <- cbind(out,(feat1^(i-j)*feat2^j))
    }
  }
  return (out)
}

f1 <- function(x, theta){
  return (as.vector(map_feature(x$u,x$v) %*% theta))
}

#sigmoid function
sigmoid_value <- function(z){
  return (1/(1+exp(-z)))
}

#Newton-Raphson's Method

lambda <- c(0,1,10)
out <- map_feature(x$u,x$v)
y <- as.matrix(y)
curve <- NULL
con_store <- NULL
u <- seq(-1,1.5,0.05)
v <- seq(-1,1.5,0.05)
for(i in 1:length(lambda)){
  iter_num <- 15
  theta <- rep(0, ncol(out))
  Jtheta <- rep(0, iter_num)
  con <- matrix(0, nrow=length(u)*length(v), ncol=4)
  for(j in 1:iter_num){
    h <- sigmoid_value(out%*%theta)
    #cost function, judge if converged
    Jtheta[j] <- 1/nrow(out)*sum(-y*log(h)-(1-y)*log(1-h)) + lambda[i]/(2*nrow(out))*sum(theta[-1]^2)
    #gradient
    grad <- 1/nrow(out)*t(out)%*%(h-y) + lambda[i]/nrow(out)*c(0, theta[-1])
    #hession
    H <- 1/nrow(out)*t(out)%*%diag(as.vector(h))%*%diag(1-as.vector(h))%*%out + lambda[i]/nrow(out)*diag(c(0, rep(1,ncol(out)-1)))
    theta <- theta - solve(H)%*%grad
  }
  if(is.null(curve)){
    curve <- data.frame(iter=c(1:iter_num), cost=Jtheta, lambda=lambda[i])
  }
  else {
    curve <- rbind(curve,data.frame(iter=c(1:iter_num), cost=Jtheta, lambda=lambda[i]))
  }

  for(k in 1:length(u)){
    for(l in 1:length(v)){
      z <- map_feature(u[k],v[l])%*%theta
      con[(k-1)*length(v)+l, 1] = u[k]
      con[(k-1)*length(v)+l, 2] = v[l]
      con[(k-1)*length(v)+l, 3] = z
      con[(k-1)*length(v)+l, 4] = lambda[i]
    }
  }
  if(is.null(con_store)){
    con_store <- as.data.frame(con)
  }
  else {
    con_store <- rbind(con_store,con)
  }
}

#plot convergency draft
ggplot(data=curve) + theme_bw() + geom_point(aes(x=iter, y=cost)) + geom_line(aes(x=iter,y=cost, group=lambda, colour=as.factor(lambda))) + scale_color_hue(name="lambda")
colnames(con_store) <- c("x","y","z","lambda")
p <- p + stat_contour(data=con_store,aes(x=x,y=y,z=z,group=as.factor(lambda), colour=as.factor(lambda), size=2),breaks=c(0)) + scale_color_hue(name="Lambda")
p

下圖是不一樣lambda懲罰係數的cost收斂效果。

clipboard.png

下圖是邏輯迴歸在不一樣懲罰係數下的分類效果,lambda=0時存在過擬合。

clipboard.png

相關文章
相關標籤/搜索