# R 語言實戰 - Part 5-1 筆記
#
# 時間：2019-11-12
#
# 標籤：語言實戰筆記（简体版）
#
# 原文：原文鏈接
#
# R 語言實戰（第二版）

## part 5-1 技能拓展

# ---------- 第19章 使用ggplot2進行高級繪圖 -------------------------

# R has four graphics systems:
# 1. base: the base graphics system
# 2. grid: the grid package; flexible, but offers no complete plotting functions, so it mainly suits developers
# 3. lattice: the lattice package; suited to trellis graphs, i.e. multi-variable / per-level relationships. Built on grid
# 4. ggplot2: the ggplot2 package; a powerful data-visualisation tool
# The first three ship with the base installation; the last three must be loaded explicitly before use

# 1. Introducing ggplot2 with an example
library(ggplot2)
ggplot(data=mtcars,aes(x=wt,y=mpg))+ # aes() (aesthetics) states which role each variable plays
  geom_point()+
  labs(title = "automobile data",x="weight",y="miles per gallon")

# Extra options
ggplot(data=mtcars,aes(x=wt,y=mpg))+
  geom_point(pch=17,color="red",size=2)+
  geom_smooth(method = "lm",color="red",linetype=2)+ # the shaded band around the smooth line is a 95% confidence interval by default
  labs(title = "automobile data",x="weight",y="miles per gallon")

# Grouping / faceting
# am, vs and cyl are overwritten with factor versions in the session's copy of mtcars;
# the 0/1 codes of am and vs get readable labels.
mtcars$am <- factor(mtcars$am,levels = c(0,1),labels = c("automatic","manual"))
mtcars$vs <- factor(mtcars$vs,levels = c(0,1),labels = c("v-engine","straight engine"))
mtcars$cyl <- factor(mtcars$cyl)
ggplot(mtcars,aes(hp,mpg,shape=cyl,color=cyl))+
  geom_point(size=3)+
  facet_grid(am~vs)
# am and vs are the faceting variables; cyl is the grouping variable

# 2. Geom functions determine the type of plot
# 37 of them in total (count as given in the original notes)
  # geom_bar() options: color,fill,alpha
  # geom_boxplot() color,fill,alpha,notch,width
  # geom_density() color,fill,alpha,linetype
  # geom_histogram() color,fill,alpha,linetype,binwidth
  # geom_hline()  color,alpha,linetype,size
  # geom_jitter() # jittered points: color,alpha,shape
  # geom_line() color,alpha,linetype,size
  # geom_point() color,alpha,shape,size
  # geom_rug() # rug plot: color,sides
  # geom_smooth() method,formula,color,fill,linetype,size
  # geom_text() # text annotations; takes many options
  # geom_violin() color,fill,alpha,linetype
  # geom_vline() color,alpha,linetype,size

# Load the singer data set from lattice, then draw a histogram of height and
# a box plot / violin plot of height per voice part.
data(singer,package = "lattice")
ggplot(singer,aes(height))+geom_histogram()
ggplot(singer,aes(voice.part,height))+geom_boxplot()
ggplot(singer,aes(voice.part,height))+geom_violin()

# Notched box plot of salary by rank, overlaid with jittered points and a rug
# on the left axis.
# NOTE(review): Salaries is presumably provided through car's carData
# dependency - confirm it is attached by library(car).
library(car)
ggplot(Salaries, aes(rank, salary)) +
  # TRUE is spelled out: T is an ordinary variable that can be reassigned
  geom_boxplot(fill = "cornflowerblue", color = "black", notch = TRUE) +
  geom_point(position = "jitter", color = "blue", alpha = .5) +
  # sides places the rug: "b" bottom, "l" left, "t" top, "r" right, "bl" bottom and left
  geom_rug(sides = "l", color = "black")
# Open question from the notes: the notches do not overlap, so do the groups
# differ significantly?

# The power of combining geoms: a narrow box plot drawn on top of a violin plot
ggplot(singer, aes(voice.part, height)) +
  geom_violin(fill = "lightblue") +
  geom_boxplot(fill = "lightgreen", width = .2)
  
# 3. Grouping
# Groups are the levels of a categorical variable (factor). They are shown through visual
# properties such as shape, colour, fill, size or line type; aes() assigns the variable to the property.
head(Salaries)
ggplot(Salaries,aes(salary,fill=rank))+geom_density(alpha=.3)
ggplot(Salaries,aes(yrs.since.phd,salary,color=rank,shape=sex))+geom_point()

ggplot(Salaries,aes(rank,fill=sex))+geom_bar(position = "stack")+ # stacked bars
  labs(title = "postion=stack")
ggplot(Salaries,aes(rank,fill=sex))+geom_bar(position = "dodge") # side-by-side bars
ggplot(Salaries,aes(rank,fill=sex))+geom_bar(position = "fill")+# bars scaled to proportions
  labs(y="proportion")

# Difference between setting an option inside and outside aes()
ggplot(Salaries,aes(rank,fill=sex))+geom_bar()
ggplot(Salaries,aes(rank))+geom_bar(aes(fill=sex)) # same as the line above
ggplot(Salaries,aes(rank))+geom_bar(fill="blue")
ggplot(Salaries,aes(rank,fill="blue"))+geom_bar() # here "blue" is treated as a variable, not as a colour
# Rule of thumb: variables go inside aes(), constants go outside aes()

# 4. Faceting
# i.e. trellis (small-multiple) graphs
# NOTE: the five calls below are syntax templates. var, n, rowvar and colvar are
# placeholders that are not defined anywhere in this file.
facet_wrap(~var,ncol=n) # one panel per level of var, arranged in n columns
facet_wrap(~var,nrow = n)# one panel per level of var, arranged in n rows

facet_grid(rowvar~colvar) # one panel per combination of rowvar and colvar levels
facet_grid(rowvar~.) # one panel per rowvar level, in a single column
facet_grid(.~colvar) # one panel per colvar level, in a single row

# Histogram of height, one panel per voice part, laid out in 4 rows
ggplot(singer,aes(height))+
  geom_histogram()+
  facet_wrap(~voice.part,nrow=4)

## Combining faceting with grouping
# Panels per sex; colour and shape distinguish rank inside each panel
ggplot(Salaries,aes(yrs.since.phd,salary,color=rank,shape=rank))+
  geom_point()+facet_grid(.~sex)

# One density panel per voice part, each filled by voice part
ggplot(singer,aes(height,fill=voice.part))+
  geom_density()+facet_grid(voice.part~.)

# 5. Adding smoothed lines
# Smoothers can be linear, non-linear or non-parametric (loess), among others.
# Options:
#   method    - e.g. lm / glm / rlm / gam / loess
#               NOTE(review): the original notes call the default "smooth"; check the
#               geom_smooth() docs of the installed ggplot2 for the actual default.
#   formula   - y ~ x (default), y ~ log(x), y ~ poly(x, n), ...
#   se        - draw the confidence band (TRUE/FALSE); default TRUE
#   level     - confidence level of the band; default 95%
#   fullrange - fit spans the whole plot (TRUE) or only the data range (FALSE); default FALSE

ggplot(Salaries, aes(yrs.since.phd, salary)) + geom_smooth() + geom_point()

# Fit a quadratic polynomial regression separately for each sex
ggplot(Salaries, aes(yrs.since.phd, salary, linetype = sex, shape = sex, color = sex)) +
  # FALSE is spelled out: F is an ordinary variable that can be reassigned
  geom_smooth(method = lm, formula = y ~ poly(x, 2), se = FALSE, size = 1) +
  geom_point(size = 2)

# geom_smooth() relies on stat_smooth() to compute the quantities needed to draw the
# fitted curve and its confidence limits; see the stat_smooth() help page for details.

# 6. Changing the appearance of ggplot2 graphs
# Base graphics parameters such as par() have no effect on ggplot2; it has its own functions for this
## Axes:
# NOTE: the calls below are syntax templates listing the available functions and options;
# in scale_x_discrete(breaks,labels,limits) the three names are placeholders.
scale_x_continuous(breaks = c(1,10,2),labels = c("a","b","c"),limits = c(1,12)) # continuous variable
scale_y_continuous()
scale_x_discrete(breaks,labels,limits) # factor levels
scale_y_discrete()
coord_flip()

# Box plot of salary by rank, split by sex, with custom tick labels on both axes.
# The discrete breaks must equal the factor levels of Salaries$rank exactly, and the
# match is case-sensitive. Lower-case breaks ("asstprof", ...) match no level, which
# is why the x-axis tick labels disappeared in the earlier version of these notes.
p <- ggplot(Salaries, aes(rank, salary, fill = sex)) +
  geom_boxplot() +
  scale_x_discrete(breaks = c("AsstProf", "AssocProf", "Prof"),
                   labels = c("assistant\nprofessor",
                              "associate\nprofessor",
                              "full\nprofessor")) +
  scale_y_continuous(breaks = c(50000, 100000, 150000, 200000),
                     labels = c("$50k", "$100k", "$150k", "$200k"))
p

# 7. Legends
# The title and the position are what usually gets customised. The legend here comes from
# the fill aesthetic, so its title is changed with fill= inside labs().
p+labs(title = "faculty salary by rank and sex",x="",y="",fill="Gender")+
  theme(legend.position = c(0.2,0.8)) # fractional distance from the left and bottom edges; alternatives: "left"/"right" (default)/"top"/"bottom"/"none"
# "none" removes the legend
# NOTE(review): newer ggplot2 releases may deprecate a numeric legend.position - confirm against the installed version

# 8. Scales
# Scales map data values onto the visual space
ggplot(mtcars,aes(wt,mpg,size=disp))+ # disp (engine displacement) is continuous and drives point size
  geom_point(shape=21,color="black",fill="cornsilk")

ggplot(Salaries,aes(yrs.since.phd,salary,color=rank))+ # rank is discrete; each level gets one of the colours set below
  scale_color_manual(values = c("orange","olivedrab","navy"))+
  geom_point(size=2)

ggplot(Salaries,aes(yrs.since.phd,salary,color=rank))+ 
  scale_color_brewer(palette = "Set1")+ # pick a named colour set; scale_fill_brewer works the same way for fill
  geom_point(size=2)

# Browse the available colour sets
library(RColorBrewer)
display.brewer.all()

# 9. Themes
# theme() adjusts fonts, backgrounds, colours, grid lines and similar styling.
library(ggplot2)
mytheme <- theme(
  # plot title
  plot.title = element_text(face = "bold.italic", size = 14, color = "brown"),
  # axis titles
  axis.title = element_text(face = "bold.italic", size = 10, color = "brown"),
  # axis tick labels
  axis.text = element_text(face = "bold", size = 9, color = "darkblue"),
  # plotting area
  panel.background = element_rect(fill = "white", color = "darkblue"),
  # major horizontal grid lines
  panel.grid.major.y = element_line(color = "grey", linetype = 1),
  # minor horizontal grid lines
  panel.grid.minor.y = element_line(color = "grey", linetype = 2),
  # no minor vertical grid lines
  panel.grid.minor.x = element_blank(),
  legend.position = "top"
)

# Apply the custom theme to a box plot of salary by rank and sex
ggplot(Salaries, aes(rank, salary, fill = sex)) +
  geom_boxplot() +
  labs(title = "salary by rank and sex", x = "rank", y = "salary") +
  mytheme

# 10. Multiple graphs per page
# The base-graphics mfrow and layout() mechanisms do not apply to ggplot2; use grid.arrange() instead
p1 <- ggplot(Salaries,aes(rank))+geom_bar()
p2 <- ggplot(Salaries,aes(sex))+geom_bar()
p3 <- ggplot(Salaries,aes(yrs.since.phd,salary))+geom_point()
library(gridExtra)
grid.arrange(p1,p2,p3,ncol=3) # three plots in 3 columns; per the notes, the layout is filled by row by default

# 11. Saving graphs
myplot <- ggplot(mtcars, aes(mpg)) + geom_histogram()
# The argument is spelled filename= in full instead of relying on partial matching of file=.
# NOTE(review): the notes treat width/height as inches (1 inch = 2.54 cm) - confirm ggsave()'s default unit.
ggsave(filename = "mygraph.png", plot = myplot, width = 5, height = 4)
ggsave("mygraph.png") # without plot=, the most recently created plot is saved

# ------------------------ 第20章 高級編程 -----------------------------

# Object-oriented programming: an object is anything that can be stored and named (data, functions, and so on)
# Every object has attributes; a key one is its class, which R functions use to decide how to handle the object
# attributes() lists the attributes, attr() sets one, class() reads and sets the class

# 1. Data structures
# 1) Atomic vectors: vectors, matrices, arrays
x <- c(1,2,3,4,5,6,7,8)
class(x)
print(x)

attr(x,"dim") <- c(2,4) # give x a dim attribute
print(x)
class(x)
attributes(x)

attr(x,"dimnames") <- list(c("A1","A2"),
                           c("B1","B2","B3","B4")) # add a dimnames attribute as well
print(x)

attr(x,"dim") <- NULL # remove the dim attribute
class(x)
print(x)

# 2) Generic vectors (i.e. lists): data frames, lists
# Lists matter a lot: R functions commonly return a list as their value
# A data frame is a special list in which every atomic vector has the same length
head(iris)
unclass(iris) # this data frame is a list of 5 atomic vectors
attributes(iris) # 3 attributes

# Fit a 3-cluster k-means model on the first four iris columns and inspect the result
set.seed(1234)
fit <- kmeans(iris[1:4],3)
str(fit) # show the object's structure
unclass(fit) # show the object's contents
length(fit)
names(fit)
attributes(fit)
sapply(fit,class)

# Elements of any object can be extracted by indexing
x <- c(20,30,40)
x[3]
x[c(2,3)]
x <- c(A=20,B=30,C=40)
x[c(2,3)]
x[c("B","C")]

fit[c(2,7)]
fit[2] # returns a list
fit[[2]] # returns a matrix
fit$centers # $ extraction by name works too (the notes add that data frames are lists as well)

# Example: plot the cluster centres from a k-means analysis
set.seed(1234)
fit <- kmeans(iris[1:4],3)
means <- fit$centers
library(reshape2)
dfm <- melt(means) # reshape the centres to long format
names(dfm) <- c("Cluster","Measurement","Centimeters")
dfm$Cluster <- factor(dfm$Cluster)
head(dfm)
library(ggplot2)
# One line per cluster across the measurements
ggplot(data = dfm,aes(x=Measurement,y=Centimeters,group=Cluster))+
  geom_point(size=3,aes(shape=Cluster,color=Cluster))+
  geom_line(size=1,aes(color=Cluster))+
  ggtitle("Profiles for Iris Clusters")

# 2. Control structures
# for / if() else / ifelse / while / repeat / switch

for(i in 1:5){print(1:i)}
for (i in 5:1) print(i:1)

# Draw on screen when running interactively, otherwise write the plot to a PNG file.
# NOTE(review): y is not assigned earlier in this file, so x and y look like placeholders
# for the data to plot - treat this as a template.
if(interactive()){ # is the code being run interactively?
  plot(x,y)
}else{
  png("myplot.png")
  plot(x,y)
  dev.off()
}

# Vectorised form: ifelse() labels every p-value in one call
pvalues <- c(0.234, 0.12, 0.002, 0.34, 0.004)
ifelse(pvalues < 0.05, "significant", "not significant")

# Explicit loop (slower): builds the same labels one element at a time
results <- character(length(pvalues)) # preallocate the result vector
# seq_along() yields no iterations for an empty vector, whereas 1:length() would yield c(1, 0)
for (i in seq_along(pvalues)) {
  if (pvalues[i] < 0.05) {
    results[i] <- "significant"
  } else {
    results[i] <- "not significant"
  }
}
results

# 3. Writing functions
# f() computes x + 2*y + 3*z; z is optional and defaults to 1.
f <- function(x, y, z = 1) {
  x + 2 * y + 3 * z
}

# Calls showing positional, defaulted, named and mixed argument matching
f(2, 3, 4)
f(2, 3)
f(x = 2, y = 3)
f(z = 4, y = 2, 3)

args(f)    # shows the parameters and their defaults (mainly for interactive use)
formals(f) # returns the parameters and defaults as a list (mainly for programmatic use)

# Arguments are passed by value, not by reference
result <- lm(height ~ weight, data = women) # per the notes, the function works on a copy of women

# Object scope: global vs local
x <- 2
y <- 3
z <- 4
f <- function(w) {
  z <- 2 # local z; the global z is untouched
  product <- w * y * z # y is not local, so the global y is used
  product
}
f(x) # a copy of x goes into f; the result is returned and x itself is unchanged
x
y
z


# 4. Environments
# An environment consists of a frame and an enclosure
# Frame: a set of name-value bindings; enclosure: a pointer to the enclosing (parent) environment

x <- 5 # created in the current environment (the global environment when run as a script)
myenv <- new.env() # create a new environment
# envir= is spelled out in full instead of relying on partial matching of env=
assign("x", "Homer", envir = myenv) # bind x to "Homer" inside the new environment
ls()
ls(myenv)
x
get("x", envir = myenv) # read the value of x from that environment

# The $ operator works as well
myenv <- new.env()
myenv$x <- "Homer"
myenv$x

# Show the parent environment
parent.env(myenv)
# the parent of myenv is the environment it was created in (the global environment here)

# Functions are objects, so they have environments too. The objects that exist when a
# function is created stay available in its environment. This matters for closures
# (functions packaged together with the state they were created in).

# trim(p) returns a function that drops the lowest and highest fraction p of the
# values in a vector and returns what is left.
#   p: fraction to trim from each end (e.g. 0.1 for 10%)
trim <- function(p) {
  trimit <- function(x) {
    n <- length(x)
    lo <- floor(n * p) + 1
    hi <- n + 1 - lo
    # A partial sort is enough: only positions lo and hi must be in place.
    # Returned directly (no dead assignment), so the result is visible.
    sort.int(x, partial = unique(c(lo, hi)))[lo:hi]
  }
  trimit
}
x <- 1:10
trim10p <- trim(0.1) # p lives in the environment of the returned trimit function
y <- trim10p(x)
y
trim20p <- trim(0.2)
y <- trim20p(x)
y

ls(environment(trim10p))
# envir= is spelled out in full instead of relying on partial matching of env=
get("p", envir = environment(trim10p))

# Another example of a function defined inside a function.
# makeFunction(k) builds a function that prints x + k. As in the original, the
# closure is handed back invisibly.
makeFunction <- function(k) {
  add_k <- function(x) {
    print(x + k)
  }
  invisible(add_k)
}
g <- makeFunction(10)
g(4)
k <- 2
g(5) # g() keeps using k = 10 whatever k is in the global environment: k was bound when g was created


# 5. Object-oriented programming
# R has two separate OOP models: S3 (older, simpler, less structured) and S4 (newer and more complex)
# object -> class attribute -> generic function -> method is executed

# example:
summary(women)
fit <- lm(weight~height,data=women)
summary(fit)

summary # print the generic's source. UseMethod() hands the object to a method of the generic, provided one matches the object's class
class(women) # class is data.frame: dispatch looks for summary.data.frame(women), otherwise runs summary.default(women)
class(fit) # class of fit: dispatch looks for summary.lm(fit), otherwise runs summary.default(fit)

methods(summary) # list the available S3 methods of the generic

# View a method's source (works for visible functions, i.e. names without a trailing asterisk)
summary.data.frame
summary.lm

getAnywhere(summary.ggplot)# for non-visible functions (listed with a trailing asterisk); drop the asterisk when looking them up
getAnywhere(summary.ecdf)

# Common classes: numeric/matrix/data.frame/array/lm/glm/table.....
# Common generics: print/plot/summary......
# In fact a class can be any string, and a generic can be any function

# An example of a custom generic function:
# Define the generic and its methods.
# Every method accepts `...` because the generic does; without it, any extra
# argument passed through the generic fails with "unused argument".
mymethod <- function(x, ...) UseMethod("mymethod")
mymethod.a <- function(x, ...) print("using A")
mymethod.b <- function(x, ...) print("using B")
mymethod.default <- function(x, ...) print("using Default")

# Assign classes to objects
x <- 1:5
y <- 6:10
z <- 10:15
class(x) <- "a"
class(y) <- "b"

# Apply the generic to the objects
mymethod(x)
mymethod(y)
mymethod(z)

# Apply the generic to an object with two classes
class(z) <- c("a", "b")
mymethod(z) # with several classes, the first class decides which method is called

# There is no method for class "c"
class(z) <- c("c", "a", "b")
mymethod(z) # dispatch moves on to the first class that does have a method


## Limitations of the S3 model
# Any class can be assigned to any object; there is no integrity check. For example:
class(women) <- "lm" # relabels the women data frame as an "lm" object
summary(women) # dispatches to summary.lm on data that is not a fitted model (shown as a failure case)

# The S4 model is more formal and strict; it is meant to overcome the problems caused by S3's loose structure


# 6. Writing efficient code
# R's weak spots: large data sets and highly repetitive tasks run slowly
# Tips: (1) read only the data you need, (2) vectorise loops where possible,
# (3) create objects at the right size instead of resizing them repeatedly, (4) run in parallel

# Efficient data input: colClasses declares the type of every column.
# A column is skipped with the string "NULL". A bare NULL is dropped by c(), which
# shortens the vector so the types no longer line up with the columns.
# `data` is a placeholder for the file or connection to read (7 columns are assumed).
read.table(data, header = TRUE, sep = ",",
           colClasses = c("numeric", "numeric", "character", "NULL",
                          "numeric", "NULL", "NULL"))

# Vectorisation: use R's built-in functions
set.seed(1234)
mymatrix <- matrix(rnorm(10000000),ncol = 10) # 10,000,000 random normal values arranged in 10 columns
# Column sums computed with explicit loops. This is deliberately the slow way,
# kept for timing against colSums().
#   x: a numeric matrix
# Returns a numeric vector holding the sum of each column of x.
accum <- function(x) {
  sums <- numeric(ncol(x))
  for (i in seq_len(ncol(x))) {
    # Walk over every row. The earlier `for (j in nrow(x))` visited only the last row.
    for (j in seq_len(nrow(x))) {
      sums[i] <- sums[i] + x[j, i]
    }
  }
  # Return the totals explicitly; a function ending in a for loop returns NULL
  sums
}
# Time the loop-based accum() against the built-in colSums() on the same matrix
system.time(accum(mymatrix))
system.time(colSums(mymatrix))

# Right-sized objects:
x <- rnorm(1000000)
y <- 0 # start from a single value; the loop below then grows y one element at a time
system.time(for(i in 1:length(x)) y[i] <- x[i]^2)

y <- numeric(length = 1000000) # preallocate 1,000,000 values
head(y)
system.time(for(i in 1:1000000) y[i] <- x[i]^2)

system.time(y <- x^2) # fully vectorised operation

# Parallelisation: the foreach and doParallel packages
# Suits tasks that run a numerically intensive function repeatedly and independently (e.g. Monte Carlo methods, bootstrapping)
# Finding out which functions take the time:
Rprof() # start profiling; the code to be profiled would go between this call and the next
Rprof(NULL) # stop profiling
summaryRprof() # summarise the profiling results

# 7. Debugging
args(mad) # show the arguments
debug(mad) # flag the function for debugging
mad(1:10) # calling the flagged function enters the debugger; ls() lists the objects in scope
# or type n to step through the code one line at a time
# type c to leave single-stepping and run the rest of the current function

# A debugging session
# Call chain: f() adds its arguments and calls g(); g() rounds and calls h();
# h() draws that many random normal values with a fixed seed and prints them.
f <- function(x,y){
  z <- x+y
  g(z)
}
g <- function(x){
  z <- round(x)
  h(z)
}
h <- function(x){
  set.seed(1234)
  z <- rnorm(x)
  print(z)
}
options(error = recover) # when an error occurs, print the call stack
f(2,3)
f(2,-3) # x + y is -1 here, so h() asks rnorm() for -1 values; this call is the error case to debug
# press c to return to the frame list, 0 to exit to the R prompt