#加載R包
> library(psych)
> library(reshape2)
> library(ggplot2)
> library(factoextra)
##提前把名爲ehbio_salmon.DESeq2.normalized.symbol.txt的文件放在工作目錄,這是一個以製表符分隔的.txt文件
> exprData <- "ehbio_salmon.DESeq2.normalized.symbol.txt" ##這裏只是定義exprData和sampleFile兩個變量,它們是字符串類型的文件名(一開始我還以爲這樣就把文件內容賦值給變量
> sampleFile <- "sampleFile" ##了)
> data <- read.table(exprData,header = T,row.names = NULL,sep ="\t") ###read.table()讀入文件。
> class(data$id) ##判斷某一列的數據類型
[1] "character"
> data <- read.table(exprData,header = T,row.names = "id",sep ="\t") ##一開始不明白row.names=NULL是怎麼回事,而且我試了不加這個參數時data是同樣的。
Error in read.table(exprData, header = T, row.names = "id", sep = "\t") : ##看文檔知道row.names是指定哪一列爲行名的,
'row.names'裏不能有重複的名字 ##看了錯誤提示有些懵懂,之所以不能以id列爲行名,是因爲這列id中有重複的值。我下面驗證了一下
> rownames(data) <- data$id
Error in `.rowNamesDF<-`(x, value = value) : 不容許有重複的'row.names'
此外: Warning message:
non-unique values when setting 'row.names': ‘ALG1L9P’, ‘ATXN7’, ‘BMS1P21’, ‘BMS1P4’, ‘CCDC39’, ‘CYB561D2’, ‘DIABLO’, ‘DNAJC9-AS1’, ‘DUXAP8’, ‘GOLGA8M’, ‘HSPA14’, ‘IPO5P1’, ‘ITFG2-AS1’, ‘LINC-PINT’, ‘LINC00484’, ‘LINC00941’, ‘LINC01238’, ‘LINC01297’, ‘LINC01422’, ‘LINC01481’, ‘MATR3’, ‘OR7E47P’, ‘PKD1P1’, ‘POLR2J3’, ‘POLR2J4’, ‘RAET1E-AS1’, ‘RF00012’, ‘RF00017’, ‘RF00019’, ‘RF02271’, ‘RGS5’, ‘RMRP’, ‘SCO2’, ‘SNHG28’, ‘SNX29P2’, ‘SPATA13’, ‘TBCE’, ‘TMSB15B’, ‘ZNF503’
# 處理重複名字時要謹慎:先找到名字重複的原因,再決定是否需要按以下方式全部保留
> rownames_data <- make.names(data[,1],unique=T) ##有效名稱由字母、數字、點(.)和下劃線組成,字符串中的空格或者-等「非法」字符都會被轉爲點(.)。
> length(rownames_data) ##上面將第一列導出到rownames_data,遇到重複名稱時,按順序加上.1 .2 ...等後綴。
[1] 27186
> data <- data[,-1,drop = F] ##[1,]訪問第一行(也可以用1:n訪問多行);[,1]訪問第一列(也可以用1:n訪問多列)。這裏是得到去掉第一列後的剩餘數據,drop=F表示即使只剩下一列,結果也保持數據框類型,不會被降維成向量。
> rownames(data) <- rownames_data ##將已經去除重複的第一列名稱設置爲data的行名。
****
> write.table(rownames_data,file = "rownames_data.csv",quote = F) ##不加quote = F時,字符串及行名、列名都會加上引號。默認會寫出行名1 2 3……,列名爲x
> write.table(rownames_data,file = "rownames_data_1.csv",quote = F,row.names = F) ##不要引號,不要行名,保留列名x
> write.table(rownames_data,file = "rownames_data_1.csv",quote = F,row.names = F,col.names = F) ##不要引號,不要行列名
****
> data <- data[rowSums(data)>0,] #去掉和爲零的行
> nrow(data)
[1] 27186
> data <- data[apply(data, 1,var)!= 0,] ##!=表示不等於;去掉方差爲0的行,這些行本身沒有意義,也妨礙後續運算;1表示對陣列的行進行操作
> ?rowSums ##計算一個陣列的行的和,返回的是一個向量,長度爲這個陣列的行數。
> x <- cbind(x1 = 3, x2 = c(4:1, 2:5))
> x
x1 x2
[1,] 3 4
[2,] 3 3
[3,] 3 2
[4,] 3 1
[5,] 3 2
[6,] 3 3
[7,] 3 4
[8,] 3 5
> rowSums(x)
[1] 7 6 5 4 5 6 7 8
> colSums(x)
x1 x2
24 24
> x1 <- x[rowSums(x)>5,]
> x1
x1 x2
[1,] 3 4
[2,] 3 3
[3,] 3 3
[4,] 3 4
[5,] 3 5
## apply(X, MARGIN, FUN, ...) 函數:Returns a vector or array or list of values obtained by applying a function to margins of an array or matrix.
## X: an array, including a matrix.
## MARGIN: a vector giving the subscripts which the function will be applied over. E.g., for a matrix 1 indicates rows, 2 indicates columns, c(1, 2) indicates rows and columns. Where X has named dimnames, it can be a character vector selecting dimension names.
****
> mads <- apply(data,1,mad) ##計算中位數絕對偏差(MAD, median absolute deviation),用來度量基因表達的變化幅度
> data <- data[rev(order(mads)),] ##order()返回按mads升序排列的行索引,rev()把它反轉成降序,再按這個索引順序重排data的行。
> a <- c(1,4,0,9,57,32,6)
> order(a)
[1] 3 1 2 7 4 6 5
##0在a中的索引值是3,故3排在order()結果的第一位,order()返回的是索引值,以此類推
> rev(order(a))
[1] 5 6 4 7 2 1 3
> b <- a[rev(order(a))]
> b <- a[rev(order(a)),]
Error in a[rev(order(a)), ] : 量度數目不對
> b <- a[rev(order(a))]
> b
[1] 57 32 9 6 4 1 0
> abc <- data.frame(a = c(1:3),b = c(4:6),c = c(7:9))
> abc
  a b c
1 1 4 7
2 2 5 8
3 3 6 9
> mads <- apply(abc,1,mad)
> mads
[1] 4.4478 4.4478 4.4478
> cbind(abc,d = c(1,1,1))
  a b c d
1 1 4 7 1
2 2 5 8 1
3 3 6 9 1
> abc
  a b c
1 1 4 7
2 2 5 8
3 3 6 9
> abc <- cbind(abc,d = c(1,1,1)) ##給abc加一列d
> abc
  a b c d
1 1 4 7 1
2 2 5 8 1
3 3 6 9 1
> mads <- apply(abc,1,mad)
> abc
  a b c d
1 1 4 7 1
2 2 5 8 1
3 3 6 9 1
> mads
[1] 2.2239 2.9652 3.7065
> abc <- abc[rev(order(mads)),] ##從結果看,按行的mad的降序重新排列了數據框
> abc
  a b c d
3 3 6 9 1
2 2 5 8 1
1 1 4 7 1
##data[x:y],訪問數據框時,如果不加逗號,訪問的是什麼呢?
##(這裏的data_t是t()轉置後得到的矩陣;對矩陣只給一個下標時,是按列優先的順序取元素,所以data_t[1:5]取到的是第一列FN1的前5個值。)
> data_t[1:5]
[1] 245667.7 427435.1 221687.5 371144.2 240187.2
> data_t[,1:5]
                   FN1      DCN     CEMIP    CCDC80    IGFBP5
untrt_N61311  245667.7 212953.1  40996.34 137229.15  77812.65
untrt_N052611 427435.1 360796.2 137783.10 232772.17 288609.20
untrt_N080611 221687.5 258977.3  53813.92  86258.13 210628.87
untrt_N061011 371144.2 408573.1  91066.80 212237.32 168067.42
trt_N61311    240187.2 210002.2  62301.12 136730.76  96021.74
trt_N052611   450103.2 316009.1 223111.85 226070.89 217439.21
trt_N080611   280226.2 225547.4 212724.84 124634.56 162677.38
trt_N061011   376518.2 393843.7 157919.47 236237.81 168387.36
> data_t <- t(data) ##t()函數對數據框進行轉置,行變列+列變行,對角線不變。
> abc_t <- t(abc)
> abc
  a b c d
3 3 6 9 1
2 2 5 8 1
1 1 4 7 1
> abc_t
  3 2 1
a 3 2 1
b 6 5 4
c 9 8 7
d 1 1 1
##文中說rows are samples and columns are variables,這句話不太懂。
> variableL <- ncol(data_t)
> variableL
[1] 27186
> if(sampleFile != ""){ ##這裏的if()是何意???
+ sample <- read.table(sampleFile,header = T,row.names = 1,sep ="\t")
+ data_t_m <- merge(data_t,sample,by = 0)
+ rownames(data_t_m) <- data_t_m$Row.names
+ data_t <- data_t_m[,-1]
+ }
> data_t[,1:5]
                   FN1      DCN     CEMIP    CCDC80    IGFBP5
trt_N052611   450103.2 316009.1 223111.85 226070.89 217439.21
trt_N061011   376518.2 393843.7 157919.47 236237.81 168387.36
trt_N080611   280226.2 225547.4 212724.84 124634.56 162677.38
trt_N61311    240187.2 210002.2  62301.12 136730.76  96021.74
untrt_N052611 427435.1 360796.2 137783.10 232772.17 288609.20
untrt_N061011 371144.2 408573.1  91066.80 212237.32 168067.42
untrt_N080611 221687.5 258977.3  53813.92  86258.13 210628.87
untrt_N61311  245667.7 212953.1  40996.34 137229.15  77812.65
## row.names = 1?
## merge()?
## rownames(data_t_m) <- data_t_m$Row.names?
row.names = 1?
> sample_2 <- read.table("new_1.txt",header = T,row.names = 2,sep ="\t")
> sample_2
                Samp
untrt1  untrt_N61311
untrt2 untrt_N052611
untrt3 untrt_N080611
untrt4 untrt_N061011
trt5      trt_N61311
trt6     trt_N052611
trt7     trt_N080611
trt8     trt_N061011
> sample_2 <- read.table("new_1.txt",header = T,row.names = NULL,sep ="\t")
> sample_2
           Samp conditions
1  untrt_N61311     untrt1
2 untrt_N052611     untrt2
3 untrt_N080611     untrt3
4 untrt_N061011     untrt4
5    trt_N61311       trt5
6   trt_N052611       trt6
7   trt_N080611       trt7
8   trt_N061011       trt8
##看來參數row.names是指定哪一列作爲行名。
##因此上面的row.names = 1表示的是以第一列爲行名,因此原數據框中第一行第一列的Samp就去掉了。
merge()?
> data_t[,1:5]
                   FN1      DCN     CEMIP    CCDC80    IGFBP5
untrt_N61311  245667.7 212953.1  40996.34 137229.15  77812.65
untrt_N052611 427435.1 360796.2 137783.10 232772.17 288609.20
untrt_N080611 221687.5 258977.3  53813.92  86258.13 210628.87
untrt_N061011 371144.2 408573.1  91066.80 212237.32 168067.42
trt_N61311    240187.2 210002.2  62301.12 136730.76  96021.74
trt_N052611   450103.2 316009.1 223111.85 226070.89 217439.21
trt_N080611   280226.2 225547.4 212724.84 124634.56 162677.38
trt_N061011   376518.2 393843.7 157919.47 236237.81 168387.36
> sample <- read.table(sampleFile,header = T,row.names = 1,sep ="\t")
> sample
              conditions
untrt_N61311       untrt
untrt_N052611      untrt
untrt_N080611      untrt
untrt_N061011      untrt
trt_N61311           trt
trt_N052611          trt
trt_N080611          trt
trt_N061011          trt
##這裏可以看出data_t與sample的行名是一致的。隱隱約約覺得有個問題:這倆只是行名一致,又不是某列,這也能合併啊??
##(可以:merge()的by = 0(或by = "row.names")表示按行名進行合併。)
> data_t_m <- merge(data_t,sample,by = 0) ##merge()中參數all默認爲FALSE,即合併時取交集--只返回匹配的行;all = TRUE時,按相同列取並集。
> data_t_m[,1:5] ##data_t_m的前5列 ###爲什麼相同的列合併之後有了行名,還叫Row.names??
##(按行名合併時,merge()會把用來匹配的行名放到結果最左邊、名爲Row.names的新列裏,結果本身的行名則變成1 2 3……。)
      Row.names      FN1      DCN     CEMIP    CCDC80
1   trt_N052611 450103.2 316009.1 223111.85 226070.89
2   trt_N061011 376518.2 393843.7 157919.47 236237.81
3   trt_N080611 280226.2 225547.4 212724.84 124634.56
4    trt_N61311 240187.2 210002.2  62301.12 136730.76
5 untrt_N052611 427435.1 360796.2 137783.10 232772.17
6 untrt_N061011 371144.2 408573.1  91066.80 212237.32
7 untrt_N080611 221687.5 258977.3  53813.92  86258.13
8  untrt_N61311 245667.7 212953.1  40996.34 137229.15
> ncol(sample)
[1] 1
> ncol(data_t)
[1] 27186
> ncol(data_t_m)
[1] 27188
> data_t_m[,27182:27188] ##後7列
     FSTL4   COL23A1    BEST2      IBSP      LTF     TAC1 conditions
1 1.515631 0.0000000 0.000000 0.0000000 1.722211 0.000000        trt
2 0.000000 1.7616370 1.181002 0.0000000 0.000000 0.000000        trt
3 0.000000 0.7821581 1.935371 0.7084139 2.398343 0.000000        trt
4 0.000000 3.7734772 2.274773 0.0000000 0.000000 3.956904        trt
5 5.018991 0.0000000 0.000000 4.1753179 0.000000 0.960596      untrt
6 0.000000 0.0000000 0.000000 0.0000000 0.000000 0.000000      untrt
7 2.512400 0.0000000 0.000000 0.0000000 2.629531 0.000000      untrt
8 0.000000 0.0000000 0.000000 0.0000000 0.000000 0.000000      untrt
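####不明白的地方是data_t和sample合併之後爲什麼行的順序會發生變化,且變化之後的順序又是什麼順序呢?
####(原因:merge()的參數sort默認爲TRUE,結果會按用來合併的鍵(這裏是行名)排序;行名是字符串,所以按字典序排列,trt_開頭的排在untrt_開頭的前面。如果想保持原順序,可以設置sort = FALSE,或者合併後再按原來的行名重新排序。)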
rownames(data_t_m) <- data_t_m$Row.names?
> rownames(data_t_m) <- data_t_m$Row.names ##把Row.names這一列(第一列)的內容設置爲行名,但這一列本身仍然保留在數據框裏!
> data_t <- data_t_m[,-1] ##去掉第一列(即Row.names列)。
> pca <- prcomp(data_t[,1:variableL],scale = T)
> fviz_eig(pca, addlabels = TRUE) ##addlabels參數爲TRUE時,在圖上顯示百分比。
> fviz_pca_ind(pca, repel=T) ##repel=T 自動調整文本位置
> fviz_pca_ind(pca)
後面略過,沒跟着學了
參考https://mp.weixin.qq.com/s/4R14xJkQVPtaufaoXOcPIw