Library size normalization
A sample's per-amplicon library size (read counts * amplicon length) divided by the average per-amplicon library size (sum(read counts * amplicon length) / number of amplicons).
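A minimal sketch of this formula in R; the vectors counts and amp_len are illustrative stand-ins, not objects from the ONCOCNV script, which implements the same idea as all.observations[,i]*data$len/totalTargetLen*length(data$len) in the full code below.

# Toy library-size normalization for one sample.
counts  <- c(120, 80, 300, 45)    # per-amplicon read counts (illustrative)
amp_len <- c(150, 120, 200, 100)  # amplicon lengths in bp (illustrative)

lib_size <- counts * amp_len                 # this sample's per-amplicon library size
mean_lib <- sum(lib_size) / length(amp_len)  # average per-amplicon library size
nrc <- lib_size / mean_lib                   # normalized read counts (NRC)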
[Figure: before normalization]
[Figure: after normalization]
Length and GC normalization with LOESS (performed on log NRC); a minimal sketch follows the figures below.
[Figure: before GC normalization]
[Figure: after GC normalization]
[Figure: after the second GC normalization (amplicons below or above the threshold after the first pass removed)]
[Figure: after GC normalization, before length normalization]
[Figure: after length normalization]
[Figure: scatter plot including amplicons with extreme GC and length values]
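A minimal sketch of the two GC LOESS passes followed by the length pass, assuming a single sample with illustrative vectors nrc, gc and len; the full script below runs the same loess calls for each control sample.

# Simulated single-sample data with a GC bias (illustrative only).
set.seed(1)
n   <- 500
gc  <- runif(n, 0.3, 0.7)                           # amplicon GC content
len <- round(runif(n, 80, 250))                     # amplicon length
nrc <- exp(0.5 * sin(6 * gc) + rnorm(n, sd = 0.2))  # GC-biased NRC

magicThreshold <- -2                       # same cutoff as in the script below
logNRC <- log(nrc)
keep <- which(nrc > 0 & logNRC > magicThreshold)

# Pass 1: remove the GC trend from log(NRC).
fit_gc1 <- loess(logNRC[keep] ~ gc[keep],
                 control = loess.control(surface = "direct"), degree = 2)
a1 <- logNRC - predict(fit_gc1, gc)

# Pass 2: refit GC on the amplicons that stayed above the threshold after pass 1.
keep2 <- which(nrc > 0 & a1 > magicThreshold)
fit_gc2 <- loess(a1[keep2] ~ gc[keep2],
                 control = loess.control(surface = "direct"), degree = 2)
resi <- a1 - predict(fit_gc2, gc)

# Finally remove the amplicon-length trend from the GC-corrected residuals.
fit_len <- loess(resi[keep2] ~ len[keep2],
                 control = loess.control(surface = "direct"), degree = 2)
corrected <- resi - predict(fit_len, len)   # corrected log NRC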
Take the amplicons with baseline > -2 and regress logNRC on the principal-component scores; the residuals remove the contribution of experimental error. Variance correction is then applied to these points (a minimal sketch follows the figures below).
[Figure: before PC normalization, logNRC vs PC1]
[Figure: after PC normalization, logNRC vs PC1]
[Figure: before PC normalization, logNRC vs PC2]
[Figure: after PC normalization, logNRC vs PC2]
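A minimal sketch of this regression step, assuming an illustrative PC-score matrix S (one row per amplicon) and simulated logNRC and baseline vectors; the real script fits rlm(observations[tt,i] ~ S[tt,]) for every control sample.

library(MASS)  # rlm, the robust regression used by the script below

set.seed(2)
n   <- 300
npc <- 3                                    # number of principal components kept
S   <- matrix(rnorm(n * npc), ncol = npc)   # illustrative PC scores per amplicon
logNRC   <- 0.03 + 0.005 * S[, 1] - 0.008 * S[, 2] + rnorm(n, sd = 0.05)
baseline <- rnorm(n)                        # illustrative amplicon baseline

tt  <- which(baseline > -2 & logNRC > -2)   # fit only on well-covered amplicons
fit <- rlm(logNRC[tt] ~ S[tt, ])

# Subtract the fitted PC contributions from ALL amplicons.
resi <- logNRC - fit$coefficients[1]
for (pc in 1:npc) resi <- resi - fit$coefficients[pc + 1] * S[, pc]
# resi is logNRC with the PC (experimental-noise) trends removed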
The linear-regression coefficients are very small, so the correction has only a minor effect:
Coefficients:
             Value    Std. Error  t value
(Intercept)   0.0287   0.0050      5.7175
S[tt, ]1      0.0054   0.0031      1.7508
S[tt, ]2     -0.0076   0.0041     -1.8356
S[tt, ]3     -0.0018   0.0040     -0.4615
S[tt, ]4     -0.0023   0.0028     -0.8080
S[tt, ]5     -0.0016   0.0021     -0.7327
S[tt, ]6     -0.0012   0.0027     -0.4470
S[tt, ]7      0.0000   0.0024      0.0087
S[tt, ]8     -0.0020   0.0018     -1.1415
S[tt, ]9     -0.0012   0.0021     -0.5675
S[tt, ]10    -0.0008   0.0016     -0.4773
S[tt, ]11    -0.0022   0.0018     -1.2418
S[tt, ]12     0.0010   0.0016      0.6654
S[tt, ]13    -0.0014   0.0018     -0.7660
S[tt, ]14    -0.0015   0.0013     -1.1455
It can also be seen that the variance of logNRC decreases as PC1 (logNRC) increases.
The overall variance clearly differs between samples: some samples are more dispersed than others.
Assume the final logNRC follows a normal distribution with mean 0 and a variance that can be explained by PC1.
Take the amplicons with logNRC > -2, compute the median absolute deviation (MAD) of all logNRC values (sd.all), then compute each sample's MAD (sd.sample). Finally, within each sample, correct the variance: logNRC = (logNRC - median(logNRC)) / (sd.sample / sd.all) + median(logNRC). A minimal sketch follows the figure below.
[Figure: after variance correction]
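A minimal sketch of the per-sample variance correction, using an illustrative matrix obs of corrected logNRC values (rows = amplicons, columns = samples).

set.seed(3)
obs <- cbind(rnorm(200, sd = 0.10), rnorm(200, sd = 0.25))  # sample 2 is noisier
magicThreshold <- -2

sd.all <- mad(as.vector(obs), na.rm = TRUE)       # pooled MAD over all samples
for (j in seq_len(ncol(obs))) {
  tt <- which(obs[, j] >= magicThreshold)
  sd.sample <- mad(obs[tt, j], na.rm = TRUE)      # this sample's MAD
  m <- median(obs[, j], na.rm = TRUE)
  # Rescale this sample's spread to the pooled spread while keeping its median.
  obs[, j] <- (obs[, j] - m) / (sd.sample / sd.all) + m
}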
Fit a LOESS regression of bin MAD against bin baseline to obtain a LOESS model, feed in all amplicon baselines to get predicted amplicon MAD values, then compute the MAD of each sample's amplicon logNRC values; dividing the predicted value by the largest per-sample MAD gives the ratio used as that amplicon's SD. A minimal sketch:
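The sketch below assumes illustrative obs (amplicons x samples, logNRC) and baseline objects; the full script does the same with possibleMu, sigma and sigma.spl.

set.seed(4)
n <- 2000
baseline <- sort(rnorm(n))                       # illustrative amplicon baseline
obs <- baseline + matrix(rnorm(n * 3, sd = 0.2), ncol = 3)

evalVal <- 50                                    # amplicons per bin, as in the script
numSeg  <- floor(n / evalVal)
binMu <- binMad <- numeric(numSeg)
for (i in 0:(numSeg - 1)) {
  idx <- (i * evalVal + 1):(i * evalVal + evalVal)
  binMu[i + 1]  <- mean(baseline[idx])
  # The noisiest sample defines the bin's MAD.
  binMad[i + 1] <- max(apply(obs[idx, ], 2, mad, center = 0, na.rm = TRUE))
}

# LOESS of bin MAD on bin baseline, then predict a MAD for every amplicon.
sigma.spl <- loess(binMad ~ binMu,
                   control = loess.control(surface = "direct"), degree = 2)
# Per-amplicon sd: predicted MAD scaled by the largest per-sample MAD.
sd_amp <- predict(sigma.spl, baseline) /
  max(apply(obs, 2, mad, center = 0, na.rm = TRUE))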
Full control-processing code (from ONCOCNV):
library(mclust) #for clustering to detect gender of control samples
library(MASS)
library(scales) #for graph transparency
library(fastICA) #for ICA
library(psych)
library(ggplot2)

inputFile <- "~/project/ONCOCNV_RES/Control.stats.txt"
controlFile <- "~/project/ONCOCNV_RES/MyControl.stats.Processed.txt"
GCfile <- "~/project/ONCOCNV_RES/target.GC.txt"

NUMBEROFPC=3
minFractionOfShortOrLongAmplicons=0.05 #change if needed
NUMBEROFPC=3
if (NUMBEROFPC<1) {
  NUMBEROFPC=3
}
if (NUMBEROFPC>3) {
  cat("Can keep maximum 3 principal components")
  NUMBEROFPC=3
}
magicThreshold=-2 #normalized read count less than this value will not be considered if it is not consistent between the control replicates
maxLoess=50000 #maximum number of points for the regression, starting from v6.5

dataTable <- read.table(inputFile, header=TRUE);
data <- data.frame(dataTable)
ncont <- length(data)-6
all.observations <- data[,7:length(data)]
data$len <- data$end-data$start
#pairs.panels(all.observations[,c(1:5)])

if (ncont >1) {
  totalTargetLen <- sum(all.observations[,1]*data$len)
} else {
  totalTargetLen <- sum(all.observations*data$len)
}

nulInd <- NULL
if (ncont >1) {
  for (i in (1:(ncont))) {
    nulInd <- c(nulInd, which(all.observations[,i]==0 | is.na(all.observations[,i])==TRUE |
                                data$chr=="chrX" | data$chr=="chrY" | data$chr=="X" | data$chr=="Y"))
    all.observations[,i] <- all.observations[,i]*data$len/totalTargetLen*length(data$len) #for ampli-based, correct back for amplicon length
  }
  noNulIdex <- c(1:length(all.observations[,1]))
} else {
  nulInd <- c(nulInd, which(all.observations==0 | is.na(all.observations)==TRUE |
                              data$chr=="chrX" | data$chr=="chrY" | data$chr=="X" | data$chr=="Y"))
  all.observations <- all.observations*data$len/totalTargetLen*length(data$len) #for ampli-based, correct back for amplicon length
  noNulIdex <- c(1:length(all.observations))
}
#pairs.panels(all.observations[,c(1:5)])

if (length(nulInd)>0) {
  indexNoNul <- noNulIdex[-sort(unique(nulInd))]
} else {
  indexNoNul <- noNulIdex
}
if (ncont >1) {
  observations <- all.observations[indexNoNul,]
} else {
  observations <- all.observations[indexNoNul]
}
noMakeNA <- NULL

if (ncont >1) {
  NUMBEROFPC = ncont-1;
  #detect female/male: starting from version 5.0:
  sex.vector=NULL
  tt <- which(all.observations[,i]>0 & is.na(all.observations[,i])==FALSE & (data$chr=="chrX" | data$chr=="X"))
  if (length(tt)>0) {
    for (i in (1:ncont)) {
      sex.vector[i] <- sum(all.observations[,i][tt])/sum(all.observations[tt,])*ncont
    }
    mc <- Mclust(sex.vector, G=c(1,2))
    #plot(mc)
    if (mc$G==2) {
      cat("Warning: you have both male and female samples in the control. We will try to assign sex using read coverage on chrX\n")
      sex.vector <- mc$classification/2
      cat(sex.vector); cat("\n")
    } else {
      cat("all your samples have the same number of chrX. We assume they are all male; change the code for assume they are all female\n")
      propX = sum(all.observations[tt,])/length(tt)/sum(all.observations[-tt,])*length(all.observations[-tt,1])
      for (i in (1:ncont)) {
        if (propX<0.9) {sex.vector[i] <- 0.5;} else {sex.vector[i] <- 1;}
        #sex.vector[i] <- 1 #uncomment if all your control samples are female
      }
    }
    #correct the number of reads on chr X for Gender
    #starting from version 5.0:
    for (i in (1:ncont)) {
      tt <- which(all.observations[,i]>0 & is.na(all.observations[,i])==FALSE &
                    (data$chr=="chrX" | data$chr=="chrY" | data$chr=="X" | data$chr=="Y"))
      all.observations[,i][tt] <- all.observations[,i][tt] / sex.vector[i]
    }
  }

  lmin <- quantile(data$len, probs=0.005, na.rm=TRUE)
  lmax <- quantile(data$len, probs=0.995, na.rm=TRUE)
  minFrac = length(which(data$len<lmin))/length(data$len) #strictly from v6.5
  maxFrac = length(which(data$len>lmax))/length(data$len)
  #remove too short or too long amplicons (only if there are less than 5% of them: parameter minFractionOfShortOrLongAmplicons)
  if (minFrac<minFractionOfShortOrLongAmplicons & maxFrac<minFractionOfShortOrLongAmplicons) {
    nulInd <- which(data$len<lmin | data$len>lmax)
  } else if (minFrac<minFractionOfShortOrLongAmplicons & maxFrac>=minFractionOfShortOrLongAmplicons) {
    nulInd <- unique(which(data$len<lmin))
  } else if (minFrac>=minFractionOfShortOrLongAmplicons & maxFrac<minFractionOfShortOrLongAmplicons) {
    nulInd <- unique(which(data$len>lmax))
  }

  Control.stats.GC <- read.delim(GCfile)
  data$gc <- Control.stats.GC$GC
  lmin <- quantile(data$gc, probs=0.01, na.rm=TRUE)
  lmax <- quantile(data$gc, probs=0.99, na.rm=TRUE)
  nulInd <- unique(c(nulInd, which(data$gc<=lmin | data$gc>=lmax)))

  data$mean <- rowMeans(all.observations)
  lmin <- quantile(data$mean, probs=0.01, na.rm=TRUE)
  lmax <- quantile(data$mean, probs=0.99, na.rm=TRUE)
  minFrac = length(which(data$mean<=lmin))/length(data$mean)
  maxFrac = length(which(data$mean>=lmax))/length(data$mean)
  #remove amplicons with too low or too high mean coverage (only if there are less than 5% of them: parameter minFractionOfShortOrLongAmplicons)
  if (minFrac<minFractionOfShortOrLongAmplicons & maxFrac<minFractionOfShortOrLongAmplicons) {
    nulInd <- unique(c(nulInd, which(data$mean<=lmin | data$mean>=lmax)))
  } else if (minFrac<minFractionOfShortOrLongAmplicons & maxFrac>=minFractionOfShortOrLongAmplicons) {
    nulInd <- unique(c(nulInd, which(data$mean<=lmin)))
  } else if (minFrac>=minFractionOfShortOrLongAmplicons & maxFrac<minFractionOfShortOrLongAmplicons) {
    nulInd <- unique(c(nulInd, which(data$mean>=lmax)))
  }

  noNulIdex <- c(1:length(all.observations[,1]))
  indexNoNul <- noNulIdex[-sort(unique(nulInd))]
  observations <- all.observations[indexNoNul,]
  correctedObs = matrix(nrow=length(all.observations[,1]), ncol=length(all.observations), byrow=T)

  for (i in (1:(ncont))) {
    # tt <- which(all.observations[,i][indexNoNul]>0 & log(all.observations[,i][indexNoNul])>magicThreshold)
    # gcCount.spl <- smooth.spline(data$gc[indexNoNul][tt], log(all.observations[,i][indexNoNul][tt]))
    # predictions <- predict(gcCount.spl,data$gc)$y
    # a1 <- log(all.observations[,i])-predictions
    # #recorrect at the tails of GC-content; need it because of the strange cases like control 3 and 9...
    # tt <- which(all.observations[,i][indexNoNul]>0 & a1[indexNoNul] > magicThreshold)
    # gcCount.spl <- smooth.spline(data$gc[indexNoNul][tt], a1[indexNoNul][tt])
    # predictions <- predict(gcCount.spl,data$gc)$y
    # resi <- a1-predictions
    # len.spl <- smooth.spline(data$len[indexNoNul][tt], resi[indexNoNul][tt])
    # correctedObs[,i] <- resi-predict(len.spl,data$len)$y
    # correctedObs[,i][-indexNoNul] <- NA

    tt <- which(all.observations[,i][indexNoNul]>0 & log(all.observations[,i][indexNoNul])>magicThreshold) #starting from 5.3 - LOESS
    if (length(tt)>maxLoess) {tt=sort(sample(tt, maxLoess, replace = FALSE))} #starting from v6.5 to use on exome data
    #plot(data$gc[indexNoNul][tt],log(all.observations[,i][indexNoNul][tt]),cex=0.5,pch=19,col="lightblue")
    gcCount.loess <- loess(log(all.observations[,i][indexNoNul][tt])~data$gc[indexNoNul][tt],
                           control = loess.control(surface = "direct"), degree=2) #loess starting from version 5.2
    predictions1 <- predict(gcCount.loess, data$gc)
    a1 <- log(all.observations[,i])-predictions1
    #plot(data$gc[indexNoNul][tt],a1[indexNoNul][tt],cex=0.5,pch=19,col="lightblue")
    tt <- which(all.observations[,i][indexNoNul]>0 & a1[indexNoNul] > magicThreshold)
    if (length(tt)>maxLoess) {tt=sample(tt, maxLoess, replace = FALSE)} #starting from v6.5 to use on exome data
    #plot(data$gc[indexNoNul][tt],a1[indexNoNul][tt],cex=0.5,pch=19,col="lightblue")
    gcCount.loess <- loess(a1[indexNoNul][tt]~data$gc[indexNoNul][tt],
                           control = loess.control(surface = "direct"), degree=2) #loess starting from version 5.2
    predictions <- predict(gcCount.loess, data$gc)
    resi <- a1-predictions
    #plot(data$gc[indexNoNul][tt],resi[indexNoNul][tt],cex=0.5,pch=19,col="lightblue")
    #plot(data$len[indexNoNul][tt],resi[indexNoNul][tt],cex=0.5,pch=19,col="lightblue")
    len.loess <- loess(resi[indexNoNul][tt]~data$len[indexNoNul][tt],
                       control = loess.control(surface = "direct"), degree=2)
    correctedObs[,i] <- resi-predict(len.loess, data$len)
    #plot(data$len[indexNoNul][tt],correctedObs[,i][indexNoNul][tt],cex=0.5,pch=19,col="lightblue")
    #plot(data$gc[indexNoNul][tt],correctedObs[,i][indexNoNul][tt],cex=0.5,pch=19,col="lightblue")
    #plot(data$len,correctedObs[,i],cex=0.5,pch=19,col="lightblue")
    #plot(data$gc,correctedObs[,i],cex=0.5,pch=19,col="lightblue")
  }

  for (i in (1:(ncont))) {
    correctedObs[,i][-indexNoNul] <- NA
    correctedObs[,i][which(is.infinite(correctedObs[,i]))] <- NA
    #plot(data$len,correctedObs[,i],col=2)
  }
  observations <- correctedObs[indexNoNul,] #they are already in LOG scale!!!

  makeNA <- NULL
  for (i in (1:ncont)) {
    for (j in (1:ncont)) {
      if (i!=j) {
        my.lm <- rlm((observations[,j]) ~ (observations[,i]))
        #hist((observations[,j])-my.lm$fitted,n=100)
        predictions <- (correctedObs[,i])*my.lm$coefficients[2]+my.lm$coefficients[1]
        tt <- (which((correctedObs[,j])-predictions<magicThreshold))
        tt2 <- which(is.na(correctedObs[,j]))
        makeNA <- unique(c(makeNA, tt, tt2))
      }
    }
  }
  if (length(makeNA)>0) {
    noNulIdex <- c(1:length(correctedObs[,1]))
    indexToFit <- noNulIdex[-sort(unique(makeNA))]
    indexToFit <- intersect(indexToFit, indexNoNul)
  } else {
    indexToFit <- indexNoNul
  }
  observations <- correctedObs[indexToFit,]

  #do ICA for the remaining points:
  X <- observations #observations are in LOG scale
  a <- fastICA(X, NUMBEROFPC, alg.typ = "parallel", fun = "logcosh", alpha = 1,
               method = "C", row.norm = FALSE, maxit = 200, tol = 0.000001, verbose = TRUE)
  my.PCA <- princomp(X)
  cumExplainVar <- cumsum(my.PCA[["sdev"]]^2)/sum(my.PCA[["sdev"]]^2)
  cat("Explained variance by the first principal components of PCA:")
  cat(names(cumExplainVar))
  cat(cumExplainVar)
  #screeplot(my.PCA)
  mainComp=1; myCor=0;
  for (i in (1:NUMBEROFPC)) {
    if (abs(cor(rowMeans(X), (a$X %*% a$K)[,i]))>myCor) {
      myCor=abs(cor(rowMeans(X), a$X %*% a$K[,i]));
      mainComp=i;
    }
  }
  if (cor(rowMeans(X), a$X %*% a$K[,mainComp])<0) {a$K=-a$K;}
  shifts <- colMeans(X)

  controlFilePDF <- paste(controlFile, ".pdf", sep="")
  pdf(file = controlFilePDF, width=7*ncont/3, height=7*ncont/3)
  par(mfrow=c((ncont-1),(ncont-1)))
  atmp <- NULL
  CellCounter=1
  for (i in (1:(ncont-1))) {
    for (j in (1:(ncont-1))) {
      if (i<=j) {atmp<-c(atmp,CellCounter); CellCounter=CellCounter+1} else {atmp<-c(atmp,0);}
    }
  }
  layout(matrix(c(atmp), ncont-1, ncont-1, byrow = TRUE))
  for (i in (1:(ncont-1))) {
    for (j in ((i+1):ncont)) {
      my.lm <- rlm((observations[,j]) ~ (observations[,i]))
      myCstr1 <- paste("control", i)
      myCstr2 <- paste("control", j)
      plot((observations[,i]), (observations[,j]), xlab=bquote(~NRC[.(myCstr1)]), ylab=bquote(~NRC[.(myCstr2)]),
           col="white", pch=21, cex=0.8) #553
      points((correctedObs[,i])[makeNA], (correctedObs[,j])[makeNA], col="darkgrey", bg=alpha("grey",0.5), pch=21, cex=0.8)
      tt <- which((observations[,i])<(observations[,j]))
      points((observations[,i])[tt], (observations[,j])[tt], col=colors()[553], bg=alpha(colors()[553],0.5), pch=21, cex=0.8)
      points((observations[,i])[-tt], (observations[,j])[-tt], col=colors()[90], bg=alpha(colors()[90],0.5), pch=21, cex=0.8)
      abline(my.lm, col="grey", lwd=2)
      abline(a=0, b=1, col="black", lwd=2, lty=3)
    }
  }
  dev.off()

} else {
  NUMBEROFPC = 1;
  #detect female/male: starting from version 5.0:
  tt <- which(all.observations>0 & is.na(all.observations)==FALSE & (data$chr=="chrX" | data$chr=="X"))
  tt2 <- which(all.observations>0 & is.na(all.observations)==FALSE & (data$chr!="chrX" | data$chr=="X"))
  if (length(tt)>0) {
    propX = sum(all.observations[tt])/length(tt)/(sum(all.observations[tt2])/length(tt2))
    if (propX<0.9) {sex.vector <- 0.5;} else {sex.vector <- 1;}
    #correct the number of reads on chr X for Gender
    #starting from version 5.0:
    tt <- which(all.observations>0 & is.na(all.observations)==FALSE &
                  (data$chr=="chrX" | data$chr=="chrY" | data$chr=="X" | data$chr=="Y"))
    all.observations[tt] <- all.observations[tt] / sex.vector
  }

  lmin <- quantile(data$len, probs=0.005, na.rm=TRUE)
  lmax <- quantile(data$len, probs=0.995, na.rm=TRUE)
  minFrac = length(which(data$len<lmin))/length(data$len) #strictly from v6.5
  maxFrac = length(which(data$len>lmax))/length(data$len)
  #remove too short or too long amplicons (only if there are less than 5% of them: parameter minFractionOfShortOrLongAmplicons)
  if (minFrac<minFractionOfShortOrLongAmplicons & maxFrac<minFractionOfShortOrLongAmplicons) {
    nulInd <- which(data$len<lmin | data$len>lmax)
  } else if (minFrac<minFractionOfShortOrLongAmplicons & maxFrac>=minFractionOfShortOrLongAmplicons) {
    nulInd <- unique(which(data$len<lmin))
  } else if (minFrac>=minFractionOfShortOrLongAmplicons & maxFrac<minFractionOfShortOrLongAmplicons) {
    nulInd <- unique(which(data$len>lmax))
  }

  Control.stats.GC <- read.delim(GCfile)
  data$gc <- Control.stats.GC$GC
  lmin <- quantile(data$gc, probs=0.01, na.rm=TRUE)
  lmax <- quantile(data$gc, probs=0.99, na.rm=TRUE)
  nulInd <- unique(c(nulInd, which(data$gc<=lmin | data$gc>=lmax)))

  data$mean <- all.observations
  lmin <- quantile(data$mean, probs=0.01, na.rm=TRUE)
  lmax <- quantile(data$mean, probs=0.99, na.rm=TRUE)
  nulInd <- unique(c(nulInd, which(data$mean<=lmin | data$mean>=lmax)))
  noNulIdex <- c(1:length(all.observations))
  indexNoNul <- noNulIdex[-sort(unique(nulInd))]
  observations <- all.observations[indexNoNul]

  tt <- which(all.observations[indexNoNul]>0 & log(all.observations[indexNoNul])>magicThreshold) #starting from 5.3 - LOESS
  if (length(tt)>maxLoess) {tt=sample(tt, maxLoess, replace = FALSE)} #starting from v6.5 to use on exome data
  gcCount.loess <- loess(log(all.observations[indexNoNul][tt])~data$gc[indexNoNul][tt],
                         control = loess.control(surface = "direct"), degree=2) #loess starting from version 5.2
  predictions1 <- predict(gcCount.loess, data$gc)
  a1 <- log(all.observations)-predictions1
  tt <- which(all.observations[indexNoNul]>0 & a1[indexNoNul] > magicThreshold)
  if (length(tt)>maxLoess) {tt=sample(tt, maxLoess, replace = FALSE)} #starting from v6.5 to use on exome data
  gcCount.loess <- loess(a1[indexNoNul][tt]~data$gc[indexNoNul][tt],
                         control = loess.control(surface = "direct"), degree=2) #loess starting from version 5.2
  predictions <- predict(gcCount.loess, data$gc)
  resi <- a1-predictions
  len.loess <- loess(resi[indexNoNul][tt]~data$len[indexNoNul][tt],
                     control = loess.control(surface = "direct"), degree=2)
  correctedObs <- resi-predict(len.loess, data$len)
  correctedObs[-indexNoNul] <- NA
  correctedObs[which(is.infinite(correctedObs))] <- NA
}

if (ncont >1) {
  #calculate baseline for all datapoints (should be similar to take a mean(log))
  X <- correctedObs #correctedObs are in LOG scale
  library(RColorBrewer)
  layout(matrix(data=c(1,2), nrow=1, ncol=2), widths=c(8,1), heights=c(1,1))
  par(mar = c(5,6,4,7), oma=c(0.2,0.2,0.2,0.2), mex=0.5)
  breaks=seq(-2,4,1)
  pal=brewer.pal(7,"YlOrRd")
  image(x=1:ncol(correctedObs[indexNoNul,]), y=1:nrow(correctedObs[indexNoNul,]), z=t(correctedObs[indexNoNul,]),
        xlab="control", ylab="amplicon", col=pal[2:7], breaks=breaks)
  par(mar = c(6,0,4,3))
  breaks2 <- breaks[-length(breaks)]
  image(x=1, y=0:length(breaks2), z=t(matrix(breaks2))*1.001, col=pal[(1:length(breaks2))+1],
        axes=FALSE, breaks=breaks, xlab="", ylab="", xaxt="n")
  axis(4, at=0:(length(breaks2)-1), labels=breaks2, col="white", las=1)
  abline(h=0:length(breaks2), col="grey", lwd=1, xpd=F)

  for (i in (1:(ncont))) {
    X[,i][which(is.infinite(X[,i]))]=NA
    X[,i] <- (X[,i]-shifts[i])
  }
  layout(matrix(data=c(1,2), nrow=1, ncol=2), widths=c(8,1), heights=c(1,1))
  par(mar = c(5,6,4,7), oma=c(0.2,0.2,0.2,0.2), mex=0.5)
  breaks=seq(-2,4,1)
  pal=brewer.pal(7,"YlOrRd")
  image(x=1:ncol(X[indexNoNul,]), y=1:nrow(X[indexNoNul,]), z=t(X[indexNoNul,]),
        xlab="control", ylab="amplicon", col=pal[2:7], breaks=breaks)
  par(mar = c(6,0,4,3))
  breaks2 <- breaks[-length(breaks)]
  image(x=1, y=0:length(breaks2), z=t(matrix(breaks2))*1.001, col=pal[(1:length(breaks2))+1],
        axes=FALSE, breaks=breaks, xlab="", ylab="", xaxt="n")
  axis(4, at=0:(length(breaks2)-1), labels=breaks2, col="white", las=1)
  abline(h=0:length(breaks2), col="grey", lwd=1, xpd=F)

  # weights <- a$K %*% a$W
  weights <- a$K
  X[is.na(X)] <- 0
  # S <- (X[indexNoNul,] %*% a$K %*% a$W) #ICs
  S <- (X[indexNoNul,] %*% a$K) #PCs
  observations <- correctedObs[indexNoNul,] #already in LOG scale
  baseLine <- NULL
  for (i in (1:(length(X[,1])))) {
    baseLine[i] <- weighted.mean(X[i,], w=weights[,mainComp], na.rm = TRUE)
  }

  for (i in (1:(ncont))) {
    tt <- which(baseLine[indexNoNul]>magicThreshold & observations[,i]>magicThreshold)
    my.lm <- rlm(observations[tt,i] ~ S[tt,])
    #qplot(S[tt,1],observations[tt,i],xlab="PC1",ylab="Sample1logNRC",alpha=0.1,colour=3)
    #qplot(S[tt,2],observations[tt,i],xlab="PC2",ylab="Sample1logNRC",alpha=0.1,colour=3)
    #qplot(S[tt,2],resi[tt],xlab="PC2",ylab="Sample1logNRC",alpha=0.1,colour=3)
    resi <- observations[,i]-my.lm$coefficients[1]
    for (PC in c(1:NUMBEROFPC)) {
      resi <- resi-my.lm$coefficients[PC+1]*S[,PC]
    }
    #plotRatioFree(observations[,i],data$chr[indexNoNul],data$start[indexNoNul])
    #check sign:
    ## if (my.lm$coefficients[mainComp+1]<0) {resi<- -resi} - bug in version 5.5
    observations[,i] <- resi
  }
  mu <- (baseLine[indexNoNul])
  correctedObs[indexNoNul,] <- observations

  data$IC1 <- NA
  if (NUMBEROFPC>1) { data$IC2 <- NA }
  if (NUMBEROFPC>2) { data$IC3 <- NA }
  tmp <- 1:NUMBEROFPC
  data$IC1[indexNoNul] <- S[,mainComp]
  tmp <- setdiff(tmp, mainComp)
  if (NUMBEROFPC>1) { data$IC2[indexNoNul] <- S[,tmp[1]] }
  if (NUMBEROFPC>2) { data$IC3[indexNoNul] <- S[,tmp[2]] }
} else {
  baseLine <- correctedObs
  data$IC1 = correctedObs
}

#par(mfrow=c(1,2))
#plot(S[tt,1],resi[tt],xlab="PC1",ylab="Sample2logNRC",col="lightblue",pch=19,cex=0.5)

if (ncont >1) {
  #observations <- correctedObs[indexToFit,] #fit SD on indexToFit points:
  #mu <- baseLine[indexToFit]
  observations <- correctedObs[indexNoNul,] #fit SD on "indexNoNul" points starting from v.5.2
  mu <- baseLine[indexNoNul]
  #some control samples are noisier than others; normalize for it starting from version 4.0
  sd.all <- mad(unlist(observations), na.rm=TRUE) #"mad" instead of "sd" starting from version 4.0 : 0.185[indexToFit] => 0.24[indexNoNul]
  for (j in (1:(ncont))) {
    #starting from version 4.0 correct for variance
    tt <- which(observations[,j]>=magicThreshold)
    sd.j <- mad(unlist(observations[,j][tt]), na.rm=TRUE)
    alph <- sd.j/sd.all
    observations[,j] <- (observations[,j]-median(observations[,j],na.rm=TRUE))/alph+median(observations[,j],na.rm=TRUE) #use starting from version 4.0
  }
  # mu <- baseLine[indexToFit] #fit SD on "indexNoNul" points starting from v.5.2
  #plot(S[,1],observations[,j],xlab="PC1",ylab="Sample1logNRC",col="lightblue",pch=19,cex=0.5)

  totalPoints <- length(mu)
  evalVal=50
  if (totalPoints/evalVal<5) { evalVal=10 }
  numSeg=floor(totalPoints/evalVal)
  possibleMu <- NULL
  sigma <- NULL
  observations <- observations[order(mu),]
  mu <- mu[order(mu)]
  for (i in c(0:(numSeg-1))) {
    possibleMu[i+1] <- mean(mu[(i*evalVal+1):(i*evalVal+evalVal)])
    sigma[i+1] <- max(apply(observations[(i*evalVal+1):(i*evalVal+evalVal),], FUN=mad, MARGIN=2, center=0, na.rm=TRUE)) #"mad" instead of "sd" starting from version 4.0
  }
  #sigma.spl <- smooth.spline(possibleMu, sigma)
  #data$sd <- predict(sigma.spl,baseLine)$y/max(apply(observations,FUN=mad,MARGIN=2,center=0,na.rm=TRUE))
  sigma.spl <- loess(sigma~possibleMu, control = loess.control(surface = "direct"), degree=2) #loess starting from version 5.2
  data$sd <- predict(sigma.spl, baseLine)/max(apply(observations, FUN=mad, MARGIN=2, center=0, na.rm=TRUE))
  #plot(possibleMu,sigma,col="orange",cex=0.5,pch=19)
  #lines(possibleMu,predict(sigma.spl,possibleMu),col="red",lwd=0.5)
  tt <- which(data$sd<0)
  if (length(tt)>0) {
    cat(paste("Warning:", length(tt), "of amplicons have negative SD!!!"))
    cat("Will assign the maximum value of SD")
    data$sd[tt] <- max(data$sd, na.rm=TRUE)
  }
} else {
  data$sd = rep(1, length(data$mean))
}

data$mean <- baseLine
data$mean[-indexNoNul] <- NA
#plot(baseLine[indexNoNul],data$sd[indexNoNul],col="orange",pch=19,cex=0.5)
#plot(baseLine[indexNoNul],data$sd[indexNoNul],col="orange",pch=19,cex=0.5,xlim=c(-2,3))
write.table(data, file=controlFile, quote=FALSE, sep="\t", row.names=FALSE)
ONCOCNV paper: https://academic.oup.com/bioinformatics/article/30/24/3443/2422154
ONCOCNV code: http://boevalab.com/ONCOCNV/