Commit 6b88e214 authored by Lena Hersemann's avatar Lena Hersemann

updated PCA with and without batch effect and updated to datautils/v1.42/R/ggplot_commons.R

parent fc0cca22
......@@ -18,7 +18,8 @@ Options:
--lfc <lfc_cutoff> Just report genes with abs(lfc) > lfc_cutoff as hits [default: 1.0]
'
#commandArgs <- function(x) c("--contrast", "Documents/deseq_example/contrasts.txt", "Documents/deseq_example/star_counts_matrix.txt", "Documents/deseq_example/basic_design.txt")
#commandArgs <- function(x) c("--design", "'batch+condition'", "--contrasts", "../contrasts.txt star_counts_matrix_reduced_sampleset.txt", "basic_design_reduced_sampleset.txt")
#commandArgs <- function(x) c("--design", "'condition'", "--contrasts", "../contrasts_pairwise.txt", "../alignments/star_counts_matrix.txt", "../basic_design_pairwise.txt")
opts = docopt(doc, commandArgs(TRUE))
......@@ -32,7 +33,7 @@ library(DESeq2)
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.40/R/core_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.40/R/ggplot_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.42/R/ggplot_commons.R")
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.40/R/bio/diffex_commons.R")
......@@ -141,6 +142,7 @@ designFormula
#' For more information about designs see section 1.6 "multi-factor designs" in [deseq-manual](https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.pdf)
#' The contrasts of interest are
#' The contrasts of interest are
contrasts %>% kable()
......@@ -160,7 +162,7 @@ contrasts %>% kable()
## new mf-approach
exDesign = data_frame(replicate = colnames(countMatrix)) %>%
mutate(col_index = row_number()) %>%
full_join(replicates2samples, by = "replicate") %>%
right_join(replicates2samples, by = "replicate") %>%
arrange(col_index) #%>% transmute(condition=sample_2ndwt) %>% fac2char
## infer the sample to be used from the formula
......@@ -192,9 +194,7 @@ dds = DESeqDataSetFromMatrix(countMatrix, exDesign, formula(as.formula(paste("~"
#dds = estimateDispersions(dds)
dds = DESeq(dds, parallel = T)
install_package("session")
session::save.session(".fc_debug.dat");
# session::restore.session(".fc_debug.dat");
########################################################################################################################
......@@ -244,19 +244,111 @@ plotDispEsts(dds, main = "Dispersion plot")
rld = rlogTransformation(dds)
#varstab = vst(dds) ## vst is supposed to be much 1k x faster
install_package("session")
session::save.session(".fc_debug.dat");
# session::restore.session(".fc_debug.dat");
#plotPCA(rld, intgroup = "sample")
# plotPCA(rld, intgroup = "batch")
# plotPCA(rld, intgroup = "run")
designFormula %>%
str_split("[+]") %>%
unlist %>%
map(~ plotPCA(rld, intgroup = .x) +
ggtitle(paste("effect of ", .x)) +
guides(color = guide_legend(title = .x))) %>%
walk(print)
#designFormula %>%
# str_split("[+]") %>%
# unlist %>%
# map(~ plotPCA(rld, intgroup = .x) +
# ggtitle(paste("effect of ", .x)) +
# guides(color = guide_legend(title = .x))) %>%
# walk(print)
# plotPCA(rld, intgroup = contrastAttribute)
load_pack(limma)
load_pack(stringi)
load_pack(htmltools)
library(plotly)
# extract rlogTransformation normalized count data
norm_counts <- assay(rld)
#
#for (i in {designFormula %>% str_split("[+]") %>% unlist}) {
# condition = pull(exDesign, i)
# names(condition) = exDesign$replicate
# # suppressWarnings(suppressMessages( makePcaPlot(t(corr_data), color_by = condition, scale = T, title = "DESeq normalized count data WITH batch correction") %>% ggplotly() %>% print))
# makePcaPlot(t(norm_counts), color_by = condition, scale = T, title = "DESeq normalized count data WITHOUT batch correction") %>% ggplotly() %>% print
#}
# effects = designFormula %>% str_split("[+]") %>% unlist
# htmltools::tagList(lapply(effects, function(batchFactor) {
# condition = pull(exDesign, batchFactor)
# names(condition) = exDesign$replicate
suppressWarnings(suppressMessages( makePcaPlot(t(corr_data), color_by = condition, scale = T, title = "DESeq normalized count data WITH batch correction") %>% ggplotly() %>% print))
# makePcaPlot(t(norm_counts), color_by = condition, scale = T, title = "DESeq normalized count data WITHOUT batch correction") %>% ggplotly()
# }))
htmltools::tagList(lapply(effects, function(batchFactor) {
# batchFactor=effects[1]
condition = pull(exDesign, batchFactor)
names(condition) = exDesign$replicate
# suppressWarnings(suppressMessages( makePcaPlot(t(corr_data), color_by = condition, scale = T, title = "DESeq normalized count data WITH batch correction") %>% ggplotly() %>% print))
{
makePcaPlot(t(norm_counts), color_by = condition, scale = T, title = "DESeq normalized count data WITHOUT batch correction") +
guides(color=guide_legend(title=batchFactor))
} %>% ggplotly()
}))
# res <- lapply(1:3, function(i) {
# fig <- list(data=list(list(x=rnorm(100), type="histogram")), layout=list(title=paste(c('plot', i, sep=" "))))
# fig %>% offline
# })
# htmltools::tagList(res)
# extract batch effect variables from the design formula and the corresponding data from the exDesign data.frame
adjvar <- designFormula %>% str_split("[+]") %>% unlist %>% stri_subset_regex("condition", negate = T)
adjvar_data <- data.frame(exDesign[,which(names(exDesign) %in% head(adjvar))])
if (length(adjvar_data) == 1 ){
corr_data <- removeBatchEffect(norm_counts, batch = adjvar_data[,1])
}else if (length(adjvar_data) == 2 ){
corr_data <- removeBatchEffect(norm_counts, batch = adjvar_data[,1], batch2 = adjvar_data[,2])
}else if (length(adjvar_data) > 2){
print ("Please note, the number of adjustment variables exceeds the maximal number of removable batch effects.")
}else{
print ("The design formula does not contain an adjustment variable for batch effect correction")
}
# for (i in colnames(adjvar_data)) {
#for (i in {designFormula %>% str_split("[+]") %>% unlist}) {
# condition = pull(exDesign, i)
# names(condition) = exDesign$replicate
# # suppressWarnings(suppressMessages( makePcaPlot(t(corr_data), color_by = condition, scale = T, title = "DESeq normalized count data WITH batch correction") %>% ggplotly() %>% print))
# makePcaPlot(t(corr_data), color_by = condition, scale = T, title = "DESeq normalized count data WITH batch correction") %>% ggplotly() %>% print
#}
htmltools::tagList(lapply(effects, function(batchFactor) {
# batchFactor=effects[1]
condition = pull(exDesign, batchFactor)
names(condition) = exDesign$replicate
# suppressWarnings(suppressMessages( makePcaPlot(t(corr_data), color_by = condition, scale = T, title = "DESeq normalized count data WITH batch correction") %>% ggplotly() %>% print))
{
makePcaPlot(t(corr_data), color_by = condition, scale = T, title = "DESeq normalized count data WITH batch correction") +
guides(color=guide_legend(title=batchFactor))
} %>% ggplotly()
}))
#' The Euclidean distance between samples are calculated after performing the regularized log transformation.
#' Using the calculated distance matrix, the samples are projected on a two-dimensional graph such that the distance between samples approximately corresponds to the biological coefficient of variation between those samples.
#' More information can be found here: https://en.wikipedia.org/wiki/Principal_component_analysis
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment