Commit 241be808 authored by Lena Hersemann

started script for sub-clustering with Seurat

parent c01e9ddd
#!/usr/bin/env Rscript
#' # Subclustering with Seurat
#'
#' Created by: `r system("whoami", intern=T)`
#'
#' Created at: `r format(Sys.Date(), format="%B %d %Y")`
#-----------------------------------------------------------------------------------------------------------------------
#+ include=FALSE
suppressMessages(require(docopt))
doc = '
Perform subclustering of clusters identified for scRNAseq data using Seurat

Usage: subclustering.R [options] <count_matrix> <annot_data>

Options:
--hvg_selection <hvg_selection>   Cutoffs used to select highly variable genes (pattern: low_mean+high_mean+low_disp)
--regress_out <regress_out>       Variables to regress out [default: nUMI]
--test_pcs <test_pcs>             Number of PCs to compute and test for explained variance [default: 20]
--selected_pc <selected_pc>       Number of PCs to use for subclustering (PCs 1 to selected_pc)
'
# commandArgs = function(x) c("--selected_pc", "5", "--hvg_selection", "0.25+4+1.5", "counts/cluster_pc50_7.txt", "design/cluster_pc50_7.txt")
# commandArgs = function(x) c("--hvg_selection", "0.25+4+1.5", "counts/cluster_pc50_2.txt", "design/cluster_pc50_2.txt")
opts = docopt(doc, commandArgs(TRUE))
#-----------------------------------------------------------------------------------------------------------------------
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.49/R/core_commons.R")
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.49/R/ggplot_commons.R")
devtools::source_url("https://git.mpi-cbg.de/bioinfo/ngs_tools/raw/v10/dge_workflow/diffex_commons.R")
load_pack(knitr)
load_pack(Seurat)
load_pack(data.table)
load_pack(GGally)
load_pack(parallel)
#-----------------------------------------------------------------------------------------------------------------------
## process input arguments
count_matrix_file = opts$count_matrix
annot_data_file = opts$annot_data
cluster = str_replace_all(count_matrix_file, "counts/|.txt", "")
vec_as_df(unlist(opts)) %>%
filter(! str_detect(name, "^[<-]")) %>%
kable()
## load count matrix and annotation data
count_data <- fread(count_matrix_file)
count_matrix <- as.data.frame(count_data) %>%
column_to_rownames("ensembl_gene_id") %>%
as.matrix()
# convert to a plain data.frame so the rownames required by CreateSeuratObject are kept
pheno_data <- fread(annot_data_file) %>% as.data.frame()
rownames(pheno_data) <- pheno_data$cell
if (ncol(count_matrix) != nrow(pheno_data)) {stop("Count matrix and design have different sample numbers")}
## create Seurat object
# we do not filter at this stage, i.e. min.cells, min.genes and is.expr are set to 0
seo <- CreateSeuratObject(raw.data = count_matrix, meta.data = pheno_data, project = "SC2")
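# (illustrative sketch, not run) if the input were not already pre-filtered by the upstream
# clustering step, basic QC thresholds could be applied at creation time, e.g.:
# seo <- CreateSeuratObject(raw.data = count_matrix, meta.data = pheno_data,
#                           min.cells = 3, min.genes = 200, project = "SC2")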
rm(count_data)
## convert to sparse matrix
seo <- MakeSparse(seo)
## log-normalize count data
# LogNormalize scales each cell to `scale.factor` total counts and log-transforms the result:
# log1p(count / total_counts_of_cell * scale.factor), where log1p(x) = log(x + 1)
seo <- NormalizeData(object = seo, normalization.method = "LogNormalize", scale.factor = 10000)
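# worked example of the formula above: a gene with 10 raw counts in a cell with 5000 total
# counts becomes log1p(10 / 5000 * 10000) = log1p(20) ~ 3.04
# optional manual cross-check against Seurat's data slot (sketch, not run):
# norm_check <- log1p(sweep(count_matrix, 2, colSums(count_matrix), "/") * 10000)
# all.equal(as.numeric(norm_check), as.numeric(as.matrix(seo@data)))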
#' ## Detection of variable genes
dir.create(".hvg_selection", showWarnings = F)
if(is.null(opts$hvg_selection)) {
pdf(paste0(".hvg_selection/", cluster, ".pdf"))
seo <- FindVariableGenes(object = seo, mean.function = ExpMean, dispersion.function = LogVMR, x.low.cutoff = 1, x.high.cutoff = 1.1, y.cutoff = 0.5, display.progress = FALSE)
dev.off()
} else {
hvgs <- str_split(opts$hvg_selection, "[+]") %>% unlist() %>% as.numeric()
if(length(hvgs) != 3) {stop("ATTENTION: Please provide 3 parameters for the detection of highly variable genes (pattern: low_mean+high_mean+low_disp)")}
seo <- FindVariableGenes(object = seo, mean.function = ExpMean, dispersion.function = LogVMR, x.low.cutoff = hvgs[1], x.high.cutoff = hvgs[2], y.cutoff = hvgs[3], display.progress = FALSE)
}
# without --hvg_selection only the diagnostic dispersion plot above is produced; stop here so
# suitable cutoffs can be chosen from it and the script rerun with --hvg_selection
if(is.null(opts$hvg_selection)) { stop("ATTENTION: Please provide information for the --hvg_selection argument to proceed") }
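# sanity check (assuming the Seurat 2.x cutoff convention: x.low.cutoff < mean expression <
# x.high.cutoff and dispersion > y.cutoff): report how many genes were flagged as variable
message(length(seo@var.genes), " highly variable genes selected for ", cluster)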
## regress out
regress_out <- str_split(opts$regress_out, "[+]") %>% unlist()
seo <- ScaleData(object = seo, vars.to.regress = regress_out)
## run PCA
pcs <- as.numeric(opts$test_pcs)
seo <- RunPCA(object = seo, pc.genes = seo@var.genes, do.print = TRUE, pcs.compute = pcs, pcs.print = 0, genes.print = 0)
#' <br>
#' ## PCA results
# TODO change axis label font size
# pc_data <- GetCellEmbeddings(object = seo, reduction.type = "pca", dims.use = 1:pcs) %>%
# as.data.frame() %>%
# mutate(cell_id = rownames(.))
# vis_pc <- ifelse(pcs >= 10, 10, pcs)
# pc_data[,1:vis_pc] %>%
# ggpairs(., ggplot2::aes(), lower = list(continuous = wrap("points", alpha = 0.3, size=0.1), combo = wrap("dot", alpha = 0.4, size=0.2))) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
sdev <- GetDimReduction(
object = seo,
reduction.type = "pca",
slot = "sdev"
)
dir.create(".PCs", showWarnings = F)
data.frame(standard_deviation = sdev, variance = sdev^2) %>% mutate(pc = 1:n()) %>%
gather(method, value, -pc) %>%
ggplot(aes(pc, value)) +
geom_point() +
facet_wrap(~method, scale = "free") +
ggtitle(cluster)
ggsave(paste0(".PCs/", cluster, ".pdf"))
if(is.null(opts$selected_pc)) { stop("ATTENTION: Please specify with --selected_pc which PC you would like to use for clustering") }
sel_pc <- opts$selected_pc %>% as.numeric
seo <- FindClusters(object = seo, reduction.type = "pca", dims.use = 1:sel_pc, k.param = 30, n.start = 100, algorithm = 1, n.iter = 10, modularity.fxn = 1, prune.SNN = 1/15, resolution = 0.8, print.output = FALSE, save.SNN = TRUE)
# export cluster information
dir.create(".cluster", showWarnings = F)
cluster_info <- seo@ident %>%
as.data.frame() %>%
setNames(paste0("cluster_pc", sel_pc)) %>%
mutate(cell_id = rownames(.)) %>%
push_left("cell_id")
cluster_info %>% write_tsv(paste0(".cluster/", cluster, ".txt"))
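# quick overview of the resulting sub-cluster sizes
table(seo@ident) %>% as.data.frame() %>% setNames(c("sub_cluster", "num_cells")) %>% kable()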
# tsne
dir.create(".tSNE", showWarnings = F)
seo <- RunTSNE(object = seo, dims.use = 1:sel_pc, perplexity=20)
TSNEPlot(object = seo, do.label = TRUE) + ggtitle(paste0(cluster))
ggsave(paste0(".tSNE/", cluster, ".pdf"))
# find markers of clusters
dir.create(".marker", showWarnings = F)
seo.markers <- FindAllMarkers(object = seo, only.pos = TRUE, min.pct = 0.1)
seo.markers %>% write_tsv(paste0(".marker/", cluster, ".txt"))
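# compact view of the marker table (sketch, assuming the avg_logFC column reported by
# Seurat 2.x FindAllMarkers): top 5 markers per sub-cluster
seo.markers %>% group_by(cluster) %>% top_n(5, avg_logFC) %>% arrange(cluster, desc(avg_logFC)) %>% kable()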
#-----------------------------------------------------------------------------------------------------------------------
# writeLines(capture.output(devtools::session_info()), ".sessionInfo.txt")
# session::save.session(".seurat_analysis.R.dat");
# session::restore.session(".seurat_analysis.R.dat")