Commit 3ac57df7 authored by Lena Hersemann's avatar Lena Hersemann

replaced `library()` by `loadpack`; updated functions for the current scater...

replaced `library()` by `loadpack`; updated functions for the current scater version; added PCA based on quality metrices in order to detect outliers; added explanatory variables plot based on all input information in the design file; exports now information on gene name, description and coordinates if ensembl gene ids are provided in the count matrix; added code to copy the used script to the working directory and to export session information
parent 9701af73
#!/usr/bin/env Rscript
#+ include=FALSE
#**************************************************************
#***********************************************************************************************************************
#' # Quality check of single-cell RNASeq data
#+ include=FALSE
#**************************************************************
# https://www.bioconductor.org/help/workflows/simpleSingleCell/
#'
#' Created by: `r system("whoami", intern=T)`
#'
#' Created at: `r format(Sys.Date(), format="%B %d %Y")`
#***********************************************************************************************************************
#+ include=FALSE
suppressMessages(require(docopt))
doc = '
......@@ -20,7 +23,7 @@ Options:
# commandArgs <- function(x) c("../metrices/SC1_star_counts_matrix.txt", "../metrices/SC1_basic_design.txt")
# commandArgs <- function(x) c("../segerstolpe_counts.txt", "../segerstolpe_infos.txt")
opts = docopt(doc, commandArgs(TRUE))
......@@ -30,23 +33,19 @@ opts = docopt(doc, commandArgs(TRUE))
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.40/R/core_commons.R")
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.45/R/ggplot_commons.R")
devtools::source_url("https://git.mpi-cbg.de/bioinfo/ngs_tools/raw/v6/common/cp_utils.R")
library(knitr)
# require(gridExtra)
library(scran)
library(NOISeq)
# install.packages("kableExtra")
library(kableExtra)
library(plotly)
library(bsplus)
library(htmltools)
library(shiny)
library(htmltools)
library(BiocParallel)
library(data.table)
# # library(scater)
# # detach("package:scater", unload = TRUE)
# load_pack(SingleCellExperiment)
#
load_pack(SingleCellExperiment)
load_pack(knitr)
load_pack(scran)
load_pack(NOISeq)
load_pack(kableExtra)
load_pack(plotly)
load_pack(bsplus)
load_pack(htmltools)
load_pack(shiny)
load_pack(htmltools)
load_pack(BiocParallel)
load_pack(data.table)
load_pack(mvoutlier)
# FUNCTIONS ------------------------------------------------------------------------------------
......@@ -69,13 +68,12 @@ prefix <- opts$file_prefix
countData <- fread(counts_file)
countMatrix <- countData %>%
mutate(sum = select(., -ensembl_gene_id) %>% rowSums()) %>%
filter(sum > 0) %>%
select(-sum) %>%
column2rownames("ensembl_gene_id") %>%
column2rownames(colnames(countData)[str_detect(colnames(countData), "gene_id")]) %>%
as.matrix()
countMatrix <- countMatrix[rowSums(countMatrix)>0, ]
design <- read_tsv(design_file)
design <- fread(design_file) %>% as.data.frame()
if (ncol(countMatrix) != nrow(design)) {print("Count matrix and design have different sample numbers")}
......@@ -84,6 +82,9 @@ if (ncol(countMatrix) != nrow(design)) {print("Count matrix and design have diff
sce <- SingleCellExperiment(assays = list(counts=countMatrix), colData = design)
rm(countData)
rm(countMatrix)
# Quality control and filtering --------------------------------------------------------------
......@@ -121,10 +122,10 @@ bs_modal(id = "lib_median", title = "Median library size per cell in million", b
# number of expressed genes
gene_median <- median(sce$total_features_endogenous)
gene_median <- median(sce$total_features_by_counts_endogenous)
plot_expressed_genes <- colData(sce) %>%
as.data.frame() %>%
ggplot(aes(total_features)) +
ggplot(aes(total_features_by_counts_endogenous)) +
geom_histogram(col = "white", fill = "grey") +
ylab("Number of cells") +
xlab("Number of expressed genes") +
......@@ -140,7 +141,7 @@ if (max(spike_data$pct_counts_ERCC) > 0) {
spike_plot <- ggplot(spike_data, aes(pct_counts_ERCC)) +
geom_histogram(col = "white", fill = "grey") +
ylab("Number of cells") +
xlab("ERCC proportion (%)") +
xlab("ERCC proportion (%)")
spike_ins <- "present"
bs_modal(id = "spike_plot", title = "Proportion of spike-ins:", body = htmltools::tagList(ggplotly(spike_plot, width = 800)), size = "large")
plot_spike_ins <- '<a data-toggle="modal" data-target="#spike_plot">plot</a>'
......@@ -203,8 +204,7 @@ do.call(rbind,lapply(names(sat_data), function(x){
xlab("sequencing depth (million reads)") +
ylab("Number of detected features") +
ggtitle(paste("Sequencing saturation of 500 out of", nrow(colData(sce)), "cells", sep = " ")) +
theme(legend.position="none") +
coord_fixed(ratio = 10/10^6)
theme(legend.position="none")
......@@ -225,25 +225,32 @@ scater::calcAverage(sce) %>%
#' in a single cell, coloured by the total number of features in that cell.
scater::plotQC(sce, type = "highest-expression", exprs_values = "counts")
# # <br><br>
# # ## Frequency of expression
# # Frequency of number of cells with expression for a gene above the defined threshold vs mean expression level.
# scater::plotQC(sce, type = "exprs-freq-vs-mean")
#' <br><br>
#' ## Frequency of expression
#' The number of cells with non-zero expression and the mean expression should be positively correlated
scater::plotExprsFreqVsMean(sce)
#' <br><br>
#' ## Check for outliers based on quality control metrices
#' Quality control metrices include the number of total features, total counts, log10_total_features, pct_counts_top_50_features, pct_counts_top_100_features, pct_counts_top_200_features as well as pct_counts_top_500_features.
# PCA <- colData(sce) %>% as.data.frame() %>% select(total_features, log10_total_features, log10_total_counts, pct_counts_top_50_features, pct_counts_top_100_features, pct_counts_top_200_features, pct_counts_top_500_features) %>% as.matrix() %>% prcomp()
PCA <- colData(sce) %>% as.data.frame() %>% select(total_features, log10_total_counts, pct_counts_top_50_features, pct_counts_top_100_features, pct_counts_top_200_features, pct_counts_top_500_features) %>% as.matrix() %>% prcomp()
percent <- round((((PCA$sdev)^2 / sum(PCA$sdev^2))*100))
PCA$x %>%
as.data.frame() %>%
ggplot(aes(PC1, PC2)) +
geom_point() +
xlab(paste("PC1 (", percent[1], "%)", sep = "")) +
ylab(paste("PC2 (", percent[2], "%)", sep = ""))
sce <- scater::runPCA(sce, use_coldata = TRUE, detect_outliers = TRUE)
scater::plotReducedDim(sce, use_dimred="PCA_coldata")
print("Number of outliers detected")
summary(sce$outlier)
#' <br><br>
#' ## Relationship of experimental factors and expression
# log-normalization
sce <- normalize(sce)
scater::plotExplanatoryVariables(sce, variables = c(colnames(design)))
#' <br><br>
......@@ -254,14 +261,20 @@ PCA$x %>%
# So far only mouse and human cycle markers are available as standard data set for checking the cell cycle phase.
# However, scran::sandbag() can be used to train a classifier for cell cylce phases in other organisms.
#https://www.bioconductor.org/packages/devel/bioc/vignettes/AnnotationDbi/inst/doc/IntroToAnnotationPackages.pdf
db <- guess_anno_db(rownames(counts(sce))[1])
if(any(str_detect(rownames(counts(sce)), "^FBgn|^ENSCAFG|^ENSMUSG|^ENSDARG|^ENSPTRG|^ENSG|^ENSCGRG"))) {
db <- guess_anno_db(rownames(counts(sce))[1])
} else {
db <- ""
}
if (db == "org.Mm.eg.db"){
cc.pairs <- readRDS(system.file("exdata", "mouse_cycle_markers.rds", package="scran"))
} else if (db == "org.Hs.eg.db") {
cc.pairs <- readRDS(system.file("exdata", "human_cycle_markers.rds", package="scran"))
} else {
print("For this organism no cell cycle markers are provided by the Bioconductor package scran")
print("No cell cycle marker information found (if human or mouse data analyzed please make sure to provide ensembl gene ids)")
}
# phase <- data.frame(sample = character())
......@@ -269,7 +282,7 @@ if (exists("cc.pairs")) {
# system.time( assignments <- cyclone(sce, pairs = cc.pairs, BPPARAM=MulticoreParam(10)) )
# system.time( assignments <- cyclone(sce, pairs = cc.pairs) )
assignments <- cyclone(sce, pairs = cc.pairs, BPPARAM=MulticoreParam(10))
data <- cbind(assignments$scores, data.frame(cell = rownames(colData(sce)), phase = assignments$phases))
data <- cbind(assignments$scores, data.frame(cell_id = rownames(colData(sce)), phase = assignments$phases))
sce$phase <- data$phase
......@@ -277,7 +290,7 @@ if (exists("cc.pairs")) {
data %>% write_tsv(paste(prefix, "cell_cycle_phases.txt", sep = ""))
# ...and only the phase as additional batch effect in the design file
design %>%
left_join(data, by = "cell") %>%
left_join(data, by = "cell_id") %>%
select(-G1, -S, -G2M) %>%
write_tsv(paste(prefix, "basic_design_incl_ccp.txt", sep = ""))
......@@ -293,7 +306,25 @@ if (exists("cc.pairs")) {
}
# if ensembl_gene_ids were provided, information on gene name, description and coordinates will be downloaded from biomaRt
if (nchar(db) > 0) {
mart <- biomaRt::useMart("ENSEMBL_MART_ENSEMBL", dataset = guess_mart(rownames(counts(sce))[1]), host = paste0("oct2018.archive.ensembl.org"), path = "/biomart/martservice", archive = FALSE)
gene_info = biomaRt::getBM(mart = mart, c("ensembl_gene_id", "external_gene_name", "description", "chromosome_name", "start_position", "end_position") ) %>% tbl_df
} else {
gene_info = data.frame(gene_id = rownames(rowData(sce)))
}
gene_info %>% write_tsv("gene_information.txt")
colData(sce) %>% as.data.frame() %>% write_tsv("scater_quality_metrics.txt")
#-----------------------------------------------------------------------------------------------------------------------
# save copy of the used sc_quality_check.R script
system("cp ${NGS_TOOLS}/sc_workflow/sc_quality_check.R ./.ngs_tools_sc_quality_check_$(date +%Y%m%d_%H%M).R")
# get R version and package infos
writeLines(capture.output(devtools::session_info()), ".sessionInfo.txt")
#-----------------------------------------------------------
session::save.session(paste(".", prefix, "quality_check_filtered_genes.dat", sep = ""))
#session::restore.session(".quality_check_filtered_genes.dat")
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment