#!/usr/bin/env Rscript #' # Quality check of single-cell RNASeq data #' #' Created by: `r system("whoami", intern=T)` #' #' Created at: `r format(Sys.Date(), format="%B %d %Y")` #----------------------------------------------------------------------------------------------------------------------- #+ include=FALSE suppressMessages(require(docopt)) doc = ' Quality control and filtering of scRNAseq data Usage: sc_quality_check.R [options] Options: --file_prefix add prefix to output files if storing output of multiple analysis in one folder [default: ] --gene_info export gene information [default: FALSE] ' # commandArgs <- function(x) c("--gene_info", "TRUE", "count_data.txt", "cell_infos.txt") opts = docopt(doc, commandArgs(TRUE)) # LOAD packages ------------------------------------------------------------------------------- #https://www.bioconductor.org/help/workflows/simpleSingleCell/ devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.40/R/core_commons.R") devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.45/R/ggplot_commons.R") devtools::source_url("https://git.mpi-cbg.de/bioinfo/ngs_tools/raw/v10/dge_workflow/diffex_commons.R") devtools::source_url("https://git.mpi-cbg.de/bioinfo/ngs_tools/raw/v6/common/cp_utils.R") load_pack(SingleCellExperiment) load_pack(knitr) load_pack(scran) load_pack(NOISeq) load_pack(kableExtra) load_pack(plotly) # load_pack(bsplus) # load_pack(htmltools) load_pack(shiny) load_pack(htmltools) load_pack(BiocParallel) load_pack(data.table) load_pack(mvoutlier) load_pack(gridExtra) # FUNCTIONS ------------------------------------------------------------------------------------ get.cellcycl.phase <- function(x, annot.db){ cur.anno <- select(annot.db, rownames(x), keytype="ENSEMBL", "ENSEMBL") cur.ensembl <- anno$ENSEMBL[match(rownames(x), cur.anno$ENSEMBL)] cur.assignments <- cyclone(x, mm.pairs, gene.names = cur.ensembl) cur.phase <- cur.assignments$phases return(cur.phase) } # HANDLE input data ------------------------------------------------------------------------------- counts_file <- opts$count_data cell_infos_file <- opts$cell_infos prefix <- opts$file_prefix export_gene_info <- opts$gene_info countData <- fread(counts_file) countMatrix <- countData %>% column2rownames(colnames(countData)[str_detect(colnames(countData), "gene_id")]) %>% as.matrix() countMatrix <- countMatrix[rowSums(countMatrix)>0, ] cell_infos <- fread(cell_infos_file) %>% as.data.frame() if (ncol(countMatrix) != nrow(cell_infos)) {print("Count matrix and cell_infos have different sample numbers")} # create SingleCellExperiment object: sce <- SingleCellExperiment(assays = list(counts=countMatrix), colData = cell_infos) rm(countData) rm(countMatrix) # get gene information if requested if(any(str_detect(rownames(counts(sce)), "^FBgn|^ENSCAFG|^ENSMUSG|^ENSDARG|^ENSPTRG|^ENSG|^ENSCGRG"))) { db <- guess_anno_db(rownames(counts(sce))[which(str_detect(rownames(counts(sce)), "^FBgn|^ENSCAFG|^ENSMUSG|^ENSDARG|^ENSPTRG|^ENSG|^ENSCGRG"))]) } else { db <- "" } # if ensembl_gene_ids were provided, information on gene name, description and coordinates will be downloaded from biomaRt if (nchar(db) > 0) { mart <- biomaRt::useMart("ENSEMBL_MART_ENSEMBL", dataset = guess_mart(rownames(counts(sce))[1]), host = paste0("oct2018.archive.ensembl.org"), path = "/biomart/martservice", archive = FALSE) gene_info = biomaRt::getBM(mart = mart, c("ensembl_gene_id", "external_gene_name", "description", "chromosome_name", "start_position", "end_position") ) %>% tbl_df } else if (nchar(db) == 0){ gene_info = data.frame(gene_id = rownames(rowData(sce))) } if (export_gene_info == TRUE) { gene_info %>% write_tsv("gene_information.txt") } # Quality control and filtering -------------------------------------------------------------- #' Due to low quantities of RNA in single cells, scRNAseq data are much noiser than bulk RNAseq data. Here we perform #' quality control and filtering of scRNAseq data (i.e. removal of problamatic cells) by using quality metrices recommended by #' [Lun et al. 2017](https://www.bioconductor.org/help/workflows/simpleSingleCell/). #' #' Run configuration was vec_as_df(unlist(opts)) %>% filter(! str_detect(name, "^[<-]")) %>% kable() %>% kable_styling() # check for spike-ins and mitochondrial genes and include those information in the general calculation of quality metrices is.spike <- grepl("^ERCC", rownames(sce)) # is.mito <- grepl("^mt-", rownames(sce)) # sce <- scater::calculateQCMetrics(sce, feature_controls=list(ERCC=is.spike, Mt=is.mito)) sce <- scater::calculateQCMetrics(sce, feature_controls=list(ERCC=is.spike)) #' #'

#' #' ## General characteristics of this dataset # calculate quality control features and plot data ------------------- # median library size lib_median = median(sce$total_counts/1e6) plot_lib_median <- colData(sce) %>% as.data.frame() %>% ggplot(aes(total_counts/1e6)) + geom_histogram(col = "white", fill = "grey") + ylab("Number of cells") + xlab("Library size (millions)") + geom_vline(xintercept = lib_median, color = "red") # bs_modal(id = "lib_median_plot", title = paste("\n Median library size per cell in million: ", lib_median , sep = ""), body = htmltools::tagList(ggplotly(plot_lib_median, width = 800)), size = "large") # bs_modal(id = "lib_median", title = "Median library size per cell in million", body = "The library size is defined by the sum of all counts per cell and corresponds to the number of reads mapped to the reference genome") # number of expressed genes gene_median <- median(sce$total_features_by_counts) plot_expressed_genes <- colData(sce) %>% as.data.frame() %>% ggplot(aes(total_features_by_counts)) + geom_histogram(col = "white", fill = "grey") + ylab("Number of cells") + xlab("Number of expressed genes") + geom_vline(xintercept = gene_median, color = "red") # bs_modal(id = "expressed_genes_plot", title = paste("\n Median genes per cell: ", gene_median, sep = ""), body = htmltools::tagList(ggplotly(plot_expressed_genes, width = 800)), size = "large") # bs_modal(id = "expressed_genes", title = "Expressed genes per cell", body = "The number of expressed genes corresponds to the number of genes per cell with an actual count (i.e. > 0)") # proportion of spike-ins spike_data <- colData(sce) %>% as.data.frame() # bs_modal(id = "spikes", title = "Spike-ins", body = "Occassionally, spike-in RNAs are added before sequencing to serve as a standard for quality control, filtering and normalization. The proportion of spike-in RNAs will be reported if any spike-in counts are detected within the alignment results.") if (max(spike_data$pct_counts_ERCC) > 0) { spike_plot <- ggplot(spike_data, aes(pct_counts_ERCC)) + geom_histogram(col = "white", fill = "grey") + ylab("Number of cells") + xlab("ERCC proportion (%)") spike_ins <- "present" # bs_modal(id = "spike_plot", title = "Proportion of spike-ins:", body = htmltools::tagList(ggplotly(spike_plot, width = 800)), size = "large") # plot_spike_ins <- 'plot' plot_spike_ins <- spike_plot grid.arrange(plot_lib_median, plot_expressed_genes, plot_spike_ins, nrow = 1) } else { spike_ins <- "not present" plot_spike_ins <- "no data plotted" grid.arrange(plot_lib_median, plot_expressed_genes, nrow = 1) } #+ echo=FALSE, eval=TRUE # bs_modal(id = "total_features", title = "Total number of features", body = "Number of unique features (i.e. genes, spike-ins, etc.) in the whole data set.") # #+ include=TRUE # tribble(~feature, ~value, ~visualization, # # "test", "test", "test", 'Title' # 'Median library size', lib_median, 'plot', # 'Total number of features', nrow(counts(sce)), "no data plotted", # 'Expressed genes per cell', gene_median, 'plot', # 'Spike-ins', spike_ins, plot_spike_ins # # 'Mitochondrial genes', mito, plot_mito, # ) %>% kable(escape = TRUE) #---------------------------------------------------------- #' #'

#' #' ## Sequencing saturation #' The following saturation plot of the invidivudal cells is based on the sequencing depth of each cell as well as 6 higher #' and lower simulated sequencing depths. For this plot all features with a counts > 0 were taken into account. Due to memory usage #' only a subset of 500 cells will be used to indicate sequencing saturation of the data set. dat <- readData(data = counts(sce)[,1:500], factors = colData(sce)[1:500,]) mysaturation = dat(dat, k = 0, ndepth = 6, type = "saturation") sat_data <- dat2save(mysaturation) do.call(rbind,lapply(names(sat_data), function(x){ data <- sat_data[[x]] %>% as.data.frame() data.frame(cell = x, depth = data$depth, global = data$global) })) %>% ggplot(aes(depth/10^6, global, group = cell, alpha = 0.8)) + geom_point() + geom_line() + xlab("sequencing depth (million reads)") + ylab("Number of detected features") + ggtitle(paste("Sequencing saturation of 500 out of", nrow(colData(sce)), "cells", sep = " ")) + theme(legend.position="none") #' #'

#' #' ## Average counts per feature #' The following plot visualizes the log10 of the average counts per feature which are observed across all samples. Values with a log10(average count) >= 0 represent average counts of >= 1. scater::calcAverage(sce) %>% as.data.frame() %>% `colnames<-`(c("average")) %>% mutate(ensembl_gene_id = rownames(.)) %>% ggplot(aes(log10(average))) + geom_histogram(col = "white", binwidth = 0.1, fill = "grey") + ylab("frequency") + xlab(expression(paste(log[10], " average count"))) + # labs(title = x) + theme(plot.title = element_text(size = 10, face = "bold")) + geom_vline(xintercept = 0, color = "red", linetype=2) #' #'

#' #' ## 50 most highly abundant features #' Percentage of total counts assigned to the top 50 most highly-abundant features. Each bar represents the percentage count of the individual feature (i.e. gene, spike-in) #' in a single cell, coloured by the total number of features in that cell. scater::plotQC(sce, type = "highest-expression", exprs_values = "counts") top50 <- rowData(sce) %>% as.data.frame() %>% transmute(gene_id = rownames(.), total_counts) %>% arrange(desc(total_counts)) %>% slice(1:50) gene_info %>% right_join(top50, by = c("ensembl_gene_id" = "gene_id")) %>% arrange(desc(total_counts)) %>% transmute(gene_id = ensembl_gene_id, gene_name = external_gene_name, chr = chromosome_name, start = start_position, stop = end_position, description) %>% table_browser() #'

#' #' ## Frequency of expression #' The number of cells with non-zero expression and the mean expression should be positively correlated scater::plotExprsFreqVsMean(sce) #'

#' #' ## Check for outliers based on quality control metrices #' Quality control metrices include the number of total features, total counts, log10_total_features, pct_counts_top_50_features, pct_counts_top_100_features, pct_counts_top_200_features as well as pct_counts_top_500_features. # PCA <- colData(sce) %>% as.data.frame() %>% select(total_features, log10_total_features, log10_total_counts, pct_counts_top_50_features, pct_counts_top_100_features, pct_counts_top_200_features, pct_counts_top_500_features) %>% as.matrix() %>% prcomp() sce <- scater::runPCA(sce, use_coldata = TRUE, detect_outliers = TRUE) scater::plotReducedDim(sce, use_dimred="PCA_coldata") print("Number of outliers detected") summary(sce$outlier) # log-normalization sce <- normalize(sce) #'

#' #' ## Cell cycle phase #' Cells are classified as being (i) in G1 phase if the G1 score is above 0.5 and greater than the G2/M score, (ii) in G2/M phase if the G2/M score is above 0.5 and greater than the G1 score and (iii) in S phase if neither score is above 0.5. #' A file with the suffix '_cell_cycle_phases.txt' will be saved and can be subsequently used to either filter the cells or correct for differences in cell cycle phases as a batch effect. # So far only mouse and human cycle markers are available as standard data set for checking the cell cycle phase. # However, scran::sandbag() can be used to train a classifier for cell cylce phases in other organisms. #https://www.bioconductor.org/packages/devel/bioc/vignettes/AnnotationDbi/inst/doc/IntroToAnnotationPackages.pdf if (db == "org.Mm.eg.db"){ cc.pairs <- readRDS(system.file("exdata", "mouse_cycle_markers.rds", package="scran")) } else if (db == "org.Hs.eg.db") { cc.pairs <- readRDS(system.file("exdata", "human_cycle_markers.rds", package="scran")) } else { print("No cell cycle marker information found (if human or mouse data analyzed please make sure to provide ensembl gene ids)") } # phase <- data.frame(sample = character()) if (exists("cc.pairs")) { # system.time( assignments <- cyclone(sce, pairs = cc.pairs, BPPARAM=MulticoreParam(10)) ) # system.time( assignments <- cyclone(sce, pairs = cc.pairs) ) assignments <- cyclone(sce, pairs = cc.pairs, BPPARAM=MulticoreParam(10)) data <- cbind(assignments$scores, data.frame(cell_id = rownames(colData(sce)), phase = assignments$phases)) sce$phase <- data$phase # export cell cycle data with its actual data... data %>% write_tsv(paste(prefix, "cell_cycle_phases.txt", sep = "")) # ...and only the phase as additional batch effect in the cell_infos file cell_infos %>% left_join(data, by = "cell_id") %>% select(-G1, -S, -G2M) %>% write_tsv(paste(prefix, "basic_cell_infos_incl_ccp.txt", sep = "")) # plot data cc_plot <- ggplot(data, aes(G1, G2M, color = phase)) + geom_point(alpha = 0.3) + xlab("G1 score") + ylab("G2/M score") + coord_fixed(ratio = 1) print(cc_plot) datatable(data.frame(G1 = length(sce$phase[sce$phase == "G1"]), S = length(sce$phase[sce$phase == "S"]), G2M = length(sce$phase[sce$phase == "G2M"]))) cell_infos %<>% left_join(data %>% select(cell_id, phase)) } #'

#' #' ## Relationship of experimental factors and expression scater::plotExplanatoryVariables(sce, variables = c(colnames(cell_infos))) colData(sce) %>% as.data.frame() %>% write_tsv("scater_quality_metrics.txt") #----------------------------------------------------------------------------------------------------------------------- # get R version and package infos writeLines(capture.output(devtools::session_info()), ".sessionInfo.txt") session::save.session(paste(".", prefix, "sc_quality_check.dat", sep = "")) #session::restore.session(".quality_check_filtered_genes.dat")