sc_quality_check.R 14.9 KB
Newer Older
1 2
#!/usr/bin/env Rscript

Lena Hersemann's avatar
Lena Hersemann committed
3
#' # Quality check of single-cell RNASeq data
4 5 6 7
#'
#' Created by: `r system("whoami", intern=T)`
#'
#' Created at: `r format(Sys.Date(), format="%B %d %Y")`
8

9
#-----------------------------------------------------------------------------------------------------------------------
10
#+ include=FALSE
11 12 13 14
suppressMessages(require(docopt))

doc = '
Quality control and filtering of scRNAseq data
15
Usage: sc_quality_check.R [options] <count_data> <cell_infos>
16 17 18

Options:
--file_prefix <sample_name>          add prefix to output files if storing output of multiple analysis in one folder [default: ]
19
--gene_info <gene_info>    export gene information [default: FALSE]
20 21
'

22 23


24
# commandArgs <- function(x) c("--gene_info", "TRUE", "count_data.txt", "cell_infos.txt")
25 26 27 28 29 30
opts = docopt(doc, commandArgs(TRUE))



# LOAD packages -------------------------------------------------------------------------------
#https://www.bioconductor.org/help/workflows/simpleSingleCell/
31 32
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.40/R/core_commons.R")
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.45/R/ggplot_commons.R")
33
devtools::source_url("https://git.mpi-cbg.de/bioinfo/ngs_tools/raw/v10/dge_workflow/diffex_commons.R")
34
devtools::source_url("https://git.mpi-cbg.de/bioinfo/ngs_tools/raw/v6/common/cp_utils.R")
35 36 37 38 39 40
load_pack(SingleCellExperiment)
load_pack(knitr)
load_pack(scran)
load_pack(NOISeq)
load_pack(kableExtra)
load_pack(plotly)
41 42
# load_pack(bsplus)
# load_pack(htmltools)
43 44 45 46 47
load_pack(shiny)
load_pack(htmltools)
load_pack(BiocParallel)
load_pack(data.table)
load_pack(mvoutlier)
48
load_pack(gridExtra)
49 50 51 52 53 54 55 56 57 58 59 60 61


# FUNCTIONS ------------------------------------------------------------------------------------

get.cellcycl.phase <- function(x, annot.db){
    cur.anno <- select(annot.db, rownames(x), keytype="ENSEMBL", "ENSEMBL")
    cur.ensembl <- anno$ENSEMBL[match(rownames(x), cur.anno$ENSEMBL)]
    cur.assignments <- cyclone(x, mm.pairs, gene.names = cur.ensembl)
    cur.phase <- cur.assignments$phases
    return(cur.phase)
}


62 63
# HANDLE input data -------------------------------------------------------------------------------

64 65
counts_file <- opts$count_data
cell_infos_file <- opts$cell_infos
66
prefix <- opts$file_prefix
67
export_gene_info <- opts$gene_info
68 69 70 71

countData <- fread(counts_file)

countMatrix <- countData %>%
72
    column2rownames(colnames(countData)[str_detect(colnames(countData), "gene_id")]) %>%
73
    as.matrix()
74
countMatrix <- countMatrix[rowSums(countMatrix)>0, ]
75

76

77
cell_infos <- fread(cell_infos_file) %>% as.data.frame()
78

79
if (ncol(countMatrix) != nrow(cell_infos)) {print("Count matrix and cell_infos have different sample numbers")}
80 81 82


# create SingleCellExperiment object:
83
sce <- SingleCellExperiment(assays = list(counts=countMatrix), colData = cell_infos)
84 85


86 87 88
rm(countData)
rm(countMatrix)

89

90 91
# get gene information if requested
if(any(str_detect(rownames(counts(sce)), "^FBgn|^ENSCAFG|^ENSMUSG|^ENSDARG|^ENSPTRG|^ENSG|^ENSCGRG"))) {
92
    db <- guess_anno_db(rownames(counts(sce))[which(str_detect(rownames(counts(sce)), "^FBgn|^ENSCAFG|^ENSMUSG|^ENSDARG|^ENSPTRG|^ENSG|^ENSCGRG"))])
93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
} else {
    db <- ""
}


# if ensembl_gene_ids were provided, information on gene name, description and coordinates will be downloaded from biomaRt
if (nchar(db) > 0) {
    mart <- biomaRt::useMart("ENSEMBL_MART_ENSEMBL", dataset = guess_mart(rownames(counts(sce))[1]), host = paste0("oct2018.archive.ensembl.org"), path = "/biomart/martservice", archive = FALSE)
    gene_info = biomaRt::getBM(mart = mart, c("ensembl_gene_id", "external_gene_name", "description", "chromosome_name", "start_position", "end_position") ) %>% tbl_df
} else if (nchar(db) == 0){
    gene_info = data.frame(gene_id = rownames(rowData(sce)))
}

if (export_gene_info == TRUE) { gene_info %>% write_tsv("gene_information.txt") }


109 110 111 112 113
# Quality control and filtering --------------------------------------------------------------

#' Due to low quantities of RNA in single cells, scRNAseq data are much noiser than bulk RNAseq data. Here we perform
#' quality control and filtering of scRNAseq data (i.e. removal of problamatic cells) by using quality metrices recommended by
#' [Lun et al. 2017](https://www.bioconductor.org/help/workflows/simpleSingleCell/).
114 115
#'
#' Run configuration was
116 117
vec_as_df(unlist(opts)) %>%
    filter(! str_detect(name, "^[<-]")) %>%
118
    kable() %>% kable_styling()
119 120

# check for spike-ins and mitochondrial genes and include those information in the general calculation of quality metrices
121
is.spike <- grepl("^ERCC", rownames(sce))
122 123 124
# is.mito <- grepl("^mt-", rownames(sce))
# sce <- scater::calculateQCMetrics(sce, feature_controls=list(ERCC=is.spike, Mt=is.mito))
sce <- scater::calculateQCMetrics(sce, feature_controls=list(ERCC=is.spike))
125

126
#'
127
#'<br><br>
128 129
#'
#' ## General characteristics of this dataset
130 131 132
# calculate quality control features and plot data -------------------

# median library size
133
lib_median = median(sce$total_counts/1e6)
134 135 136 137 138 139
plot_lib_median <- colData(sce) %>%
    as.data.frame() %>%
    ggplot(aes(total_counts/1e6)) +
        geom_histogram(col = "white", fill = "grey") +
        ylab("Number of cells") +
        xlab("Library size (millions)") +
140
        geom_vline(xintercept = lib_median, color = "red")
141
# bs_modal(id = "lib_median_plot", title = paste("\n Median library size per cell in million: ", lib_median , sep = ""), body = htmltools::tagList(ggplotly(plot_lib_median, width = 800)), size = "large")
142

143
# bs_modal(id = "lib_median", title = "Median library size per cell in million", body = "The library size is defined by the sum of all counts per cell and corresponds to the number of reads mapped to the reference genome")
144 145


146
# number of expressed genes
147
gene_median <- median(sce$total_features_by_counts)
148 149
plot_expressed_genes <- colData(sce) %>%
    as.data.frame() %>%
150
    ggplot(aes(total_features_by_counts)) +
151 152 153 154
        geom_histogram(col = "white", fill = "grey") +
        ylab("Number of cells") +
        xlab("Number of expressed genes") +
        geom_vline(xintercept = gene_median, color = "red")
155 156
# bs_modal(id = "expressed_genes_plot", title = paste("\n Median genes per cell: ", gene_median, sep = ""), body = htmltools::tagList(ggplotly(plot_expressed_genes, width = 800)), size = "large")
# bs_modal(id = "expressed_genes", title = "Expressed genes per cell", body = "The number of expressed genes corresponds to the number of genes per cell with an actual count (i.e. > 0)")
157 158 159 160 161 162 163 164 165


# proportion of spike-ins
spike_data <- colData(sce) %>% as.data.frame()
# bs_modal(id = "spikes", title = "Spike-ins", body = "Occassionally, spike-in RNAs are added before sequencing to serve as a standard for quality control, filtering and normalization. The proportion of spike-in RNAs will be reported if any spike-in counts are detected within the alignment results.")
if (max(spike_data$pct_counts_ERCC) > 0) {
    spike_plot <- ggplot(spike_data, aes(pct_counts_ERCC)) +
        geom_histogram(col = "white", fill = "grey") +
        ylab("Number of cells") +
166
        xlab("ERCC proportion (%)")
167
    spike_ins <- "present"
168 169 170 171
    # bs_modal(id = "spike_plot", title = "Proportion of spike-ins:", body = htmltools::tagList(ggplotly(spike_plot, width = 800)), size = "large")
    # plot_spike_ins <- '<a data-toggle="modal" data-target="#spike_plot">plot</a>'
    plot_spike_ins <- spike_plot
    grid.arrange(plot_lib_median, plot_expressed_genes, plot_spike_ins, nrow = 1)
172 173 174
} else {
    spike_ins <- "not present"
    plot_spike_ins <- "no data plotted"
175
    grid.arrange(plot_lib_median, plot_expressed_genes, nrow = 1)
176 177
}

178 179 180 181



#+ echo=FALSE, eval=TRUE
182 183 184 185 186 187 188 189 190 191 192 193
# bs_modal(id = "total_features", title = "Total number of features", body = "Number of unique features (i.e. genes, spike-ins, etc.) in the whole data set.")

# #+ include=TRUE
# tribble(~feature, ~value, ~visualization,
# # "test", "test", "test", '<a data-toggle="modal" data-target="#modal">Title</a>'
# '<a data-toggle="modal" data-target="#lib_median">Median library size</a>', lib_median, '<a data-toggle="modal" data-target="#lib_median_plot">plot</a>',
# '<a data-toggle="modal" data-target="#total_features">Total number of features</a>', nrow(counts(sce)), "no data plotted",
# '<a data-toggle="modal" data-target="#expressed_genes">Expressed genes per cell</a>', gene_median, '<a data-toggle="modal" data-target="#expressed_genes_plot">plot</a>',
# '<a data-toggle="modal" data-target="#spikes">Spike-ins</a>', spike_ins, plot_spike_ins
# # '<a data-toggle="modal" data-target="#mito">Mitochondrial genes</a>', mito, plot_mito,
# ) %>% kable(escape = TRUE)

194 195 196 197 198



#----------------------------------------------------------

199
#'
200
#'<br><br>
201
#'
202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219
#' ## Sequencing saturation
#' The following saturation plot of the invidivudal cells is based on the sequencing depth of each cell as well as 6 higher
#' and lower simulated sequencing depths. For this plot all features with a counts > 0 were taken into account. Due to memory usage
#' only a subset of 500 cells will be used to indicate sequencing saturation of the data set.
dat <- readData(data = counts(sce)[,1:500], factors = colData(sce)[1:500,])
mysaturation = dat(dat, k = 0, ndepth = 6, type = "saturation")
sat_data <- dat2save(mysaturation)


do.call(rbind,lapply(names(sat_data), function(x){
    data <- sat_data[[x]] %>% as.data.frame()
    data.frame(cell = x, depth = data$depth, global = data$global)
})) %>% ggplot(aes(depth/10^6, global, group = cell, alpha = 0.8)) +
    geom_point() +
    geom_line() +
    xlab("sequencing depth (million reads)") +
    ylab("Number of detected features") +
    ggtitle(paste("Sequencing saturation of 500 out of", nrow(colData(sce)), "cells", sep = " ")) +
220
    theme(legend.position="none")
221 222


223
#'
224
#' <br><br>
225
#'
226 227 228 229 230 231 232 233 234
#' ## Average counts per feature
#' The following plot visualizes the log10 of the average counts per feature which are observed across all samples. Values with a log10(average count) >= 0 represent average counts of >= 1.
scater::calcAverage(sce) %>%
    as.data.frame() %>% `colnames<-`(c("average")) %>% mutate(ensembl_gene_id = rownames(.)) %>%
        ggplot(aes(log10(average))) + geom_histogram(col = "white", binwidth = 0.1, fill = "grey") + ylab("frequency") + xlab(expression(paste(log[10], " average count"))) +
            # labs(title = x) +
            theme(plot.title = element_text(size = 10, face = "bold")) +
            geom_vline(xintercept = 0, color = "red", linetype=2)

235
#'
236
#' <br><br>
237
#'
238 239 240 241
#' ## 50 most highly abundant features
#' Percentage of total counts assigned to the top 50 most highly-abundant features. Each bar represents the percentage count of the individual feature (i.e. gene, spike-in)
#' in a single cell, coloured by the total number of features in that cell.
scater::plotQC(sce, type = "highest-expression", exprs_values = "counts")
242

243

244 245 246
top50 <- rowData(sce) %>% as.data.frame() %>% transmute(gene_id = rownames(.), total_counts) %>% arrange(desc(total_counts)) %>% slice(1:50)
gene_info %>% right_join(top50, by = c("ensembl_gene_id" = "gene_id")) %>% arrange(desc(total_counts)) %>% transmute(gene_id = ensembl_gene_id, gene_name = external_gene_name, chr = chromosome_name, start = start_position, stop = end_position, description) %>% table_browser()

247 248

#' <br><br>
249
#'
250 251 252 253
#' ## Frequency of expression
#' The number of cells with non-zero expression and the mean expression should be positively correlated
scater::plotExprsFreqVsMean(sce)

254 255 256


#' <br><br>
257
#'
258 259 260
#' ## Check for outliers based on quality control metrices
#' Quality control metrices include the number of total features, total counts, log10_total_features, pct_counts_top_50_features, pct_counts_top_100_features, pct_counts_top_200_features as well as pct_counts_top_500_features.
# PCA <- colData(sce) %>% as.data.frame() %>% select(total_features, log10_total_features, log10_total_counts, pct_counts_top_50_features, pct_counts_top_100_features, pct_counts_top_200_features, pct_counts_top_500_features) %>% as.matrix() %>% prcomp()
261

262 263 264 265 266 267 268 269
sce <- scater::runPCA(sce, use_coldata = TRUE, detect_outliers = TRUE)
scater::plotReducedDim(sce, use_dimred="PCA_coldata")
print("Number of outliers detected")
summary(sce$outlier)


#  log-normalization
sce <- normalize(sce)
270 271 272


#' <br><br>
273
#'
274
#' ## Cell cycle phase
275 276 277
#' Cells are classified as being (i) in G1 phase if the G1 score is above 0.5 and greater than the G2/M score, (ii) in G2/M phase if the G2/M score is above 0.5 and greater than the G1 score and (iii) in S phase if neither score is above 0.5.
#' A file with the suffix '_cell_cycle_phases.txt' will be saved and can be subsequently used to either filter the cells or correct for differences in cell cycle phases as a batch effect.

278 279
# So far only mouse and human cycle markers are available as standard data set for checking the cell cycle phase.
# However, scran::sandbag() can be used to train a classifier for cell cylce phases in other organisms.
280
#https://www.bioconductor.org/packages/devel/bioc/vignettes/AnnotationDbi/inst/doc/IntroToAnnotationPackages.pdf
281 282


283 284 285 286 287
if (db == "org.Mm.eg.db"){
    cc.pairs <- readRDS(system.file("exdata", "mouse_cycle_markers.rds", package="scran"))
} else if (db == "org.Hs.eg.db") {
    cc.pairs <- readRDS(system.file("exdata", "human_cycle_markers.rds", package="scran"))
} else {
288
    print("No cell cycle marker information found (if human or mouse data analyzed please make sure to provide ensembl gene ids)")
289 290
}

291
# phase <- data.frame(sample = character())
292
if (exists("cc.pairs")) {
293 294 295
    # system.time( assignments <- cyclone(sce, pairs = cc.pairs, BPPARAM=MulticoreParam(10)) )
    # system.time( assignments <- cyclone(sce, pairs = cc.pairs) )
    assignments <- cyclone(sce, pairs = cc.pairs, BPPARAM=MulticoreParam(10))
296
    data <- cbind(assignments$scores, data.frame(cell_id = rownames(colData(sce)), phase = assignments$phases))
297 298 299 300 301

    sce$phase <- data$phase

    # export cell cycle data with its actual data...
    data %>% write_tsv(paste(prefix, "cell_cycle_phases.txt", sep = ""))
302 303
    # ...and only the phase as additional batch effect in the cell_infos file
    cell_infos %>%
304
        left_join(data, by = "cell_id") %>%
305
        select(-G1, -S, -G2M) %>%
306
        write_tsv(paste(prefix, "basic_cell_infos_incl_ccp.txt", sep = ""))
307 308 309 310 311 312 313 314 315 316

    # plot data
    cc_plot <- ggplot(data, aes(G1, G2M, color = phase)) +
        geom_point(alpha = 0.3) +
        xlab("G1 score") +
        ylab("G2/M score") +
        coord_fixed(ratio = 1)

    print(cc_plot)
    datatable(data.frame(G1 = length(sce$phase[sce$phase == "G1"]), S = length(sce$phase[sce$phase == "S"]), G2M = length(sce$phase[sce$phase == "G2M"])))
317 318

    cell_infos %<>% left_join(data %>% select(cell_id, phase))
319 320
}

321

322 323 324 325 326
#' <br><br>
#'
#' ## Relationship of experimental factors and expression
scater::plotExplanatoryVariables(sce, variables = c(colnames(cell_infos)))

327 328 329 330 331 332 333

colData(sce) %>% as.data.frame() %>% write_tsv("scater_quality_metrics.txt")


#-----------------------------------------------------------------------------------------------------------------------
# get R version and package infos
writeLines(capture.output(devtools::session_info()), ".sessionInfo.txt")
334

335
session::save.session(paste(".", prefix, "sc_quality_check.dat", sep = ""))
336
#session::restore.session(".quality_check_filtered_genes.dat")