comp_star_kallisto.R 8.04 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
#!/usr/bin/env Rscript
#+ include=FALSE

#*****************************************
#' # Comparison of STAR and kallisto results
#*****************************************
# https://scilifelab.github.io/courses/rnaseq/labs/kallisto

suppressMessages(require(docopt))

doc = '
Compare STAR and kallisto results
Usage: comp_star_kallisto.R <star_counts_matrix> <kallisto_results_counts>
'

16
# commandArgs <- function(x) c("../star_counts_matrix.txt", "kallisto_results_counts.txt")
17 18 19 20 21
opts = docopt(doc, commandArgs(TRUE))



# LOAD packages -------------------------------------------------------------------------------
22 23
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.43/R/core_commons.R")
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.40/R/bio/diffex_commons.R")
24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87
install_package("biomaRt")
load_pack(knitr)

# HANDLE input data -------------------------------------------------------------------------------
star_file <- opts$star_counts_matrix
kalli_file <- opts$kallisto_results

star <- read_tsv(star_file)
# head(star_data)
kalli <- read_tsv(kalli_file)
# head(kalli_data)


# FUNCTIONS ------------------------------------------------------------------------------------




# PREPARE -------------------------------------------------------------------------------------

# get gene information
ensembl_dataset = guess_mart(star$ensembl_gene_id)

geneInfo = quote({
    mart = biomaRt::useMart("ENSEMBL_MART_ENSEMBL", dataset = ensembl_dataset, host = "aug2017.archive.ensembl.org", path = "/biomart/martservice", archive = FALSE)
    c("ensembl_gene_id", "external_gene_name", "description", "chromosome_name", "start_position", "end_position") %>%
        biomaRt::getBM(mart = mart) %>%
        tbl_df
}) %>% cache_it("geneInfo")

# geneLengths = transmute(geneInfo, ensembl_gene_id, gene_length = end_position - start_position)
# geneDescs = transmute(geneInfo, ensembl_gene_id, gene_name = external_gene_name, gene_description = description)


# convert STAR count data to TPMs:
# star_data %<>%
#     as.data.frame %>%
#     gather(replicate, num_tags, - ensembl_gene_id) %>%
#     tbl_df %>%
#     left_join(geneLengths) %>%
#     mutate(length_norm_count = num_tags / gene_length) %>%
#     group_by(replicate) %>%
#     mutate(length_norm_sum = sum(length_norm_count, na.rm = T), tpm = (length_norm_count * (1E6 / length_norm_sum)) %>% round(digits = 3)) %>%
#     ungroup() %>%
#     select(ensembl_gene_id, replicate, tpm) %>%
#     na.omit()
# head(star_data)


star_data <- star %>%
    gather(replicate, star_count, -ensembl_gene_id)

# select kallisto data:
kalli_data <- kalli %>% transmute(ensembl_gene_id, replicate, kalli_count = count_by_gene) %>%
    group_by(replicate) %>%
    filter(!duplicated(ensembl_gene_id)) %>%
    ungroup()

combined <- star_data %>%
    left_join(kalli_data)
# head(combined)

# GET STARTED ---------------------------------------------------------------------------------

88
#'<br>
89
# information on gene/isoform level
90 91
#' ## Summary of genes and isoforms
#'<br>
92 93 94 95 96 97 98 99 100 101
#' **Number of total genes/isoforms that were taken into account by the two different mapping processes**
comp_data <- tribble(
~"aligner", ~"genes", ~"isoforms",
"STAR", length(unique(star$ensembl_gene_id)), "-",
"kallisto", length(unique(kalli$ensembl_gene_id)), length(unique(kalli$ensembl_transcript_id))
)

comp_data %>% kable()

# information on replicates and counts
102
#' **Number of genes with counts > 0 predicted by the two mapping tools STAR and kallisto**
103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
replicates <- colnames(star[-1])

rep_results <- data.frame(sample = character(0), "star" = numeric(0), "kallisto" = numeric(0))
for (rep in replicates) {
    rep_results  <- rbind(rep_results, data.frame(sample = rep,
        "star" = star[rep] %>%
            filter(star[rep]>0) %>%
            nrow(),
        "kallisto" = kalli %>%
            filter(grepl(rep, replicate)) %>%
            filter(count_by_gene > 0) %>%
            filter(!duplicated(ensembl_gene_id)) %>%
            nrow()
    ))
}

rep_results %>% kable()

121
#'<br><br>
122 123 124 125 126 127 128 129 130 131 132 133
combined %>%
    gather("aligner", "count", -c(ensembl_gene_id, replicate)) %>%
    mutate(aligner = case_when(aligner == "kalli_count"~"kallisto", aligner == "star_count"~"star")) %>%
    mutate(is_count = count > 0) %>%
    group_by(replicate, aligner) %>%
    ggplot(aes(replicate, as.numeric(is_count), fill = aligner)) +
    stat_summary(fun.y = sum, geom = "bar", position = "dodge") +
    ylab("number of counts > 0") +
    xlab("") +
    ggtitle("Number of genes with counts > 0")


134 135 136
#'<br><br>
#' ## Venn diagrams of STAR and kallisto data
#' Number of genes with counts > 0 which were either reported by both or only one of the two mapping tools
137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170
k <- kalli %>% transmute(ensembl_gene_id, replicate, count = count_by_gene) %>% filter(count > 0) %>% group_by(replicate) %>% filter(!duplicated(ensembl_gene_id))
s <- star %>% gather("replicate", "count", -ensembl_gene_id) %>% filter(count > 0)
load_pack(VennDiagram)

knested = k %>% group_by(replicate) %>% nest %>% ungroup
snested = s %>% group_by(replicate) %>% nest %>% ungroup

# par(mfrow=c(1,3))
inner_join(knested, snested, by = "replicate", suffix=c(".kallisto", ".star")) %>% group_by(replicate) %>% do(with(.,{
    constrastData = .
    overlap_plot = venn.diagram(list(kallisto=constrastData$data.kallisto[[1]]$ensembl_gene_id, star=constrastData$data.star[[1]]$ensembl_gene_id), NULL, fill=c("red","blue"), main=constrastData$replicate)
    grid.newpage(); grid.draw(overlap_plot)
    data.frame()
}))


# combined %>%
#     group_by(ensembl_gene_id) %>%
#     ggplot(aes(log2(tpm), log2(tpm_by_gene))) +
#         geom_point() +
#         xlab("log2(tpms - STAR)") +
#         ylab("log2(tpms - kallisto)")
#
#
# combined %>% filter(tpm >= 1 & tpm_by_gene >= 1) %>%
#     arrange(isoforms_num) %>%
#     group_by(ensembl_gene_id) %>%
#     ggplot(aes(log2(tpm), log2(tpm_by_gene), color = isoforms_num)) +
#     geom_point() +
#     xlab("log2(tpms - STAR)") +
#     ylab("log2(tpms - kallisto)") +
#     geom_smooth()


171 172 173 174
#'<br><br>
#' ## Correlation of STAR and kallisto data
#' The visualized data are based on the log2(counts) of genes common two the output of both STAR and kallisto.
#' Based on the ratio of STAR and kallisto counts per gene, the plot distinguish the 1 % of most divergent genes (blue) from the rest (red).
175 176 177 178
# https://stackoverflow.com/questions/4666590/remove-outliers-from-correlation-coefficient-calculation
cor <- combined %>%
    na.omit() %>%
    mutate(log2_star_count = log2(star_count + 1), log2_kalli_count = log2(kalli_count + 1)) %>%
179
    mutate(log2_count_diff = abs(log2_kalli_count - log2_star_count), is_divergent = log2_count_diff > quantile(abs(log2_kalli_count - log2_star_count), 0.99)) %>%
180 181 182 183 184 185 186 187 188 189 190 191
    left_join(geneInfo)
# head(cor)

# diff_freq <- cor %>% filter(replicate == "mmus_mbRG_rep1") %>% mutate(count_diff = round(count_diff)) %>% group_by(count_diff) %>% summarise(freq = n())
#     ggplot(diff_freq, aes(log10(count_diff), log10(freq))) + geom_point()

cor %>%
    ggplot(aes(log2_kalli_count, log2_star_count, color = is_divergent), alpha=0.2) +
    # ggplot(aes(star_count, kalli_count, color = is_divergent), alpha=0.2) +
    geom_point() +
    xlab("log2(count) - STAR") +
    ylab("log2(count) - kallisto") +
192
    coord_fixed()
193 194 195

# cor %>% mutate(log2_count_diff = round(log2_count_diff)) %>% group_by(log2_count_diff) %>% summarise(freq = n()) %>% ggplot(aes(log2_count_diff, log2(freq))) + geom_bar(stat = "identity", position = "dodge")

196 197
#'<br><br>
#' **The following table contains the 1 percent of genes with highest divergence between star and kallisto count data (see plot)**
198 199
cor %>% filter(is_divergent) %>% arrange(desc(log2_count_diff)) %>% select(-start_position, -end_position, -is_divergent) %>% datatable()

200 201
# export table of compared data
cor %>% write_tsv("comp_star_kalli_data.txt")
202 203 204 205 206

#-----------------------------------------------------------------------------------------------------------------------
session::save.session(".comp_star_kallisto.dat")
# session::restore.session(".collect_kallisto_data.dat");
#-----------------------------------------------------------------------------------------------------------------------