#!/usr/bin/env Rscript

# Load docopt for command-line parsing. library() errors immediately if the
# package is missing (require() would only return FALSE and fail later).
suppressMessages(library(docopt))

# commandArgs = function(x) c("star_counts_matrix.txt")

doc = '
Convert a count matrix into TPM and FPKM format

Usage: dge_norm_counts.R [options] <count_matrix_tsv>

Options
--gene_info <info_file>         Gene information file downloaded from the ensembl website if bioMart version is not present
'

# FIX: the usage pattern previously lacked [options], so docopt rejected any
# invocation that actually supplied --gene_info.
opts = docopt(doc, commandArgs(TRUE)) ## does not work when spining


18
devtools::source_url("https://git.mpi-cbg.de/bioinfo/datautils/raw/v1.46/R/core_commons.R")
19
devtools::source_url("https://git.mpi-cbg.de/bioinfo/ngs_tools/raw/v8/dge_workflow/diffex_commons.R")
20 21


# Optional gene annotation table; validate it up front so we fail fast.
# When it is absent, gene info is fetched via biomaRt further below.
gene_info_file = opts$gene_info
assert(is.null(gene_info_file) || file.exists(gene_info_file), "invalid gene_info_file")

# Raw count matrix: must contain an ensembl_gene_id column (used throughout
# below), plus one count column per replicate/sample.
counts = read_tsv(opts$count_matrix_tsv)



## fetch gene length info

Holger Brandl's avatar
Holger Brandl committed
31
resultsBase = basename(opts$count_matrix_tsv) %>% trim_ext("txt")
32

if (! is.null(gene_info_file)) {
    # User-provided annotation table; its first column is assumed to be the
    # Ensembl gene id and is renamed accordingly.
    geneInfo = read_tsv(gene_info_file)
    colnames(geneInfo)[1] = "ensembl_gene_id"
} else {
    # No annotation file given: query an archived Ensembl biomaRt release for
    # gene coordinates. The query is wrapped in quote() so that cache_it can
    # evaluate it once and serve a cached result on later runs.
    ens_build = "aug2017"

    biomart_query = quote({
        mart = biomaRt::useMart(
            "ENSEMBL_MART_ENSEMBL",
            dataset = guess_mart(counts$ensembl_gene_id),
            host = paste0(ens_build, ".archive.ensembl.org"),
            path = "/biomart/martservice",
            archive = FALSE
        )

        biomaRt::getBM(
            c("ensembl_gene_id", "external_gene_name", "start_position", "end_position"),
            mart = mart
        ) %>% tbl_df
    })

    geneInfo = cache_it(biomart_query, paste0("geneInfo_", ens_build))
}

# Gene length proxy: genomic span (end - start). NOTE(review): this span
# includes introns, so the FPKM/TPM values below are span-normalized rather
# than exon-length normalized — confirm this is intended.
geneLength = geneInfo %>%
    transmute(external_gene_name, ensembl_gene_id, gene_length = end_position - start_position)


# Reshape the counts into long format: one row per (gene, replicate).
cLong = counts %>% gather(replicate, num_alignments, - ensembl_gene_id)

# Why not left_join? Genes without annotation cannot be normalized (their
# length is missing) and a left join would leave NA rows in the result
# tables; inner_join drops them instead.
# cLong %<>% left_join(geneLength)
cLong = inner_join(cLong, geneLength)

# Sanity check (printed, not saved): how many distinct genes still lack a
# length after the join?
cLong %>%
    distinct_all(ensembl_gene_id) %>%
    count(is.na(gene_length))

########################################################################################################################
#' ## FPKMS

# FPKM = 1e9 * reads / (library_size * gene_length_bp), per replicate.
# NOTE(review): lib_size is summed after the annotation inner_join, so
# alignments to genes without length info do not count towards the library
# size — confirm this is intended.
fpkms = cLong %>%
    group_by(replicate) %>%
    mutate(
        lib_size = sum(num_alignments),
        fpkm = round((1E9 / lib_size) * (num_alignments / gene_length), digits = 3)
    )


# write_tsv(fpkms, paste0(resultsBase, "fpkm_long.txt"))

# One row per gene, one fpkm column per replicate.
fpkms %>%
    ungroup() %>%
    transmute(ensembl_gene_id, replicate, fpkm) %>%
    spread(replicate, fpkm) %>%
    write_tsv(paste0(resultsBase, "fpkms_wide.txt"))


########################################################################################################################
#' ## TPMS

# TPM: divide each count by its gene length, then rescale per replicate so
# the length-normalized counts sum to one million.
tpms = cLong %>%
    mutate(length_norm_count = num_alignments / gene_length) %>%
    group_by(replicate) %>%
    mutate(
        length_norm_sum = sum(length_norm_count, na.rm = TRUE),
        tpm = round(length_norm_count * (1E6 / length_norm_sum), digits = 3)
    ) %>%
    select(ensembl_gene_id, replicate, external_gene_name, tpm)


# Sanity check (printed): tpms is still grouped by replicate, so this yields
# one total per replicate — each should be ~1e6 by construction.
tpms %>% summarize(total = sum(tpm, na.rm = TRUE))

# write_tsv(tpms, paste0(resultsBase, "tpms_long.txt"))

# One row per gene, one tpm column per replicate.
tpms %>%
    ungroup() %>%
    transmute(ensembl_gene_id, replicate, tpm) %>%
    spread(replicate, tpm) %>%
    write_tsv(paste0(resultsBase, "tpms_wide.txt"))


########################################################################################################################
#' ## Size Factors

# from https://github.com/mgonzalezporta/TeachingMaterial/blob/master/doc/25.normalising.md
# library("Biobase")
load_pack("DESeq2")
# library("pasilla")

# DESeq2 expects a plain matrix with gene ids as rownames; convert the count
# tibble accordingly. The id column name is read from the input rather than
# hard-coded.
idCol = names(counts)[1] #  expose as parameter

count_df = as_df(counts)
count_df = column_to_rownames(count_df, idCol)
countMatrix = as.matrix(count_df)


# load the experimental design
# Single pseudo-condition for every sample: the DESeqDataSet is only used to
# derive size factors, no contrasts are fitted (design = ~ 1, intercept only).
# FIX: data_frame() is deprecated in favour of tibble().
colData = tibble(condition = rep("condition", ncol(countMatrix)))


# create an object of class DESeqDataSet, which is the data container used by DESeq2
dds = DESeqDataSetFromMatrix(countData = countMatrix, colData = colData, design = ~ 1)
# colData(dds)$condition=factor(colData(dds)$condition, levels=c("untreated","treated"))
# dds

# the DESeqDataSet class is a container for the information we just provided
# head(counts(dds))
# colData(dds)
# design(dds)

# DESeq2's default (median-of-ratios) size factors; printed for the log.
dds = estimateSizeFactors(dds)
sizeFactors(dds)


# Size-factor-normalized counts, one column per sample. The id column is
# restored under its original name (idCol), consistent with the vst output
# below instead of a hard-coded "ensembl_gene_id".
counts(dds, normalized = TRUE) %>%
    as_df %>%
    rownames_to_column(idCol) %>%
    tbl_df %>%
    write_tsv(paste0(resultsBase, "sizenorm_wide.txt"))


# Regularized log transformation for clustering/heatmaps, etc
rld = rlogTransformation(dds)
# varstab = vst(dds) ## vst is supposed to be much 1k x faster

# NOTE(review): the values written here are rlog-transformed, yet the file
# is named vst_wide.txt — confirm downstream consumers expect this.
rlog_df = as_df(assay(rld))
rlog_df = rownames_to_column(rlog_df, idCol)
write_tsv(tbl_df(rlog_df), paste0(resultsBase, "vst_wide.txt"))