Commit 173058d5 authored by Holger Brandl's avatar Holger Brandl

added protocols

parent eeeca9dc
## Data processing scripts of Lehman et al. 2018
# Supplementary workflows details for Lehman et al. 2018
This repo contains scripts used to build the paper:
**Mutations in the Drosophila splicing regulator Prp31 as a model for Retinitis pigmentosa 11**<br>
Malte Lehmann, Sarita Hebbar, Holger Brandl, Weihua Leng, Naharajan Lakshmanaperumal, Sylke Winkler, Elisabeth Knust<br>
bioRxiv 147918; doi: https://doi.org/10.1
**TODO**
For an overall overview, please refer to the Materials & Methods section of our paper.
We share these computational protocols in the spirit of open data and reproducible research. So feel welcome to provide comments, report errors, or suggest improvements.
## Contained Workflows
1. Differential Gene Expression Analysis
See [dge_star_prp31.sh](./gene_diffex/dge_star_prp31.sh) for the master script.
2. Intron Retention
See [dexseq_diff_retention.sh](./intron_retention/dexseq_diff_retention.sh) for the master scripts.
3. SRA submission data preparation
See [prp31_geo_submission.sh](./prp31_geo_submission.sh).
## Usage & Disclaimer
The scripts are provided as is, without the intention that an interested reader will able to run them directly. We share our protocol here, not a ready-to-use tool. Still, we think they should provide enough technical detail to allow replicating our analysis.
Not detailed out in this repo are the steps to prepare a linux environment to include the listed tools and dependencies. Required tools include recent versions of R, python, samtools, STAR, bedtools, some ucsc tools, as well as:
* https://git.mpi-cbg.de/bioinfo/ngs_tools
* https://github.com/holgerbrandl/joblist
* https://github.com/holgerbrandl/kscript
## Support
Feel welcome to contact [us](mailto:bioinformatics@mpi-cbg.de) or any of the other authors listed in the original publicationon in case you have any questions.
export project="malte_rnaseq"
export PRJ_DATA=/Volumes/prp31mrna/data
export BIO_BIN_BASE="/projects/bioinfo/holger/bin"
export NGS_TOOLS="/Volumes/furiosa/bioinfo/scripts/ngs_tools/dev"
export PRJ_SCRIPTS="${PRJ_DATA}/../scripts"
cd "${PRJ_DATA}" || exit 1
source ${NGS_TOOLS}/dge_workflow/dge_utils.sh
export PATH=${NGS_TOOLS}/dge_workflow:$PATH
ls ${IGENOME} || { echo "igennome variable is not correctly set" 1>&2; exit 1; }
cd ${PRJ_DATA} || { echo "project directory '$PRJ_DATA' does not exist" 1>&2; exit 1; }
########################################################################################################################
### Fetch the data
mcdir ${PRJ_DATA}/originals
### Basic QC
dge_fastqc $(ls *fastq.gz) &
########################################################################################################################
### Apply renaming and merge lane replicates
#control (w-total_RNA_1)
#knockdown (P18E3_1H_tRNA_1).
mcdir ${PRJ_DATA}/originals/renamed
echo '
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.39/R/core_commons.R")
sheetFile <- "../maltel-FC_SN678_271-2014-7-30.xls"
sampleSheet <- read_excel(sheetFile, "Fastqfiles") %>%
select(File, SampleName) %>%
transmute(
File,
replicate=SampleName %>% str_replace_all("w-", "w_"),
sample = str_replace(replicate, "_[0-9]*$", "")
)
write_tsv(sampleSheet, path="renaming_scheme.txt")
#require(ggplot2)
#ggplot(sampleSheet, aes(sample)) + geom_bar() + coord_flip()
sampleSheet %>% rowwise %>% mutate(
zcat=paste("ln -s ", paste0("../", File), paste0(replicate, ".fastq.gz"))
) %$%
zcat %>%
write_lines("lane_merge.cmd")
sampleSheet %>% distinct(replicate, sample) %>%
arrange(sample, replicate) %>%
write_tsv(path="basic_design.txt")
' | R --vanilla -q
cat lane_merge.cmd | while read line; do
eval ${line}
done
zcat *fastq.gz | head
########################################################################################################################
### Alignment the reads
mcdir ${PRJ_DATA}/alignments
star_align.kts ${IGENOME} $(ls ${PRJ_DATA}/originals/renamed/*.fastq.gz) 2>&1 | tee star_algin.log
mailme "$project: mapping done"
## also extract a reverse-directed count-matrix
echo '
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.39/R/core_commons.R")
exprCounts <- list.files(".", "ReadsPerGene.out.tab") %>% map_df(function(countFile){
read.delim(countFile, header=F) %>%
select(V1, V4) %>%
set_names("ensembl_gene_id", "num_alignments") %>%
filter(!str_detect(ensembl_gene_id, "^N_")) %>%
mutate(sample=trim_ext(countFile, ".ReadsPerGene.out.tab"))
})
countMatrix = spread(exprCounts, sample, num_alignments)
write_tsv(countMatrix, "star_counts_matrix_revstranded.txt")
' | R --vanilla -q
rendr_snippet "count_correlation" echo '
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.39/R/core_commons.R")
exprCounts <- list.files(".", "ReadsPerGene.out.tab") %>% map_df(function(countFile){
read.delim(countFile, header=F) %>%
set_names("ensembl_gene_id", "unstranded", "forward", "reverse") %>%
filter(!str_detect(ensembl_gene_id, "^N_")) %>%
mutate(sample=trim_ext(countFile, ".ReadsPerGene.out.tab"))
})
exprCounts %>% ggplot(aes(unstranded - reverse)) + geom_histogram()+ scale_y_log10()
countMatrix = spread(exprCounts, sample, stranded)
write_tsv(countMatrix, "star_counts_matrix_revstranded.txt")
'
head star_counts_matrix_revstranded.txt
#grep -F FBgn0002936 star_counts_matrix_revstranded.txt
#grep -F FBgn0031292 star_counts_matrix_revstranded.txt
########################################################################################################################
### Differential Expression Analysis
mcdir ${PRJ_DATA}/dge_analysis
# use protein coding genes only
## create a filtered gtf containing only protein coding ccds transcripts for quantification
echo '
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.39/R/core_commons.R")
mart <- biomaRt::useMart("ENSEMBL_MART_ENSEMBL", dataset = "dmelanogaster_gene_ensembl", host = paste0("mar2017.archive.ensembl.org"), path = "/biomart/martservice", archive = FALSE)
#ccdsTx <- getBM(attributes=c("ensembl_gene_id", "ensembl_transcript_id", "gene_biotype", "transcript_biotype"), filters=c("with_ccds"), values=TRUE, mart=mart)
pcGenes <- c("ensembl_gene_id", "gene_biotype") %>%
biomaRt::getBM(mart=mart) %>%
filter(gene_biotype=="protein_coding")
pcGenes %$% write_lines(ensembl_gene_id, path="dmel_ensv88_pc_genes.txt")
' | R --no-save --no-restore
## filter the count matrix for protein coding genes
csvgrep -t -c ensembl_gene_id -f dmel_ensv88_pc_genes.txt ${PRJ_DATA}/alignments/star_counts_matrix_revstranded.txt | csvformat -T > star_counts_matrix.pc.txt
wc -l ${PRJ_DATA}/alignments/star_counts_matrix.txt star_counts_matrix.pc.txt
## define contrasts of interest
echo "
sample_1, sample_2
P18E3_1H_tRNA,w_total_RNA
" | trim | csvformat -T > contrasts.txt
rend.R -e ${NGS_TOOLS}/dge_workflow/featcounts_deseq_mf.R --contrasts contrasts.txt star_counts_matrix.pc.txt ${PRJ_DATA}/originals/renamed/basic_design.txt 2>&1 | tee diffex_analysis.log
mkdir enrichment_analysis_directed
rend.R -e ${NGS_TOOLS}/common/cp_enrichment.R --qcutoff 0.05 --overlay_expr_data ../plot_score_matrix.txt ../degs_by_contrast_directed.txt contrast
#!/usr/bin/env Rscript
#' Gff flattening http://seqanswers.com/forums/showthread.php?t=42420
#'
#' Here's an example script I wrote in R that takes an annotation GFF file prepared by DEXSeq and adds "intronic_part"
#' records with associated gene_ids. You'll find both the original exonic bins and the newer intronic bins in the resulting
#' file (you'd need to change the file names), which you may or may not want. You could also modify this script to make the
#' intronic bins "exonic bins" and just look at exons and introns at once (I don't know how well that'd work) or remove the
#' old exonic bins and simply label the intronic_parts exonic_parts, which would probably make your life easier in DEXSeq
#' (I just modified DEXSeq). For the latter, just change the "asGFF2()" function.
if(length(commandArgs(TRUE)) != 2){
print("Usage: add_introns_to_gtf.R <inputGFF> <outputGFF>"); stop()
}
library(GenomicRanges)
library(rtracklayer)
library(parallel)
gffInputFile = commandArgs(TRUE)[1]
gffOutputFile = commandArgs(TRUE)[2]
# gffInput = "/local2/igenomes/Mus_Musculus/Ensembl_v81/GRCm38/Annotation/Genes/genes.gtf"
# gffOutput = "Mus_musculus.GRCm38.71.DEXSeq.introns.gff"
gff <- import.gff2(gffInputFile)
# gff <- import.gff2("Mus_musculus.GRCm38.71.DEXSeq.gff", asRangedData=F)
#Add a level for "intronic_part"
elementMetadata(gff)$type <- factor(elementMetadata(gff)$type, levels=c(levels(elementMetadata(gff)$type), "intronic_part"))
#Fix the exonic_part_number to be a properly formatted character
names(elementMetadata(gff))
USE <- which(!is.na(elementMetadata(gff)$exonic_part_number))
exonic_parts <- sprintf("%03i", as.integer(elementMetadata(gff)$exonic_part_number[USE]))
elementMetadata(gff)$exonic_part_number <- as.character(elementMetadata(gff)$exonic_part_number)
elementMetadata(gff)$exonic_part_number[USE] <- exonic_parts
#Split by gene_id
grl <- split(gff, elementMetadata(gff)$gene_id)
#Add the introns as "intronic_parts
add_introns <- function(gr) {
exons <- gr[which(elementMetadata(gr)$type=="exonic_part"),]
if(length(exons) > 1) {
seqname <- seqnames(exons)[-1]
starts <- end(exons)+1
starts <- starts[-length(starts)]
ends <- start(exons)-1
ends <- ends[-1]
bounds <- IRanges(start=starts, end=ends)
strand <- strand(exons)[-1]
introns <- GRanges(seqnames=seqname, ranges=bounds, strand=strand)
intron_ids <- sprintf("%03i", c(1:length(introns)))
#Remove 0-width introns
DISCARD <- which(width(introns) <= 0)
if(length(DISCARD) > 0) {
introns <- introns[-DISCARD]
intron_ids <- intron_ids[-DISCARD] #Set intron numbers so they follow their respective exonic parts
}
if(length(introns) > 0) {
#create the meta-data
df <- as.data.frame(elementMetadata(exons))
nrows <- length(introns)
metadf <- df[1:nrows,] #does this need to deal with gene_id and transcripts differently?
metadf <- transform(metadf, gene_id=as.character(gene_id), transcripts=as.character(transcripts))
metadf$transcripts <- as.character(c(rep(NA, nrows)))
metadf$type <- factor(c(rep("intronic_part", nrows)), levels=levels(metadf$type))
metadf$exonic_part_number <- intron_ids
elementMetadata(introns) <- metadf
#Merge the GRanges
gr <- append(gr, introns)
gr <- gr[order(start(gr), elementMetadata(gr)$type),] #resort
}
}
return(gr)
}
with_introns <- endoapply(grl, add_introns)
#reorder things
chroms <- sapply(with_introns, function(x) as.factor(seqnames(x))[1])
starts <- sapply(with_introns, function(x) start(x)[1])
o <- order(chroms, starts)
with_introns2 <- with_introns[o]
##Merge into a GRange
#with_introns2 <- unlist(with_introns2, use.names=F, recursive=T)
#Create GFF formatted output
asGFF2 <- function(x) {
df <- as.data.frame(x)
aggregates <- which(df$type == "aggregate_gene")
meta <- character(nrow(df))
meta[aggregates] <- sprintf("gene_id \"%s\"", df$gene_id[aggregates])
#This gives introns a transcript "NA" field, which may not be ideal
meta[-aggregates] <- sprintf("transcripts \"%s\"; exonic_part_number \"%s\"; gene_id \"%s\"", df$transcripts[-aggregates], df$exonic_part_number[-aggregates], df$gene_id[-aggregates])
paste(df$seqnames, "dexseq_prepare_annotation.py", df$type, df$start, df$end, ".", df$strand, ".", meta, sep="\t")
}
outputGFF <- unlist(lapply(with_introns2, asGFF2))
write.table(outputGFF, file=gffOutputFile, row.names=F, col.names=F, quote=F)
\ No newline at end of file
# screen -R dexseq_introns
# screen -R kallisto_malte
source <(curl https://raw.githubusercontent.com/holgerbrandl/datautils/v1.38/bash/core_utils.sh 2>&1 2>/dev/null)
export PRJ_DATA=/Volumes/prp31mrna/data
export BIO_BIN_BASE="/projects/bioinfo/holger/bin"
export NGS_TOOLS="/Volumes/furiosa/bioinfo/scripts/ngs_tools/dev"
export PRJ_SCRIPTS="${PRJ_DATA}/../scripts"
export PATH=${BIO_BIN_BASE}/bedtools2-2.25.0/bin/:$PATH
mcdir ${PRJ_DATA}/intron_retention_dexseq
########################################################################################################################
## Detect expressed isoforms
## adopted from http://crazyhottommy.blogspot.de/2016/07/comparing-salmon-kalliso-and-star-htseq.html
mcdir ${PRJ_DATA}/intron_retention_dexseq/isoform_abundance
export PATH=${HOME}/bin/kallisto-0.43.1/src:$PATH
#which kallisto
#kallisto --help
## build index
## ftp://ftp.ensembl.org/pub/release-75/fasta/drosophila_melanogaster/cdna/
#wget ftp://ftp.ensembl.org/pub/release-75/fasta/drosophila_melanogaster/cdna/Drosophila_melanogaster.BDGP5.75.cdna.all.fa.gz
wget ftp://ftp.ensembl.org/pub/release-88/fasta/drosophila_melanogaster/cdna/Drosophila_melanogaster.BDGP6.cdna.all.fa.gz
gunzip Drosophila_melanogaster.BDGP6.cdna.all.fa.gz
mv Drosophila_melanogaster.BDGP6.cdna.all.fa Drosophila_melanogaster.BDGP6.ens88.cdna.all.fa
## build kallisto index
ensCdna=Drosophila_melanogaster.BDGP6.ens88.cdna.all.fa
kallisto index -i ${ensCdna}.kalisto.idx ${ensCdna}
## quantify transcript abundance
for fastqFile in $(ls ${PRJ_DATA}/originals/renamed/*fastq.gz); do
#DEBUG fastFile=$(ls ${PRJ_DATA}/originals/*fastq.gz| head -n 1)
jl submit "kallisto quant -t 20 -i ${ensCdna}.kalisto.idx -o $(basename ${fastqFile} .fastq.gz) --single -l 200 -s 20 ${fastqFile}"
done
jl wait --email
rendr_snippet "filter_expresed_tx" <<"EOF"
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.40/R/core_commons.R")
#' isoforms per gene
gene2tx <- quote({
mart <- biomaRt::useMart("ENSEMBL_MART_ENSEMBL", dataset = "dmelanogaster_gene_ensembl", host = paste0("mar2017.archive.ensembl.org"), path = "/biomart/martservice", archive = FALSE)
c("ensembl_gene_id", "ensembl_transcript_id") %>% biomaRt::getBM(mart = mart)
}) %>% cache_it()
reload_dplyr()
gene2tx %<>% tbl_df
#' how many isoforms are there for each gene
isoCounts = gene2tx %>%
count(ensembl_gene_id) %>%
n_as("num_iso")
isoCounts %>%
filter(num_iso < 20) %>%
ggplot(aes(as.factor(num_iso))) + geom_bar()
count(isoCounts, num_iso > 1)
#' load kallisto isoform abundance data
load_pack(tximport)
# files = list.files("isoform_abundance", "abundance.h5", recursive=TRUE, full=TRUE)
# names(files) = map(files, ~basename(dirname(.x)))
# txi.kallisto <- tximport(files, type = "kallisto", txOut = TRUE)
files = list.files("isoform_abundance", "abundance.tsv", recursive = TRUE, full = TRUE)
names(files) = map(files, ~ basename(dirname(.x)))
txi.kallisto <- tximport(files, type = "kallisto", txOut = TRUE)
# head(txi.kallisto$counts)
head(txi.kallisto$abundance)
#ccdsTx <- getBM(attributes=c("ensembl_gene_id", "ensembl_transcript_id", "gene_biotype", "transcript_biotype"), filters=c("with_ccds"), values=TRUE, mart=mart)
txTPM = txi.kallisto$abundance %>%
as_df %>%
rownames_to_column("ensembl_transcript_id") %>%
gather(replicate, tpm, - ensembl_transcript_id) %>%
left_join(gene2tx)
count(txTPM, is.na(ensembl_gene_id))
txTPM %>% ggplot(aes(tpm)) + geom_histogram() + scale_x_log10()
#' from papers3://publication/doi/10.1242/dev.144535:
#' transcripts contributing less than 5% to the total abundance (TPM) of the corresponding gene in all samples were removed from the gtf file (Soneson et al., 2016)
#'
#' Calculate relative abundance
txTPM %<>%
group_by(ensembl_gene_id, replicate) %>%
mutate(gene_abundance = sum(tpm), rel_iso_abundance = tpm / gene_abundance)
glimpse(txTPM)
# filter(txTPM, ensembl_gene_id=="FBgn0000014") %>% print_all
txTPM %>% ungroup %>% count(is.na(rel_iso_abundance))
ggplot(txTPM, aes(rel_iso_abundance)) + geom_histogram()
txPropSummary = txTPM %>% group_by(ensembl_gene_id, ensembl_transcript_id) %>% summarize(more_5perc = sum(rel_iso_abundance > 0.05, na.rm=T))
txPropSummary %>% ggplot(aes(as.factor(more_5perc))) + geom_bar() + ggtitle("isoforms contributing more than 5% to gene expressoin in N samples") + xlab("# num samples")
lowAbundTx = filter(txPropSummary, more_5perc==0)
lowAbundTx %$% write_lines(ensembl_transcript_id, path="low_abundance_tx.txt")
#' check example gene
filter(txTPM, ensembl_transcript_id=="FBtr0083385")
filter(txTPM, ensembl_gene_id=="FBgn0000015") %>% print_all
#' NA values
# filter(txTPM, ensembl_transcript_id=="FBtr0070002")
# filter(txTPM, ensembl_gene_id=="FBgn0000014")
EOF
########################################################################################################################
### Reanalyze retention using dexseq
mcdir ${PRJ_DATA}/intron_retention_dexseq
## for protocol see http://seqanswers.com/forums/showthread.php?t=42420
## the dexseq python scrips require python 2.x to run
#pip install --user numpy
#pip install --user HTSeq
source ${NGS_TOOLS}/common/deepseq_commons.sh
assemblyBase=BDGP6_Ens81
## local
export DEXSEQ_PYSCRIPTS=/Library/Frameworks/R.framework/Versions/3.4/Resources/library/DEXSeq/python_scripts
## keep a copy of the oringal
baseGtf=tx_model/${assemblyBase}.igenome_orignal.gtf
igvsort ${baseGtf}
lowAbundFiltGff=tx_model/BDGP6_Ens81.low_abund_filt.gff
kscript - ${baseGtf} isoform_abundance/low_abundance_tx.txt <<"EOF" > ${lowAbundFiltGff}
//DEPS com.github.holgerbrandl:kscript:1.2.1
//DEPS de.mpicbg.scicomp:kutils:0.8-SNAPSHOT
import de.mpicbg.scicomp.bioinfo.readGTF
import kscript.text.linesFrom
import java.io.File
//val args = arrayOf("/Volumes/prp31mrna/data/intron_retention_dexseq/FBgn0012034.gtf", "/Volumes/prp31mrna/data/intron_retention_dexseq/isoform_abundance/low_abundance_tx.txt")
val gtf = readGTF(File(args[0]))
val lowAbundTx = linesFrom(File(args[1])).toList()
gtf.filterNot { record -> lowAbundTx.contains(record.attributes.get("transcript_id")) }.forEach { println(it) }
EOF
wc -l ${baseGtf} ${lowAbundFiltGff}
igvsort ${lowAbundFiltGff}
dexseqGff=${assemblyBase}.dexseq.gff
#idea ${DEXSEQ_PYSCRIPTS}/dexseq_prepare_annotation.py
python ${DEXSEQ_PYSCRIPTS}/dexseq_prepare_annotation.py ${lowAbundFiltGff} ${dexseqGff}
igvsort ${dexseqGff}
## complement to include introns
#chmod +x ${PRJ_SCRIPTS}/intron_retention/add_introns_to_gtf.R
tmpTxModel=tx_model/${assemblyBase}.dexseq_wintrons.gff
${PRJ_SCRIPTS}/intron_retention/add_introns_to_gtf.R ${dexseqGff} ${tmpTxModel}
finalTxModel=tx_model/${assemblyBase}.dexeq_inasex.gff
kscript 'lines.map{ it.replace("NA\"; exonic_part_number \"", "NA\"; exonic_part_number \"i") }.print()' ${tmpTxModel} | sed 's/intronic_part/exonic_part/' > ${finalTxModel}
#head ${assemblyBase}.dexeq_inasex.gff
igvsort ${finalTxModel}
## count reads in all segments
export gffFile=${finalTxModel}
ll $gffFile ${DEXSEQ_PYSCRIPTS}/dexseq_count.py
for bamFile in $(find ${PRJ_DATA}/alignments/ | grep '.bam$'); do
jl submit "python2.7 ${DEXSEQ_PYSCRIPTS}/dexseq_count.py --format bam ${gffFile} ${bamFile} $(basename ${bamFile} .bam).dexseq_counts.txt"
done
#jl reset
jl wait --email --report
rend.R ${PRJ_SCRIPTS}/intron_retention/dexseq_retention.R
mcdir ${PRJ_DATA}/intron_retention_dexseq/term_enrichment
Rscript - <<"EOF"
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.39/R/core_commons.R")
diffexGenes = read_tsv("../diffex_regions.introns.tsv") %>% mutate(contrast="treatment_vs_control")
diffexGenes %<>% filter_count(!is.na(s1_overex))
## annotate results with symbols and stuff
geneInfo = quote({
mart = biomaRt::useMart("ENSEMBL_MART_ENSEMBL", dataset = "dmelanogaster_gene_ensembl", host = "mar2017.archive.ensembl.org", path = "/biomart/martservice", archive = FALSE)
c("ensembl_gene_id", "external_gene_name", "description") %>% biomaRt::getBM(mart = mart) %>% tbl_df
}) %>% cache_it("geneInfo")
enrLists = bind_rows(
transmute(diffexGenes, ensembl_gene_id=groupID, gene_set="treatment vs control"),
transmute(diffexGenes, ensembl_gene_id=groupID, gene_set=paste("treatment", if_else(s1_overex, ">", "<"), "control"))
) %>% distinct_all(ensembl_gene_id, gene_set)
count(enrLists, gene_set)
# enrLists %>% write_tsv("diffex_regions.introns.slim_for_enrichment.tsv")
enrLists %>% left_join(geneInfo) %>% write_tsv("diffex_regions.introns.slim_for_enrichment.tsv")
# https://stackoverflow.com/questions/24652771/finding-the-maximum-absolute-value-whilst-preserving-or-symbol
absmax <- function(x) { x[which.max( abs(x) )][1]}
## also export plot overlay data
read_tsv("../de_data.tsv") %>% transmute(
ensembl_gene_id = groupID,
contrast = "intron_retention_trt_over_ctrl",
plot_score = - log10(pvalue) * ifelse(s1_overex, 1, - 1)
) %>%
group_by(contrast, ensembl_gene_id) %>%
# summarize(max_score=max(plot_score)) %>%
summarize(max_score=absmax(plot_score)) %>%
spread(contrast, max_score) %>%
write_tsv("plot_score_matrix.txt")
EOF
## test for term enrichment
#rend.R -e ${NGS_TOOLS}/common/cp_enrichment.R --qcutoff 0.05 diffex_regions.introns.slim_for_enrichment.tsv gene_set
rend.R -e ${NGS_TOOLS}/common/cp_enrichment.R --qcutoff 0.05 --overlay_expr_data plot_score_matrix.txt diffex_regions.introns.slim_for_enrichment.tsv gene_set
## test for feature enrichment as well
rend.R -e ${NGS_TOOLS}/common/gene_feature_enrichment.R --qcutoff 0.05 diffex_regions.introns.slim_for_enrichment.tsv gene_set
#R --args --qcutoff 0.05 diffex_regions.introns.slim_for_enrichment.tsv gene_set
#' # Intron Retention Analysis
#' Using DEXSeq. See https://stat.ethz.ch/pipermail/bioconductor/2013-February/050728.html or http://seqanswers.com/forums/showthread.php?t=42420
#' Similar protocol was used in Sauerwald, J., Soneson, C., Robinson, M. D., & Luschnig, S. (2017). Faithful mRNA splicing depends on the Prp19 complex subunit faint sausage and is required for tracheal branching morphogenesis in Drosophila. Development, 144(4), 657–663. doi:10.1242/dev.144535
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.40/R/core_commons.R")
load_pack(DEXSeq)
# devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.39/R/core_commons.R")
# load_pack(DEXSeq)
# setwd("/home/lakshman/mnt/mlehmann_rnaseq/mapping/HTSeq_counts")
countFiles = list.files(getwd(), pattern = "_counts.txt$", full.names = TRUE)
# countFiles = list.files("no_low_abundance_filter_counts", pattern = "_counts.txt$", full.names = TRUE)
data_frame(countFiles) %>% mutate()
flattenedFile = "tx_model/BDGP6_Ens81.dexeq_inasex.gff"
# track2replicate = read_tsv("tracks2replicates.txt")
sampleInfo = data.frame(
SampleName = c("P18E3_1H_tRNA_1", "P18E3_1H_tRNA_2", "P18E3_1H_tRNA_3", "w_total_RNA_1", "w_total_RNA_2", "w_total_RNA_3"),
condition = c("knockdown", "knockdown", "knockdown", "control", "control", "control"),
libType = c("single-end", "single-end", "single-end", "single-end", "single-end", "single-end")
)
expLayout = data_frame(countFile = countFiles) %>%
mutate(SampleName = basename(countFile) %>% str_replace_all(".dexseq_counts.txt", "")) %>%
# left_join(track2replicate, by = c("sample" = "File")) %>%
left_join(sampleInfo, by = "SampleName")
# require(tibble)
sampleTable = expLayout %>%
as_df %>%
dplyr::select(SampleName, condition, libType) %>%
column2rownames("SampleName")
dxd = DEXSeqDataSetFromHTSeq(countFiles, sampleData = sampleTable, design = ~ sample + exon + condition : exon, flattenedfile = flattenedFile)
#
# class: DEXSeqDataSet
# dim: 78465 12
# exptData(0):
# assays(1): counts
# rownames(78465): FBgn0000003:E001 FBgn0000008:E001 ... FBgn0264726:E001
# FBgn0264727:E001
# rowData metadata column names(5): featureID groupID exonBaseMean
# exonBaseVar transcripts
# colnames: NULL
# colData names(4): sample condition libType exon
head(counts(dxd), 5)
head(featureCounts(dxd), 5)
head(rowData(dxd), 3)
sampleAnnotation(dxd)
dxd = estimateSizeFactors(dxd)
dxd = estimateDispersions(dxd)
plotDispEsts(dxd)
dxd = testForDEU(dxd)
dxd = estimateExonFoldChanges(dxd, fitExpToVar = "condition")
# dxd_1 = estimateExonFoldChanges(dxd, fitExpToVar = "condition", maxRowsMF = 4000)
deResultsObj = DEXSeqResults(dxd)
# dxr2 = DEXSeqResults(dxd, FDR = 0.1)
#' plot an example
plotDEXSeq(deResultsObj, "FBgn0000008", legend = TRUE, cex.axis = 1.2, cex = 1.3, lwd = 2)
## later
DEXSeqHTML(deResultsObj, genes = "FBgn0000008", FDR = 0.1, color = c("#FF000080", "#0000FF80"))
#' convert to table and annotate results
deResults = deResultsObj %>%
as_tibble() %>%
tibble::rownames_to_column("ensembl_gene_id") %>%
dplyr::rename(s1_over_s2_logfc = log2fold_knockdown_control) %>%
mutate(is_hit = padj < 0.1) %>%
mutate(s1_overex = s1_over_s2_logfc > 0) %>%
mutate(transcripts = map_chr(transcripts, ~paste(.x, collapse=", ")))
deResults %>% str(deResults)
## see http://rpackages.ianhowson.com/bioc/DESeq2/man/results.html when using contrasts argument
#file:///Volumes/prp31mrna/mapping/HTSeq_counts/DEXSeqReport/testForDEU.html
####How are the log2 fold changes calculated
#http://seqanswers.com/forums/archive/index.php/t-27564.html
# setwd("Volumes/prp31mrna/mapping/HTSeq_counts/DEXSeqReport")
# load("DESeq_exon_usage_analysis.RData")
# library(readr)
## annotate results with symbols and stuff
geneInfo = quote({
mart = biomaRt::useMart("ENSEMBL_MART_ENSEMBL", dataset = "dmelanogaster_gene_ensembl", host = "mar2017.archive.ensembl.org", path = "/biomart/martservice", archive = FALSE)
# c("intron_id", "external_gene_name", "description", "chromosome_name", "start_position", "end_position") %>%
c("ensembl_gene_id", "external_gene_name", "description", "chromosome_name", "start_position", "end_position") %>%
biomaRt::getBM(mart = mart) %>%
tbl_df
}) %>% cache_it("geneInfo")
deResults %<>% left_join(geneInfo, by=c("groupID"="ensembl_gene_id"))
deResults %<>% mutate(is_intron=str_detect(featureID, "^Ei"))
diffexRegions = filter(deResults, is_hit) %>% as.data.frame()
write_tsv(diffexRegions, "de_data.tsv")
write_tsv(diffexRegions, "diffex_regions.all.tsv")
diffexRegions %>%
filter(is_intron) %>%
write_tsv("diffex_regions.introns.tsv")
diffexRegions %>%
filter(!is_intron) %>%
write_tsv("diffex_regions.exons.tsv")
#' ## Methods bits from Sauerwald et al. 2017
#'
#' DEXSeq(Anders et al., 2012)was used to investigate the extent of intron retention. The reads were aligned to the
#' Drosophila melanogaster genome (Ensembl v70) with STAR (v2.4.2a) (Dobin et al., 2013), retaining only reads aligning to
#' a unique location. Based on the kallisto results obtained as described above, transcripts contributing less than 5% to
#' the total abundance (TPM) of the corresponding gene in all samples were removed from the gtf file (Soneson et al.,
#' 2016). The reduced gtf file was processed with the python scripts provided with DEXSeq to "flatten" the annotation file
#' by generating disjoint exon bins. Finally, the flattened annotation file was extended with intronic bins, defined as
#' intragenic regions that did not overlap with any exon of the retained transcripts. Using this extended annotation,
#' DEXSeq was used to quantify the abundance of each exonic and intronic bin, and to test each bin for differential
#' inclusion between the control and mutant groups. Only results for intronic bins were retained for interpretation.
#' # visualizaiton