Commit 1a206abb authored by Lena Hersemann's avatar Lena Hersemann

revised section for contaminations removal and added "scrap"-info to the...

revised section for contaminations removal and added "scrap"-info to the "intensities_incl_ctrls.txt" file; added TODO information; cosmetics
parent a0052f9c
......@@ -13,13 +13,14 @@
suppressMessages(require(docopt))
doc = '
Prepare mass spec data for differential analysis
Prepare mass spec data for differential abundance analysis
Usage: ms_data_prep.R [options] <ms_data_folder> <design_file>
Options:
--renaming_scheme <file> Table with two columns with the "old" and "new" names
--data_type <data_type> Which data type (i.e. raw or lfq) to use for the analysis [default: lfq]
'
#TODO: set standard as optional argument (Henrik will provide an extra file which specifies the necessary information)
#commandArgs <- function(x) c("--renaming_scheme", "renaming_scheme.txt", "../provided", "exp_design.txt")
opts = docopt(doc, commandArgs(TRUE))
......@@ -130,7 +131,9 @@ if (data_type == "lfq") {
perBatch <- sapply(names(perBatch), function(x){
data <- as.data.frame(perBatch[x])
colnames(data) %<>% str_replace(., "lfq_", "")
colnames(data) %<>% str_replace(., paste0(x, "."), "")
prot_col <- data %>% select(contains(".protein_ids")) %>% colnames()
sample <- str_replace(prot_col, ".protein_ids", "")
colnames(data) %<>% str_replace(., paste0(sample, "."), "")
data %>% tbl_df()
}, simplify = FALSE,USE.NAMES = TRUE)
......@@ -157,6 +160,15 @@ init_stat <- perBatch %>% map_df(~ mutate(.x,
#' Examples for unique and ambiguous entries
init_stat %>% gather(feature, direction, unique_entry:scrap_prop_ambiguous) %>% filter(direction) %>% group_by(feature) %>% slice(1) %>% select(feature, protein_ids)
# extract information on protein IDs which shall be removed later
scrap_ids <- init_stat %>% gather(feature, direction, unique_entry:scrap_prop_ambiguous) %>% filter(direction) %>% filter(feature == "scrap_prop_unique" | feature == "scrap_prop_ambiguous") %>% select(protein_ids) %>% mutate(single_entries = str_split(protein_ids, ";")) %>% unnest(single_entries) %>%
group_by(protein_ids) %>%
mutate(all_scrap = sum(str_detect(single_entries, "CON__|REV__|RTSTANDARD")) == n()) %>%
ungroup() %>%
select(-single_entries) %>%
distinct_all(protein_ids) %>%
filter(all_scrap) %$% protein_ids
#'
#' <br>
#'
......@@ -229,6 +241,8 @@ init_stat_plot %>% ggplot(aes(intensity, fill = feature, color = feature)) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
theme(panel.background = element_rect(fill = 'white', colour = 'grey'), panel.grid = element_line(color = "gray90"))
#TODO: take the RTSTANDARD intensity as a minimal threshold to mark low-abundant proteinGroups, i.e. groups which have an intensity lower than the RTSTANDARD
#'
#' <br><br>
#'
......@@ -304,7 +318,7 @@ if (!is_empty(pepFiles) & nrow(contr_data) > 0){
# if (any (perBatch %>% map_df(~ filter(.x, is.na(scrap)) %>% count(is_dupl = duplicated(protein_acc)), .id = 'GROUP') %$% is_dupl)) {stop("ATTENTION: one of the samples contains duplicated protein ids")}
perBatch %<>% map(~ mutate(.x, scrap = case_when(str_detect(protein_ids, "CON__")~"CON", str_detect(protein_ids, "REV__")~"REV", str_detect(protein_ids, fixed("ST|RTSTANDARD"))~"STANDARD")))
# perBatch %<>% map(~ mutate(.x, scrap = case_when(str_detect(protein_ids, "CON__")~"CON", str_detect(protein_ids, "REV__")~"REV", str_detect(protein_ids, fixed("ST|RTSTANDARD"))~"STANDARD")))
#'
......@@ -317,13 +331,16 @@ sample_info <- perBatch %>% map_df(~ select(.x, -starts_with("identification_typ
if (any(map(perBatch, ~colnames(.x) %>% str_detect(., "identification_type")) %>% unlist())) {
sample_info <- perBatch %>% map_df(~ select(.x, -starts_with("intensity")), .id = 'GROUP') %>% gather(sample, identification_type, starts_with("identification_type")) %>% mutate(sample = str_replace_all(sample, "identification_type_", "")) %>% mutate(oldName = sample) %>% rename(file_name = GROUP) %>%
# right_join(sample_info, by = c("sample", "file_name", "oldName", "protein_ids", "protein_acc", "scrap", "fasta_headers", "init_groups"))
right_join(sample_info, by = c("sample", "file_name", "oldName", "protein_ids", "old_protein_ids", "scrap", "fasta_headers"))
right_join(sample_info, by = c("sample", "file_name", "oldName", "protein_ids", "old_protein_ids", "fasta_headers"))
ident_types <- TRUE
print("Identification type data were provided")
} else {
print("No identification type information found")
}
#TODO: report statistics on how many MS/MS or by matching were found for each gene for the individual replicates in the script
#TODO: summarize the MS/MS information as numeric value (e.g. percentage of MS/MS in all replicates) and add information to the limma output (consider to export information as html)
# TODO: find faster solution for renaming
sample_info$sample %<>% str_replace_all(., oldNames, newNames)
write_tsv(sample_info, paste0(results_prefix, ".feature_sample_information.txt"))
......@@ -344,6 +361,7 @@ write_tsv(protein_info, paste0(results_prefix, ".feature_information.txt"))
print("Number of ambiguous protein IDs with re-arranged order:")
perBatch %>% map_df(~ count(.x, protein_ids != old_protein_ids) %>% setNames(c("new_order", "count")) %>% spread(new_order, count),.id = 'GROUP') %>% kable()
#TODO: think about exporting some information on which proteinGroups were changed in their order (e.g. a column indicating changed order)
## combine individual files by protein_ids, protein_acc and scrap only
msData = perBatch %>%
......@@ -408,7 +426,6 @@ msData %>% select(protein_ids, contains("intensity")) %>% gather(sample, intensi
names(msData) %<>% str_replace("intensity_", "")
write_tsv(msData, path=add_prefix("intensities_incl_ctrls.txt"))
# msData <- read_tsv("msData.txt")
#' [msData](msData.txt)
......@@ -421,15 +438,17 @@ write_tsv(msData, path=add_prefix("intensities_incl_ctrls.txt"))
#'
#' <br><br>
#'
#' ### Control removal
#' ### Contaminations removal
#' In this step, unique CON__ REV__ and STANDARD entries are removed. So far, ambiguous entries with simultaneous CON__ and protein accessions are kept.
protein_info %<>% mutate(sep_ids = str_split(protein_ids, ";")) %>% unnest(sep_ids) %>%
mutate(sep_prot = str_match(sep_ids, "[trsp]+\\|([:alnum:]+)")[,2]) %>%
group_by(protein_ids) %>%
mutate(is_scrap = sum(str_detect(sep_ids, "CON__|REV__|STANDARD")) == n()) %>%
distinct(protein_ids, fasta_headers, protein_acc, gene_name, is_scrap) %>% ungroup()
# protein_info %>% mutate(sep_ids = str_split(protein_ids, ";")) %>% unnest(sep_ids) %>%
# mutate(sep_prot = str_match(sep_ids, "[trsp]+\\|([:alnum:]+)")[,2]) %>%
# na.omit() %>%
# group_by(protein_ids) %>%
# distinct(protein_ids, fasta_headers, protein_acc, gene_name) %>% ungroup()
msData %<>% left_join(protein_info, by = "protein_ids")
msData %<>% mutate(is_scrap = protein_ids %in% scrap_ids) %>% left_join(protein_info, by = "protein_ids")
write_tsv(msData, path=add_prefix("intensities_incl_ctrls.txt"))
msData %>% filter(is_scrap) %>% select(protein_ids) %>% DT::datatable(caption="controls removed from data")
......@@ -606,6 +625,7 @@ print(paste0(nrow(anti_join(msData, imputeData)), " out of ", nrow(msData), " ro
if (sample_num != ncol(imputeData)-1) {stop("ATTENTION: Final intensities matrix does not contain all input samples")}
#TODO: report information for each gene and condition, whether any value was imputed and add information to final limma results
# after zero-imputing of all remaining NAs
imputeData %<>% mutate_if(is.numeric, funs(replace(., is.na(.), 0)))
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment