Commit c72ed071 authored by Lena Hersemann's avatar Lena Hersemann

fixed bug in CON__ REV__ and STANDARD__ removal: now all corresponding unique...

Fixed bug in CON__, REV__ and STANDARD__ removal: now all corresponding unique entries and all biased entries which contain only these features are removed; added export of protein information, filtered to one fasta_header per protein_id; cosmetics
parent 9eeac727
......@@ -271,6 +271,14 @@ if (any(map(perBatch, ~colnames(.x) %>% str_detect(., "identification_type")) %>
# Rewrite the sample column by replacing matches of `oldNames` with `newNames`
# (both are defined outside this excerpt).
sample_info$sample %<>% str_replace_all(., oldNames, newNames)
# Export the sample-level feature table next to the other results.
write_tsv(sample_info, paste0(results_prefix, ".feature_sample_information.txt"))
# Build the protein annotation table: take the distinct protein_ids/fasta_headers pairs and,
# per protein_ids, keep the first of the rows whose fasta_headers string is the longest.
protein_info <- sample_info %>% distinct(protein_ids, fasta_headers) %>% group_by(protein_ids) %>%
filter(max(nchar(fasta_headers))==nchar(fasta_headers)) %>% slice(1) %>% ungroup() %>%
# Work row by row so that str_extract_all()/paste() collapse the matches of a single entry only.
rowwise() %>%
# gene_name: every "GN=<alnum>" match in fasta_headers, joined with "; ", with the "GN=" tag stripped.
# protein_acc: every "<one or more of t/r/s/p>|<alnum>" match in protein_ids, joined with "; ",
# after which all occurrences of "sp", "tr" and "|" are removed from the joined string.
mutate(gene_name = paste(unlist(str_extract_all(fasta_headers, "GN=([:alnum:]+)")), collapse = "; ") %>% str_replace_all(., "GN=", ""),
protein_acc = paste(unlist(str_extract_all(protein_ids, "[trsp]+\\|([:alnum:]+)")), collapse = "; ") %>% str_replace_all(., "sp|tr|[|]", ""))
# Export the protein annotation table.
write_tsv(protein_info, paste0(results_prefix, ".feature_information.txt"))
#'
#' <br><br>
......@@ -358,14 +366,20 @@ write_tsv(msData, path=add_prefix("lfq_incl_ctrls.txt"))
#'
#' ### Control removal
#' In this step, unique CON__, REV__ and STANDARD entries are removed. So far, biased entries that contain both CON__ and regular protein accessions are kept.
# NOTE(review): this excerpt looks like a diff rendered without +/- markers, so the next two
# statements are presumably the superseded flagging logic — confirm before running as-is.
# They flag rows for which is_control() (defined outside this excerpt) is TRUE and whose
# protein_ids contains no ";", then list the flagged ids.
msData %<>% mutate(is_scrap = is_control(protein_ids) & !str_detect(protein_ids, ";"))
msData %>% filter(is_scrap) %>% select(protein_ids) %>% DT::datatable(caption="controls removed from data")
# Derive is_scrap on the annotation table: split protein_ids on ";" into one row per id ...
protein_info %<>% mutate(sep_ids = str_split(protein_ids, ";")) %>% unnest(sep_ids) %>%
# ... capture the alphanumeric part after a "<t/r/s/p letters>|" prefix
# (sep_prot is not among the columns kept by the distinct() below) ...
mutate(sep_prot = str_match(sep_ids, "[trsp]+\\|([:alnum:]+)")[,2]) %>%
group_by(protein_ids) %>%
# ... and flag a protein_ids group only when every one of its ids matches CON__, REV__ or STANDARD.
mutate(is_scrap = sum(str_detect(sep_ids, "CON__|REV__|STANDARD")) == n()) %>%
# Reduce the per-id rows to the distinct annotation columns plus the flag.
distinct(protein_ids, fasta_headers, protein_acc, gene_name, is_scrap) %>% ungroup()
# Attach the annotation columns and is_scrap to the intensity table.
msData %<>% left_join(protein_info, by = "protein_ids")
# List the entries that are about to be dropped.
msData %>% filter(is_scrap) %>% select(protein_ids) %>% DT::datatable(caption="controls removed from data")
# Report row counts before filtering, after filtering, and removed.
tribble(~intial_data, ~filtered_data, ~removed_rows,
nrow(msData), nrow(filter(msData, !is_scrap)), nrow(filter(msData, is_scrap))) %>% kable()
# NOTE(review): the next two statements are presumably the old and the new form of the same
# filter; the second one additionally drops the joined annotation columns — confirm which is live.
msData %<>% filter(!is_scrap) %>% select(-is_scrap)
msData %<>% filter(!is_scrap) %>% select(-is_scrap, -fasta_headers, -protein_acc, -gene_name)
# Sanity check: no remaining row may have an empty protein_ids string.
stopifnot(nrow(filter(msData, str_length(protein_ids)==0)) ==0)
......@@ -379,7 +393,7 @@ msData %>% gather(sample, intensity, -protein_ids) %>% group_by(protein_ids) %>%
summarize(sum_intensity=sum(intensity, na.rm=T)) %>%
arrange(-sum_intensity) %>% slice(1:25) %>%
# left_join(sample_info %>% distinct(protein_ids, protein_acc, gene_name, fasta_headers), by = "protein_ids") %>%
left_join(sample_info %>% distinct(protein_ids, gene_name, fasta_headers), by = "protein_ids") %>%
left_join(protein_info, by = "protein_ids") %>%
transmute(protein_ids, gene_name, sum_intensity, fasta_headers) %>%
table_browser()
......@@ -401,7 +415,7 @@ grid.arrange(p1, p2, nrow = 1)
# Histogram of intensities on a log10 x-axis, one panel per sample.
protsData %>% ggplot(aes(intensity)) + geom_histogram(fill = "lightcyan4") + scale_x_log10() + ggtitle("LFQ intensities") + facet_wrap(~sample)
# Horizontal box plots of intensity + 1 per sample on a log10 scale; the +1 keeps zero intensities finite under log10.
# NOTE(review): the two statements below differ only in the added ylab("intensity") and are
# presumably the old and the new form of the same plot — confirm which is live.
protsData %>% ggplot(aes(sample, intensity+1)) + geom_boxplot(fill = "lightcyan4") + ggtitle("LFQ intensities") + scale_y_log10() + coord_flip()
protsData %>% ggplot(aes(sample, intensity+1)) + geom_boxplot(fill = "lightcyan4") + ggtitle("LFQ intensities") + scale_y_log10() + coord_flip() + ylab("intensity")
## when using a linear scale, we discard 0s to avoid biased means
protsData %>% filter(intensity>0) %>% ggplot(aes(sample, intensity)) +
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment