Commit 322b0c92 authored by Lena Hersemann

revised code for plotting of lfq intensities and added density plot for all...

revised code for plotting of lfq intensities and added density plot for all unique and ambiguous entries per sample; cosmetics
parent c72ed071
......@@ -112,27 +112,27 @@ perBatch %>% map_df(~ select(.x, starts_with("LFQ_intensity")) %$% paste0(str_re
#'
#' <br>
#'
#' ### Unique and biased entries per file, i.e. one or multiple protein IDs
## collect information about lines per file and whether the entries are related to proteins, "scrap" and or unique or biased entries
#' ### Unique and ambiguous entries per file, i.e. one or multiple protein IDs
## collect information about lines per file and whether the entries are related to proteins, "scrap" and/or unique or ambiguous entries
# Classify every row of each per-batch table with logical flag columns and bind
# the tables into one data frame; `.id = 'GROUP'` adds a GROUP column
# identifying the perBatch element each row came from.
# Each flag combines two tests on protein_ids:
#   - contains ";"                   -> more than one protein ID in the entry
#   - matches CON__|REV__|RSTANDARD  -> "scrap" entry
# NOTE(review): the *_biased and *_ambiguous flags are computed from identical
# expressions, and no comma separates the scrap_prop_biased and
# scrap_prop_ambiguous arguments -- confirm which set of names is intended.
init_stat <- perBatch %>% map_df(~ mutate(.x,
unique_entry = !str_detect(protein_ids, ";") & !str_detect(protein_ids, "CON__|REV__|RSTANDARD"),
biased_entry = str_detect(protein_ids, ";") & !str_detect(protein_ids, "CON__|REV__|RSTANDARD"),
ambiguous_entry = str_detect(protein_ids, ";") & !str_detect(protein_ids, "CON__|REV__|RSTANDARD"),
scrap_prop_unique = !str_detect(protein_ids, ";") & str_detect(protein_ids, "CON__|REV__|RSTANDARD"),
scrap_prop_biased = str_detect(protein_ids, ";") & str_detect(protein_ids, "CON__|REV__|RSTANDARD")
scrap_prop_ambiguous = str_detect(protein_ids, ";") & str_detect(protein_ids, "CON__|REV__|RSTANDARD")
), .id = 'GROUP')
#' Examples for unique and biased entries
init_stat %>% gather(feature, direction, unique_entry:scrap_prop_biased) %>% filter(direction) %>% group_by(feature) %>% slice(1) %>% select(feature, protein_ids)
#' Examples for unique and ambiguous entries
# Show one example protein_ids value per category: stack the flag columns from
# unique_entry through scrap_prop_ambiguous into feature/direction pairs, keep
# the rows whose flag is TRUE and take the first row of each feature.
init_stat %>% gather(feature, direction, unique_entry:scrap_prop_ambiguous) %>% filter(direction) %>% group_by(feature) %>% slice(1) %>% select(feature, protein_ids)
#'
#' <br>
#'
#' Summary of unique and biased samples per input file
#' Summary of unique and ambiguous samples per input file
# export numeric summary as table
init_stat %>% group_by(., GROUP) %>% summarize(total_entries = n(), unique_entries = sum(unique_entry) + sum(scrap_prop_unique), biased_entries = sum(biased_entry) + sum(scrap_prop_biased)) %>% kable()
# Per GROUP: total row count plus the number of single-ID and multi-ID rows.
# The regular and "scrap" flags are added together, so each count covers all
# rows without / with a ";" in protein_ids regardless of the scrap status.
init_stat %>% group_by(., GROUP) %>% summarize(total_entries = n(), unique_entries = sum(unique_entry) + sum(scrap_prop_unique), ambiguous_entries = sum(ambiguous_entry) + sum(scrap_prop_ambiguous)) %>% kable()
init_stat %>% select(GROUP, unique_entry, biased_entry, scrap_prop_unique, scrap_prop_biased) %>% gather(feature, direction, -GROUP) %>%
init_stat %>% select(GROUP, unique_entry, ambiguous_entry, scrap_prop_unique, scrap_prop_ambiguous) %>% gather(feature, direction, -GROUP) %>%
group_by(GROUP, feature) %>% summarize(count = sum(direction)) %>%
ggplot(aes(GROUP, count, fill = feature)) +
geom_col() +
......@@ -141,16 +141,21 @@ init_stat %>% select(GROUP, unique_entry, biased_entry, scrap_prop_unique, scrap
# ggtitle("Unique and non-unique entries per input file") +
scale_fill_manual(values=c("coral3", "coral1", "cadetblue3", "cadetblue"))
# extract information on lfq intensities for the different entry categories per sample
# Result: long-format table `init_stat_plot` with the columns GROUP, sample,
# lfq_intensity, feature and direction; it is the shared input of the LFQ box
# plots and the density plot further down.
init_stat_plot <- init_stat %>% select(GROUP, unique_entry, ambiguous_entry, scrap_prop_unique, scrap_prop_ambiguous, matches("lfq_intensity")) %>%
# first gather: stack every selected intensity column into sample/lfq_intensity pairs
gather(sample, lfq_intensity, -c(GROUP, unique_entry, ambiguous_entry, scrap_prop_unique, scrap_prop_ambiguous)) %>%
# second gather: stack the four logical category flags into feature/direction pairs
gather(feature, direction, unique_entry, ambiguous_entry, scrap_prop_unique, scrap_prop_ambiguous) %>%
# keep only the category each row actually belongs to (flag is TRUE)
filter(direction) %>%
# shorten the labels: drop the column prefix / file suffix, then map names via
# oldNames -> newNames (both defined outside this section)
# NOTE(review): the patterns are treated as regular expressions, so the "." in
# ".proteinGroups.txt" matches any character -- confirm that this is acceptable.
mutate(sample = str_replace(sample, "lfq_intensity_", "") %>% str_replace_all(., oldNames, newNames),
GROUP = str_replace(GROUP, ".proteinGroups.txt", "") %>% str_replace_all(., oldNames, newNames)) %>%
# drop every row that still contains an NA
na.omit()
#'
#' <br><br>
#'
#' ### LFQ intensities of unique and biased entries per file
init_stat %>% select(GROUP, unique_entry, biased_entry, scrap_prop_unique, scrap_prop_biased, matches("lfq_intensity")) %>%
gather(sample, lfq_intensity, -c(GROUP, unique_entry, biased_entry, scrap_prop_unique, scrap_prop_biased)) %>%
gather(feature, direction, unique_entry, biased_entry, scrap_prop_unique, scrap_prop_biased) %>%
filter(direction) %>%
mutate(GROUP = str_replace(GROUP, ".proteinGroups.txt", "")) %>%
na.omit() %>%
#' ### LFQ intensities of unique and ambiguous entries per file
init_stat_plot %>%
ggplot(aes(feature, lfq_intensity+1, fill = feature)) +
geom_boxplot() +
# geom_violin() +
......@@ -164,14 +169,8 @@ init_stat %>% select(GROUP, unique_entry, biased_entry, scrap_prop_unique, scrap
#'
#' <br><br>
#'
#' ### LFQ intensities of per sample
init_stat %>% select(GROUP, unique_entry, biased_entry, scrap_prop_unique, scrap_prop_biased, matches("lfq_intensity")) %>%
gather(sample, lfq_intensity, -c(GROUP, unique_entry, biased_entry, scrap_prop_unique, scrap_prop_biased)) %>%
gather(feature, direction, unique_entry, biased_entry, scrap_prop_unique, scrap_prop_biased) %>%
filter(direction) %>%
mutate(sample = str_replace(sample, "lfq_intensity_", "") %>% str_replace_all(., oldNames, newNames),
GROUP = str_replace(GROUP, ".proteinGroups.txt", "") %>% str_replace_all(., oldNames, newNames)) %>%
na.omit() %>%
#' ### LFQ intensities of unique and ambiguous entries per sample
init_stat_plot %>%
ggplot(aes(feature, lfq_intensity+1, fill = feature)) +
geom_boxplot() +
# geom_violin() +
......@@ -182,6 +181,22 @@ init_stat %>% select(GROUP, unique_entry, biased_entry, scrap_prop_unique, scrap
scale_fill_manual(values=c("coral3", "coral1", "cadetblue3", "cadetblue")) +
facet_wrap(~GROUP + sample)
#TODO: write function to adjust font size in plots according to sample name length
#'
#' <br><br>
#'
#' ### Non-zero LFQ intensities per sample
# Density of the LFQ intensities per entry category, one panel per
# GROUP/sample combination.
# Unlike the box plots above, no +1 offset is added before the log10 scale, so
# zero intensities have no finite position on the x axis and presumably drop
# out of the density estimate (hence "Non-zero" in the heading) -- TODO confirm.
init_stat_plot %>% ggplot(aes(lfq_intensity, fill = feature, color = feature)) +
geom_density(alpha = 0.2) +
scale_x_log10() +
xlab("LFQ intensities") +
# same four colours for fill and outline as in the other category plots
scale_fill_manual(values=c("coral3", "coral1", "cadetblue3", "cadetblue")) +
scale_color_manual(values=c("coral3", "coral1", "cadetblue3", "cadetblue")) +
facet_wrap(~GROUP + sample) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
theme(panel.background = element_rect(fill = 'white', colour = 'grey'), panel.grid = element_line(color = "gray90"))
#'
#' <br><br>
#'
......@@ -223,8 +238,8 @@ std_info %>%
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# init_stat %>% select(GROUP, protein_ids, unique_entry, biased_entry, scrap_prop_unique, scrap_prop_biased, matches("lfq_intensity")) %>%
# gather(sample, lfq_intensity, -c(GROUP, unique_entry, biased_entry, scrap_prop_unique, scrap_prop_biased)) %>%
# init_stat %>% select(GROUP, protein_ids, unique_entry, ambiguous_entry, scrap_prop_unique, scrap_prop_ambiguous, matches("lfq_intensity")) %>%
# gather(sample, lfq_intensity, -c(GROUP, unique_entry, ambiguous_entry, scrap_prop_unique, scrap_prop_ambiguous)) %>%
# mutate(sample = str_replace(sample, "lfq_intensity_", "")) %>% left_join(renamingScheme, by = c("sample" = "old")) %>%
# na.omit() %>% write_tsv("feature_information.txt")
......@@ -283,8 +298,8 @@ write_tsv(protein_info, paste0(results_prefix, ".feature_information.txt"))
#'
#' <br><br>
#'
#' ### Reorder protein IDs of biased entries
print("Number of biased protein IDs with re-arranged order:")
#' ### Reorder protein IDs of ambiguous entries
print("Number of ambiguous protein IDs with re-arranged order:")
# For every element of perBatch, count the rows whose protein_ids string
# differs from old_protein_ids (TRUE) or equals it (FALSE), spread the two
# counts into one row per element and print the combined table; GROUP
# identifies the perBatch element.
perBatch %>% map_df(~ count(.x, protein_ids != old_protein_ids) %>% setNames(c("new_order", "count")) %>% spread(new_order, count),.id = 'GROUP') %>% kable()
......@@ -365,7 +380,7 @@ write_tsv(msData, path=add_prefix("lfq_incl_ctrls.txt"))
#' <br><br>
#'
#' ### Control removal
#' In this step, unique CON__ REV__ and STANDARD entries are removed. So far, biased entries with simultaneous CON__ and protein accessions are kept.
#' In this step, unique CON__, REV__ and STANDARD entries are removed. So far, ambiguous entries with simultaneous CON__ and protein accessions are kept.
protein_info %<>% mutate(sep_ids = str_split(protein_ids, ";")) %>% unnest(sep_ids) %>%
mutate(sep_prot = str_match(sep_ids, "[trsp]+\\|([:alnum:]+)")[,2]) %>%
group_by(protein_ids) %>%
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment