Commit ced15313 authored by Lena Hersemann's avatar Lena Hersemann

added the following: summary of STANDARD raw and LFQ intensities, summary of...

added the following: summary of STANDARD raw and LFQ intensities, summary of samples per input file, sorting of protein IDs before merging the tables and reporting of the number of changed protein IDs orders, boxplot of LFQ intensities per sample, reporting of renaming scheme, some general documentation; cosmetics
parent 79ca1300
......@@ -2,7 +2,7 @@
#' # Preprocessing of mass spec data
#' Objective: Extract an abundance matrix to be used as input for limma.
#'
#' Created by: `r system("whoami", intern=T)`
#'
#' Created at: `r format(Sys.Date(), format="%B %d %Y")`
......@@ -79,7 +79,7 @@ mqTxtFiles = list.files(ms_data_folder, "proteinGroups.txt", full=TRUE)
perBatch = mqTxtFiles %>%
map(~ read_tsv(.x) %>% pretty_columns() %>%
select(protein_ids, fasta_headers, matches("^lfq_intensity"), matches("^identification_type"))) %>%
select(protein_ids, fasta_headers, matches("^lfq_intensity"), matches("^identification_type"), matches("^intensity"))) %>%
setNames(basename(mqTxtFiles))
vec_as_df(unlist(opts)) %>%
......@@ -88,27 +88,49 @@ vec_as_df(unlist(opts)) %>%
kable()
#'
#' <br><br>
#' <br>
#'
#' renaming scheme:
renamingScheme %>% setNames(c("old names", "new names")) %>% kable()
# sort protein IDs
perBatch %<>% map(~ rowwise(.x) %>% mutate(old_protein_ids = protein_ids, protein_ids = paste(sort(unlist(str_split(protein_ids, ";"))), collapse = ";")))
#'
#' <br>
#'
#' ## File level
perBatch %>% map_df(~ select(.x, starts_with("LFQ_intensity")) %$% paste0(str_replace(colnames(.), "lfq_intensity_", ""), collapse = "; ")) %>% gather(file, samples) %>% kable()
#'
#' <br>
#'
#' ### Unique and biased entries per file, i.e. one or multiple protein IDs
## collect information about lines per file and whether the entries are related to proteins, "scrap" and or unique or biased entries
init_stat <- perBatch %>% map_df(~ mutate(.x,
unique_entry = !str_detect(protein_ids, ";") & !str_detect(protein_ids, "CON__|REV__|ST|RSTANDARD"),
biased_entry = str_detect(protein_ids, ";") & !str_detect(protein_ids, "CON__|REV__|ST|RSTANDARD"),
scrap_prop_unique = !str_detect(protein_ids, ";") & str_detect(protein_ids, "CON__|REV__|ST|RSTANDARD"),
scrap_prop_biased = str_detect(protein_ids, ";") & str_detect(protein_ids, "CON__|REV__|ST|RSTANDARD")
unique_entry = !str_detect(protein_ids, ";") & !str_detect(protein_ids, "CON__|REV__|RSTANDARD"),
biased_entry = str_detect(protein_ids, ";") & !str_detect(protein_ids, "CON__|REV__|RSTANDARD"),
scrap_prop_unique = !str_detect(protein_ids, ";") & str_detect(protein_ids, "CON__|REV__|RSTANDARD"),
scrap_prop_biased = str_detect(protein_ids, ";") & str_detect(protein_ids, "CON__|REV__|RSTANDARD")
), .id = 'GROUP')
#' Examples for unique and biased entries
init_stat %>% gather(feature, direction, unique_entry:scrap_prop_biased) %>% filter(direction) %>% group_by(feature) %>% slice(1) %>% select(feature, protein_ids)
#'
#' <br>
#'
#' Summary of unique and biased samples per input file
# export numeric summary as table
init_stat %>% group_by(., GROUP) %>% summarize(total_entries = n(), unique_entries = sum(unique_entry) + sum(scrap_prop_unique), biased_protein_entries = sum(biased_entry) + sum(scrap_prop_biased)) %>% kable()
init_stat %>% group_by(., GROUP) %>% summarize(total_entries = n(), unique_entries = sum(unique_entry) + sum(scrap_prop_unique), biased_entries = sum(biased_entry) + sum(scrap_prop_biased)) %>% kable()
init_stat %>% select(GROUP, unique_entry, biased_entry, scrap_prop_unique, scrap_prop_biased) %>% gather(feature, direction, -GROUP) %>%
group_by(GROUP, feature) %>% summarize(count = sum(direction)) %>%
......@@ -132,12 +154,44 @@ init_stat %>% select(GROUP, unique_entry, biased_entry, scrap_prop_unique, scrap
ggplot(aes(feature, lfq_intensity, fill = feature)) +
geom_boxplot() +
# geom_violin() +
facet_wrap(~GROUP) +
scale_y_log10() +
ylab("LFQ intensities") +
theme(axis.text.x=element_blank(),axis.ticks.x=element_blank()) +
# ggtitle("LFQ intensities per category accross all samples") +
scale_fill_manual(values=c("coral3", "coral1", "cadetblue3", "cadetblue"))
scale_fill_manual(values=c("coral3", "coral1", "cadetblue3", "cadetblue")) +
facet_wrap(~GROUP)
#'
#' <br><br>
#'
#' ### LFQ intensities of per sample
init_stat %>% select(GROUP, unique_entry, biased_entry, scrap_prop_unique, scrap_prop_biased, matches("lfq_intensity")) %>%
gather(sample, lfq_intensity, -c(GROUP, unique_entry, biased_entry, scrap_prop_unique, scrap_prop_biased)) %>%
gather(feature, direction, unique_entry, biased_entry, scrap_prop_unique, scrap_prop_biased) %>%
filter(direction) %>%
mutate(sample = str_replace(sample, "lfq_intensity_", "") %>% str_replace_all(., oldNames, newNames),
GROUP = str_replace(GROUP, ".proteinGroups.txt", "") %>% str_replace_all(., oldNames, newNames)) %>%
na.omit() %>%
ggplot(aes(feature, lfq_intensity, fill = feature)) +
geom_boxplot() +
# geom_violin() +
scale_y_log10() +
ylab("LFQ intensities") +
theme(axis.text.x=element_blank(),axis.ticks.x=element_blank()) +
# ggtitle("LFQ intensities per category accross all samples") +
scale_fill_manual(values=c("coral3", "coral1", "cadetblue3", "cadetblue")) +
facet_wrap(~GROUP + sample)
rbind(init_stat %>% filter(str_detect(protein_ids, "STANDARD")) %>%
select(starts_with("lfq_intensity")) %>%
gather(sample, intensities) %>% na.omit() %>%
mutate(sample = str_replace(sample, "lfq_intensity_", ""), feature = "lfq intensities"),
init_stat %>% filter(str_detect(protein_ids, "STANDARD")) %>%
select(starts_with("intensity_")) %>%
gather(sample, intensities) %>% na.omit() %>%
mutate(sample = str_replace(sample, "lfq_intensity_", ""), feature = "raw intensities")
) %>% ggplot(aes(feature, intensities)) + geom_boxplot() + ggtitle("Intensities of STANDARDs")
# init_stat %>% select(GROUP, protein_ids, unique_entry, biased_entry, scrap_prop_unique, scrap_prop_biased, matches("lfq_intensity")) %>%
......@@ -163,7 +217,8 @@ init_stat %>% select(GROUP, unique_entry, biased_entry, scrap_prop_unique, scrap
# if (any (perBatch %>% map_df(~ filter(.x, is.na(scrap)) %>% count(is_dupl = duplicated(protein_acc)), .id = 'GROUP') %$% is_dupl)) {stop("ATTENTION: one of the samples contains duplicated protein ids")}
perBatch %<>% map(~ mutate(.x, scrap = case_when(str_detect(protein_ids, "CON__")~"CON", str_detect(protein_ids, "REV__")~"REV", str_detect(protein_ids, fixed("ST|RTSTANDARD"))~"STANDARD")))
perBatch %<>% map(~ mutate(.x, scrap = case_when(str_detect(protein_ids, "CON__")~"CON", str_detect(protein_ids, "REV__")~"REV", str_detect(protein_ids, fixed("ST|RTSTANDARD"))~"STANDARD")) %>%
select(-starts_with("intensity")))
#'
......@@ -188,6 +243,14 @@ sample_info$sample %<>% str_replace_all(., oldNames, newNames)
write_tsv(sample_info, paste0(results_prefix, "feature_sample_information.txt"))
#'
#' <br><br>
#'
#' ### Reorder protein IDs of biased entries
print("Number of biased protein IDs with re-arranged order:")
perBatch %>% map_df(~ count(.x, protein_ids != old_protein_ids) %>% setNames(c("new_order", "count")) %>% spread(new_order, count),.id = 'GROUP') %>% kable()
## combine individual files by protein_ids, protein_acc and scrap only
msData = perBatch %>%
map(~ select(.x, protein_ids, matches("^lfq_intensity"))) %>%
......@@ -203,6 +266,8 @@ names(msData) %<>% str_replace_all(., oldNames, newNames)
#'
#' ## Sample level
#'
#' In case multiple input files are analysed, the tables are now merged and further processed as one table. To merge the tables the protein IDs were used and thus NAs will be reported if the input tables do not have all protein IDs in common.
#'
#' <br>
#'
#' ### Missing values per sample
......@@ -263,6 +328,7 @@ write_tsv(msData, path=add_prefix("lfq_incl_ctrls.txt"))
#' <br><br>
#'
#' ### Control removal
#' In this step, unique CON__ REV__ and STANDARD entries are removed. So far, biased entries with simultaneous CON__ and protein accessions are kept.
msData %>% filter(is_control(protein_ids)) %>% select(protein_ids) %>% DT::datatable(caption="controls removed from data")
tribble(~intial_data, ~filtered_data, ~removed_rows,
......@@ -355,6 +421,8 @@ normData = protsData
#' design
expDesign %>% kable()
lapply(colnames(expDesign)[which(colnames(expDesign) != "replicate")], function(x){
expDesign[,which(colnames(expDesign) == x)] %>%
setNames("feature") %>%
......@@ -366,13 +434,11 @@ lapply(colnames(expDesign)[which(colnames(expDesign) != "replicate")], function(
}) %>% grid.arrange(grobs = ., ncol=3)
expDesign %>% kable()
#'
#' <br><br>
#'
#' ## Indent status information
#
# <br><br>
#
# ## Indent status information
# identTypes = read_tsv("../ident_types_by_sample.txt")
# expDesign = read_tsv("../exp_design.txt") %>% select(replicate, condition) %>% distinct
......@@ -414,9 +480,11 @@ if (na_prop > 0) {
group_by(condition, protein_ids) %>%
summarize(num_vals=n(), na_prop=(sum(is.na(intensity))/length(intensity) )%>% round(3)) %>%
# count(num_vals, na_prop)
ggplot(aes(as.factor(na_prop))) + geom_bar() + facet_wrap(~num_vals, scales="free_x") + ggtitle("na proportions by number of replicates per condition")
ggplot(aes(as.factor(na_prop))) + geom_bar() + facet_wrap(~num_vals, scales="free_x") + ggtitle("na proportions by number of replicates per condition") + xlab("NA proportion")
}
#' In the following, missing LFQ intensities are imputated if intensities were reported for more than 60% of provided replicates. All remaining NA values will be set to zero.
imputeData = normData %>%
left_join(expDesign, by=c("sample"="replicate")) %>%
group_by(condition, protein_ids) %>%
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment