ms_data_prep.R 28.5 KB
Newer Older
1 2 3 4
#!/usr/bin/env Rscript

#' # Preprocessing of mass spec data
#' Objective: Extract an abundance matrix to be used as input for limma.
5
#'
6 7 8 9 10 11 12 13 14 15
#' Created by: `r system("whoami", intern=T)`
#'
#' Created at: `r format(Sys.Date(), format="%B %d %Y")`

#-----------------------------------------------------------------------------------------------------------------------

#+ include=FALSE
suppressMessages(require(docopt))

doc = '
16
Prepare mass spec data for differential abundance analysis
17 18 19 20
Usage: ms_data_prep.R [options] <ms_data_folder> <design_file>

Options:
--renaming_scheme <file>    Table with two columns with the "old" and "new" names
21
--data_type <data_type>    Which data type (i.e. raw or lfq) to use for the analysis [default: lfq]
22
'
23
#TODO: set standard as optional argument (Henrik will provide an extra file which specifies the necessary information)
24

25
#commandArgs <- function(x) c("--renaming_scheme", "renaming_scheme.txt", "../provided", "exp_design.txt")
26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58
opts = docopt(doc, commandArgs(TRUE))

#-----------------------------------------------------------------------------------------------------------------------
devtools::source_url("https://raw.githubusercontent.com/holgerbrandl/datautils/v1.51/R/core_commons.R")
library(knitr)
filter = dplyr::filter
library(gridExtra)


is_control = function(protein_ids){
    str_detect(protein_ids, "^CON__") | str_detect(protein_ids, "^REV__") | str_detect(protein_ids, fixed("ST|RTSTANDARD"))
}

newNames <- function(oldN) {
    renamingScheme[which(renamingScheme$old == oldN),]$new
}

## todo port to do core_commons.R
impute_some = function(data, fun=mean, max_na=0.4) if(sum(is.na(data))/length(data)<=max_na)  Hmisc::impute(data, fun=fun) else data
# data = c(2,3,4,NA,4)
# impute_some(c(2,3,NA,NA,NA))


#-----------------------------------------------------------------------------------------------------------------------

results_prefix="data_prep"
#TODO: also summarize MaxQuant settings and run specificities (i.e. machine, preparation time, processing date, etc.)


## load data and export information on file names
ms_data_folder <- opts$ms_data_folder
renaming_scheme <- opts$renaming_scheme
design_file <- opts$design_file
59
data_type <- opts$data_type
60 61


62 63 64 65 66 67 68
## read in MaxQuant output files (for analysis on protein level: "proteinGroups.txt") and reformat the data
mqTxtFiles = list.files(ms_data_folder, "proteinGroups.txt", full=TRUE)
# mqTxtFiles = list.files(interp_from_env("${PRJ_DATA}/provided/20181122-133344-Barbara-scaffoldDB-all-txt"), "proteinGroups.txt", full=TRUE)
# mqTxtFiles = list.files(interp_from_env("${PRJ_DATA}/provided/20181102-Bar-MQ-Scaffold_Holger"), "proteinGroups.txt", full=TRUE)

perBatch = mqTxtFiles %>%
    map(~ read_tsv(.x) %>% pretty_columns() %>%
69
        select(protein_ids, fasta_headers, matches("^lfq_intensity"), matches("^identification_type"), matches("^intensity_"))) %>%
70 71 72
    setNames(basename(mqTxtFiles))


73
## load renaming scheme
74 75 76
if (is.null(renaming_scheme)){
    orig_names <- perBatch %>% map(~select(.x, starts_with("lfq_intensity")) %>% colnames()) %>% unlist(use.names = FALSE) %>% str_replace(., "lfq_intensity_", "")
    renamingScheme <- data.frame(old = orig_names, new = orig_names)
77
} else {
78 79 80 81 82 83
    # TODO make sure you read in data based on file extension (write function if not available)
    if (str_detect(renaming_scheme, ".xls")) {
        renamingScheme <- read_xlsx(renaming_scheme)
    } else {
        renamingScheme <- read_tsv(renaming_scheme)
    }
84 85
}

86

87 88 89 90 91 92 93
oldNames <- str_c(renamingScheme$old, collapse = "|")

## load design_file
expDesign <- read_tsv(design_file)
if (all(expDesign$replicate != renamingScheme$new)) {stop("ATTENTION: replicates of contrast and design file do not match")}


94
# list input arguments
95 96 97 98 99 100
vec_as_df(unlist(opts)) %>%
    filter(! str_detect(name, "^[<-]")) %>%
    rbind(c("input_file_num", length(mqTxtFiles))) %>%
    kable()

#'
101 102 103 104
#' <br>
#'
#' renaming scheme:

105
if (is.null(renaming_scheme)){ print("Original naming kept; renaming scheme was not provided") }
106 107 108 109 110 111 112 113 114 115 116
renamingScheme %>% setNames(c("old names", "new names")) %>% kable()




# sort protein IDs
perBatch %<>% map(~ rowwise(.x) %>% mutate(old_protein_ids = protein_ids, protein_ids = paste(sort(unlist(str_split(protein_ids, ";"))), collapse = ";")))


#'
#' <br>
117 118
#'
#' ## File level
119 120 121 122

perBatch %>% map_df(~ select(.x, starts_with("LFQ_intensity")) %$% paste0(str_replace(colnames(.), "lfq_intensity_", ""), collapse = "; ")) %>% gather(file, samples) %>% kable()


123 124 125 126 127 128 129 130 131 132 133
# collect raw and lfq intensities of STANDARDs
contr_data <- perBatch %>% map_df(~ filter(.x, str_detect(protein_ids, "RTSTANDARD")), .id = 'GROUP') %>% select(-old_protein_ids)


# select intensities type, i.e. raw or lfq for further analysis
if (data_type == "lfq") {

    perBatch %<>% map(~ select(.x, -starts_with("intensity")))
    perBatch <- sapply(names(perBatch), function(x){
        data <- as.data.frame(perBatch[x])
        colnames(data) %<>% str_replace(., "lfq_", "")
134 135 136
        prot_col <- data %>% select(contains(".protein_ids")) %>% colnames()
        sample <- str_replace(prot_col, ".protein_ids", "")
        colnames(data) %<>% str_replace(., paste0(sample, "."), "")
137 138 139 140 141 142 143 144 145 146
        data %>% tbl_df()
    }, simplify = FALSE,USE.NAMES = TRUE)

} else if (data_type == "raw") {
    perBatch %<>% map(~ select(.x, -starts_with("lfq_intensity")))
} else {
    stop("ATTENTION: unknown data type requested; valid data types are raw and lfq")
}


147 148
#'
#' <br>
149
#'
150 151
#' ### Unique and ambiguous entries per file, i.e. one or multiple protein IDs
## collect information about lines per file and whether the entries are related to proteins, "scrap" and or unique or ambiguous entries
152
init_stat <- perBatch %>% map_df(~ mutate(.x,
153 154 155 156
    unique_entry = !str_detect(protein_ids, ";") & !str_detect(protein_ids, "CON__|REV__|RTSTANDARD"),
    ambiguous_entry = str_detect(protein_ids, ";") & !str_detect(protein_ids, "CON__|REV__|RTSTANDARD"),
    scrap_prop_unique = !str_detect(protein_ids, ";") & str_detect(protein_ids, "CON__|REV__|RTSTANDARD"),
    scrap_prop_ambiguous = str_detect(protein_ids, ";") & str_detect(protein_ids, "CON__|REV__|RTSTANDARD")
157 158 159
    ), .id = 'GROUP')


160 161
#' Examples for unique and ambiguous entries
init_stat %>% gather(feature, direction, unique_entry:scrap_prop_ambiguous) %>% filter(direction) %>% group_by(feature) %>% slice(1) %>% select(feature, protein_ids)
162

163 164 165 166 167 168 169 170 171
# extract information on protein IDs which shall be removed later
scrap_ids <- init_stat %>% gather(feature, direction, unique_entry:scrap_prop_ambiguous) %>% filter(direction) %>% filter(feature == "scrap_prop_unique" | feature == "scrap_prop_ambiguous") %>% select(protein_ids) %>% mutate(single_entries = str_split(protein_ids, ";")) %>% unnest(single_entries) %>%
    group_by(protein_ids) %>%
    mutate(all_scrap = sum(str_detect(single_entries, "CON__|REV__|RTSTANDARD")) == n()) %>%
    ungroup() %>%
    select(-single_entries) %>%
    distinct_all(protein_ids) %>%
    filter(all_scrap) %$% protein_ids

172 173 174
#'
#' <br>
#'
175
#' Summary of unique and ambiguous samples per input file
176
# export numeric summary as table
177
init_stat %>% group_by(., GROUP) %>% summarize(total_entries = n(), unique_entries = sum(unique_entry) + sum(scrap_prop_unique), ambiguous_entries = sum(ambiguous_entry) + sum(scrap_prop_ambiguous)) %>% kable()
178

179
init_stat %>% select(GROUP, unique_entry, ambiguous_entry, scrap_prop_unique, scrap_prop_ambiguous) %>% gather(feature, direction, -GROUP) %>%
180 181 182 183 184 185 186 187
        group_by(GROUP, feature) %>% summarize(count = sum(direction)) %>%
        ggplot(aes(GROUP, count, fill = feature)) +
            geom_col() +
            coord_flip() +
            theme(legend.title=element_blank()) + xlab("") +
            # ggtitle("Unique and non-unique entries per input file") +
            scale_fill_manual(values=c("coral3", "coral1", "cadetblue3", "cadetblue"))

188

189 190 191
# extract information on intensities for the different entry categories per sample
init_stat_plot <- init_stat %>% select(GROUP, unique_entry, ambiguous_entry, scrap_prop_unique, scrap_prop_ambiguous, matches("intensity")) %>%
    gather(sample, intensity, -c(GROUP, unique_entry, ambiguous_entry, scrap_prop_unique, scrap_prop_ambiguous)) %>%
192 193
    gather(feature, direction, unique_entry, ambiguous_entry, scrap_prop_unique, scrap_prop_ambiguous) %>%
    filter(direction) %>%
194
    mutate(sample = str_replace(sample, "intensity_", "") %>% str_replace_all(., oldNames, newNames),
195 196 197
    GROUP = str_replace(GROUP, ".proteinGroups.txt", "") %>% str_replace_all(., oldNames, newNames)) %>%
    na.omit()

198 199 200
#'
#' <br><br>
#'
201
#' ### Intensities of unique and ambiguous entries per file
202
init_stat_plot %>%
203
    ggplot(aes(feature, intensity+1, fill = feature)) +
204 205 206
        geom_boxplot() +
        # geom_violin() +
        scale_y_log10() +
207
        ylab("intensities") +
208
        theme(axis.text.x=element_blank(),axis.ticks.x=element_blank()) +
209
        # ggtitle("intensities per category accross all samples") +
210 211 212 213 214 215
        scale_fill_manual(values=c("coral3", "coral1", "cadetblue3", "cadetblue")) +
        facet_wrap(~GROUP)

#'
#' <br><br>
#'
216
#' ### Intensities of unique and ambiguous entries per sample
217
init_stat_plot %>%
218
    ggplot(aes(feature, intensity+1, fill = feature)) +
219 220 221
    geom_boxplot() +
# geom_violin() +
    scale_y_log10() +
222
    ylab("intensities") +
223
    theme(axis.text.x=element_blank(),axis.ticks.x=element_blank()) +
224
# ggtitle("intensities per category accross all samples") +
225 226 227
    scale_fill_manual(values=c("coral3", "coral1", "cadetblue3", "cadetblue")) +
    facet_wrap(~GROUP + sample)

228 229 230 231 232
#TODO: write function to adjust font size in plots according to sample name length

#'
#' <br><br>
#'
233 234
#' ### Non-zero intensities of per sample
init_stat_plot %>% ggplot(aes(intensity, fill = feature, color = feature)) +
235 236
    geom_density(alpha = 0.2) +
    scale_x_log10() +
237
    xlab("intensities") +
238 239 240 241 242 243
    scale_fill_manual(values=c("coral3", "coral1", "cadetblue3", "cadetblue")) +
    scale_color_manual(values=c("coral3", "coral1", "cadetblue3", "cadetblue")) +
    facet_wrap(~GROUP + sample) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    theme(panel.background = element_rect(fill = 'white', colour = 'grey'), panel.grid = element_line(color = "gray90"))

244 245
#TODO: take the RTSTANDARD intensity as a minimal threshold to mark low-abundant proteinGroups, i.e. groups which have an intensity lower than the RTSTANDARD

246 247 248 249
#'
#' <br><br>
#'
#' ### LFQ and raw intensities of STANDARDS on protein level
250 251 252 253 254 255 256 257 258 259 260 261
if (nrow(contr_data) > 0){

    contr_data %>% gather(sample, intensity, -c(GROUP, protein_ids, fasta_headers)) %>%
        mutate(feature = str_match(sample, "lfq_intensity|intensity")[,1],
            feature = case_when(feature == "intensity"~"raw intensity", feature == "lfq_intensity"~"lfq intensity"),
            sample = str_replace(sample, "lfq_intensity_|intensity_", ""),
            intensity = as.numeric(intensity)
        ) %>% ggplot(aes(feature, intensity)) + geom_boxplot()

} else {
    print("No standard information found")
}
262

263 264 265 266 267 268 269 270


#'
#' <br><br>
#'
#' ### LFQ and raw intensities of STANDARDS on peptide level
pepFiles <- list.files(ms_data_folder, pattern = "peptides.txt", full = TRUE)

271
if (!is_empty(pepFiles) & nrow(contr_data) > 0){
272 273 274

    std_info = pepFiles %>%
        map(~ read_tsv(.x) %>% pretty_columns() %>%
275
            select(proteins, sequence, matches("^intensity_"), matches("^intensity_")) %>%
276 277 278 279 280 281 282 283 284
            filter(str_detect(proteins, "STANDARD"))) %>%
        setNames(basename(pepFiles)) %>%
        map_df(~ gather(.x, feature, intensity, contains("intensity")) %>%
        separate(feature, c("type", "sample"), sep = "sity_"), .id = 'GROUP') %>%
        mutate(sample = str_replace_all(sample, oldNames, newNames), type = str_replace(type, "inten", "intensity"))

    print("STANDARD intensities on peptide level are available for the following samples:")
    unique(std_info$sample)

285

286
    std_info %>% ggplot(aes(sample, intensity, fill = type)) + geom_boxplot() + coord_flip()
287

288 289 290
    std_info %>%
        ggplot(aes(sample, sequence, fill = intensity)) + geom_tile() + facet_wrap(~type) +
        theme(axis.text.x = element_text(angle = 45, hjust = 1))
291

Lena Hersemann's avatar
Lena Hersemann committed
292
} else {
293
    print("Peptide level information were not provided or standard information were not found")
Lena Hersemann's avatar
Lena Hersemann committed
294
}
295

296 297


298 299
# init_stat %>% select(GROUP, protein_ids, unique_entry, ambiguous_entry, scrap_prop_unique, scrap_prop_ambiguous, matches("lfq_intensity")) %>%
#     gather(sample, lfq_intensity, -c(GROUP, unique_entry, ambiguous_entry, scrap_prop_unique, scrap_prop_ambiguous)) %>%
300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320
#     mutate(sample = str_replace(sample, "lfq_intensity_", "")) %>% left_join(renamingScheme, by = c("sample" = "old")) %>%
#     na.omit() %>% write_tsv("feature_information.txt")

#TODO: allow input parameter which specifies which data shall be discarded before the analysis
# currently only unique "scrap" entries are discarded

## extract protein accession from compound protein_ids column, trim away accession version to improve id-conversion rate and split non-unique entries
# perBatch %<>% map(~ mutate(.x, protein_acc=str_split_fixed(protein_ids, "[|]", 3) %>% get_col(2) %>% str_replace_all("[-].*", "")))
# perBatch %<>% map(~ mutate(.x, init_groups = protein_ids) %>%
#         separate_rows(protein_ids, sep = ";") %>%
#         ungroup() %>%
#         # extract protein accession and get information on contaminations and references
#         mutate(protein_acc = str_split_fixed(protein_ids, "[|]", 3) %>% get_col(2)) %>% #str_replace_all("[-].*", "")) %>%
#         mutate(scrap = case_when(str_detect(protein_ids, "^CON__")~"CON", str_detect(protein_ids, "^REV__")~"REV", str_detect(protein_ids, fixed("ST|RTSTANDARD"))~"STANDARD"))
#     )

# # check that there are no duplicated protein ids
# # perBatch[[1]] %<>% rbind(perBatch[[1]], perBatch[[1]])
# if (any (perBatch %>% map_df(~ filter(.x, is.na(scrap)) %>% count(is_dupl = duplicated(protein_acc)), .id = 'GROUP') %$% is_dupl)) {stop("ATTENTION: one of the samples contains duplicated protein ids")}


321
# perBatch %<>% map(~ mutate(.x, scrap = case_when(str_detect(protein_ids, "CON__")~"CON", str_detect(protein_ids, "REV__")~"REV", str_detect(protein_ids, fixed("ST|RTSTANDARD"))~"STANDARD")))
322 323 324 325 326 327 328


#'
#' <br><br>
#'
#' ### Presence of identification types (i.e. MS/MS or by_matching)
## extract all data in long table format, in case identification_type is know, add those information to the table and rename samples and export data for later annotation
329
sample_info <- perBatch %>% map_df(~ select(.x, -starts_with("identification_type")), .id = 'GROUP') %>% gather(sample, lfq, starts_with("intensity")) %>% mutate(sample = str_replace_all(sample, "intensity_", "")) %>% mutate(oldName = sample) %>% rename(file_name = GROUP) %>% mutate(gene_name = str_match(fasta_headers, "GN=([:alnum:]+)\\s+")[,2])
330 331

if (any(map(perBatch, ~colnames(.x) %>% str_detect(., "identification_type")) %>% unlist())) {
332
    sample_info <- perBatch %>% map_df(~ select(.x, -starts_with("intensity")), .id = 'GROUP') %>% gather(sample, identification_type, starts_with("identification_type")) %>% mutate(sample = str_replace_all(sample, "identification_type_", "")) %>% mutate(oldName = sample) %>% rename(file_name = GROUP) %>%
333
    # right_join(sample_info, by = c("sample", "file_name", "oldName", "protein_ids", "protein_acc", "scrap", "fasta_headers", "init_groups"))
334
    right_join(sample_info, by = c("sample", "file_name", "oldName", "protein_ids", "old_protein_ids", "fasta_headers"))
335 336 337 338 339 340
    ident_types <- TRUE
    print("Identification type data were provided")
} else {
    print("No identification type information found")
}

341 342 343
#TODO: report statistics on how many MS/MS or by matching were found for each gene for the individual replicates in the script
#TODO: summarize the MS/MS information as numeric value (e.g. percentage of MS/MS in all replicates) and add information to the limma output (consider to export information as html)

344 345
# TODO: find faster solution for renaming
sample_info$sample %<>% str_replace_all(., oldNames, newNames)
346
write_tsv(sample_info, paste0(results_prefix, ".feature_sample_information.txt"))
347

348 349 350
protein_info <- sample_info %>% distinct(protein_ids, fasta_headers) %>% group_by(protein_ids) %>%
    filter(max(nchar(fasta_headers))==nchar(fasta_headers)) %>% slice(1) %>% ungroup() %>%
    rowwise() %>%
351 352
    mutate(gene_name = paste(unlist(str_extract_all(fasta_headers, "GN=([:alnum:]+\\-?[:alnum:]?\\-?[:alnum:]?)")), collapse = "; ") %>% str_replace_all(., "GN=", ""),
        protein_acc = paste(unlist(str_extract_all(protein_ids, "[trsp]+\\|([:alnum:]+\\-?[:alnum:]?\\-?[:alnum:]?)")), collapse = "; ") %>% str_replace_all(., "sp|tr|[|]", ""))
353 354 355

write_tsv(protein_info, paste0(results_prefix, ".feature_information.txt"))

356

357 358 359
#'
#' <br><br>
#'
360 361
#' ### Reorder protein IDs of ambiguous entries
print("Number of ambiguous protein IDs with re-arranged order:")
362 363
perBatch %>% map_df(~ count(.x, protein_ids != old_protein_ids) %>% setNames(c("new_order", "count")) %>% spread(new_order, count),.id = 'GROUP') %>% kable()

364
#TODO: think about exporting some information on which proteinGroups were changed in their order (e.g. a column indicating changed order)
365

366 367
## combine individual files by protein_ids, protein_acc and scrap only
msData = perBatch %>%
368
    map(~ select(.x, protein_ids, matches("^intensity"))) %>%
369 370 371 372 373 374 375 376 377 378 379 380
    purrr::reduce(full_join, by="protein_ids")

sample_num <- ncol(msData)-1

## rename samples
names(msData) %<>% str_replace_all(., oldNames, newNames)

#'
#' <br><br>
#'
#' ## Sample level
#'
381 382
#' In case multiple input files are analysed, the tables are now merged and further processed as one table. To merge the tables the protein IDs were used and thus NAs will be reported if the input tables do not have all protein IDs in common.
#'
383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427
#' <br>
#'
#' ### Missing values per sample
## plot NA proportion based on the DataExplorer function profile_missing()
load_pack(DataExplorer)
#plot_missing(msData) +

# TODO: adjust prercentages
missing_value <- profile_missing(msData[, which(colnames(msData) != "protein_ids")])

ggplot(missing_value, aes_string(x = "feature", y = "num_missing", fill = "group")) +
    geom_bar(stat = "identity") +
    geom_text(aes(label = paste0(round(100 * pct_missing, 2), "%"))) +
    scale_fill_manual("Group", values = c("Good" = "cadetblue", "OK" = "cadetblue3", "Bad" = "coral1", "Remove" = "coral3"), breaks = c("Good", "OK", "Bad", "Remove")) +
    theme(legend.title = element_text(size=14, face="bold")) +
    coord_flip() +
    xlab("Features") + ylab("Missing Rows")


na_prop <- msData %>% gather(sample, intensity, -protein_ids) %$% { sum(is.na(intensity)/length(intensity))}


if (na_prop > 0) {
    msData %>% select(protein_ids, contains("intensity")) %>% gather(sample, intensity, -protein_ids) %>%
        group_by(protein_ids) %>% #first_group %>% print_all %>%
        summarize(num_na=sum(is.na(intensity)), num_meas=n(), na_prop=num_na/n()) %>%
        mutate(na_prop = MASS::fractions(na_prop)) %>%
        # ggplot(aes(as.factor(na_prop))) +
        ggplot(aes(na_prop)) +
        geom_bar(fill = "lightcyan4") +
        xlab("NA proportion") + ggtitle("Protein ID specific NA proportion across all samples")
}

#'
#' <br><br>
#'
#' ### Zero values across all samples
# TODO report in similar structure like the NA proportions
## plot zero proportion
msData %>% select(protein_ids, contains("intensity")) %>% gather(sample, intensity, -protein_ids) %>%
    group_by(protein_ids) %>% #first_group %>% print_all %>%
    summarize(num_zero=sum(intensity == 0), num_meas=n(), zero_prop=num_zero/n()) %>%
    ggplot(aes(zero_prop)) + geom_bar(fill = "lightcyan4") + xlab("Proportion of zeros") + ggtitle("Protein ID specific proportion of zeros across all samples")


428
names(msData) %<>% str_replace("intensity_", "")
429 430 431 432 433 434 435 436 437 438 439 440
# msData <- read_tsv("msData.txt")
#'  [msData](msData.txt)


# #' retain only proteins with na proportion less than 75%
# proteinsNaLt70 = naSummary %>% filter(na_prop <0.75) %>% pull(protein_acc)
# msData %<>% filter(protein_acc %in% proteinsNaLt70)


#'
#' <br><br>
#'
441
#' ### Contaminations removal
442
#' In this step, unique CON__ REV__ and STANDARD entries are removed. So far, ambiguous entries with simultaneous CON__ and protein accessions are kept.
443 444 445 446 447
# protein_info %>% mutate(sep_ids = str_split(protein_ids, ";")) %>% unnest(sep_ids) %>%
#     mutate(sep_prot = str_match(sep_ids, "[trsp]+\\|([:alnum:]+)")[,2]) %>%
#     na.omit() %>%
#     group_by(protein_ids) %>%
#     distinct(protein_ids, fasta_headers, protein_acc, gene_name) %>% ungroup()
448

449 450 451
msData %<>% mutate(is_scrap = protein_ids %in% scrap_ids) %>% left_join(protein_info, by = "protein_ids")

write_tsv(msData, path=add_prefix("intensities_incl_ctrls.txt"))
452 453

msData %>% filter(is_scrap) %>% select(protein_ids) %>% DT::datatable(caption="controls removed from data")
454 455

tribble(~intial_data, ~filtered_data, ~removed_rows,
456
    nrow(msData), nrow(filter(msData, !is_scrap)), nrow(filter(msData, is_scrap))) %>% kable()
457

458
msData %<>% filter(!is_scrap) %>% select(-is_scrap, -fasta_headers, -protein_acc, -gene_name)
459 460 461 462 463 464 465

stopifnot(nrow(filter(msData, str_length(protein_ids)==0)) ==0)


#'
#' <br><br>
#'
466
#' ### Protein abundance / intensities
467 468 469 470 471
#' List of the 25 most abundant proteins across all samples
msData %>% gather(sample, intensity, -protein_ids) %>% group_by(protein_ids) %>%
    summarize(sum_intensity=sum(intensity, na.rm=T)) %>%
    arrange(-sum_intensity) %>% slice(1:25) %>%
    # left_join(sample_info %>% distinct(protein_ids, protein_acc, gene_name, fasta_headers), by = "protein_ids") %>%
472
    left_join(protein_info, by = "protein_ids") %>%
473 474 475 476 477 478 479 480 481
    transmute(protein_ids, gene_name, sum_intensity, fasta_headers) %>%
    table_browser()


#'
#' <br><br>
#'
#' ## Data Normalization
#'
482 483
#' How do the intensities look like?
protsData = gather(msData, sample, intensity, -protein_ids) %>% mutate(sample = str_replace(sample, "intensity_", "")) %T>% glimpse
484

485
p1 <- protsData %>% ggplot(aes(intensity)) + geom_histogram(fill = "lightcyan4") + scale_x_log10() + ggtitle("intensities across all samples")
486
p2 <- protsData %>% ggplot(aes(sample, weight=intensity)) +
487
    geom_bar(fill = "lightcyan4") + coord_flip() + ggtitle("intensity totals per sample")
488 489 490

grid.arrange(p1, p2, nrow = 1)

491
protsData %>% ggplot(aes(intensity)) + geom_histogram(fill = "lightcyan4") + scale_x_log10() + ggtitle("intensities") + facet_wrap(~sample)
492 493


494
protsData %>% ggplot(aes(sample, intensity+1)) + geom_boxplot(fill = "lightcyan4") + ggtitle("intensities") + scale_y_log10() + coord_flip() + ylab("intensity")
495 496 497 498

## when using linear scale, we discard 0s to avoid biases means
protsData %>% filter(intensity>0) %>% ggplot(aes(sample, intensity)) +
    geom_boxplot(fill = "lightcyan4") +
499
    ggtitle("intensities > 0") +
500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521
    scale_y_continuous(limits=c(0, 5E6)) +
    coord_flip()


## also compare top10 intensities per sample
heg_data <- protsData %>%
    group_by(sample) %>%
    top_n(10, intensity) %>%
    # arrange(-intensity) %>% slice(1:10) %>%
    left_join(sample_info, by = c("sample", "protein_ids", "intensity" = "lfq")) %>%
    # first_group() %>%
    ungroup() %>%
    arrange(sample, intensity) %>%
    # mutate(order = row_number(), unique_protein = !str_detect(init_groups, ";"))
    mutate(order = row_number(), unique_protein = !str_detect(protein_ids, ";"))

d1 <- ggplot(heg_data, aes(order, intensity, fill = unique_protein)) + geom_col(stat = "identity") + coord_flip() + xlab("") + scale_x_continuous(breaks = heg_data$order, labels = heg_data$gene_name, expand = c(0,0)) + scale_fill_manual(values=c("coral1", "cadetblue")) +
    ylab("") +
    theme(legend.position = "bottom") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))

if("identification_type" %in% colnames(heg_data) == FALSE){
522 523
    d2 <- d1 + facet_wrap(~sample, scales="free_y", ncol = 2) + ggtitle("intensities of top 10 proteins per sample - same scale")
    d3 <- d1 + facet_wrap(~sample, scales="free", ncol = 2) + ggtitle("intensities of top 10 proteins per sample - different scale")
524
} else {
525 526
    d2 <- d1 + geom_point(aes(order, intensity, shape = identification_type), fill = "black") + scale_shape_manual(values=c(20, 1)) + facet_wrap(~sample, scales="free_y", ncol = 2) + ggtitle("intensities of top 10 proteins per sample - same scale")
    d3 <- d1 + geom_point(aes(order, intensity, shape = identification_type), fill = "black") + scale_shape_manual(values=c(20, 1)) + facet_wrap(~sample, scales="free", ncol = 2) + ggtitle("intensities of top 10 proteins per sample - different scale")
527 528
}

529
write_tsv(heg_data, path=add_prefix("top10_highest_intensities.txt"))
530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545

#+ fig.height=length(unique(heg_data$sample))/2+5, eval=length(unique(heg_data$sample))>0
d2

#+ fig.height=length(unique(heg_data$sample))/2+8, eval=length(unique(heg_data$sample))>0
d3

#'
#' <br><br>
#'
#' ## Comparison model and design file

normData = protsData

#' design

546 547
expDesign %>% kable()

548 549 550 551 552 553 554 555 556 557 558 559
lapply(colnames(expDesign)[which(colnames(expDesign) != "replicate")], function(x){
    expDesign[,which(colnames(expDesign) == x)] %>%
        setNames("feature") %>%
        ggplot(aes(as.factor(feature))) +
        geom_bar(fill = "lightcyan4") +
        ggtitle(paste0("samples per ", x)) +
        xlab("") +
        theme(axis.text.x = element_text(angle = 45, hjust = 1))
}) %>% grid.arrange(grobs = ., ncol=3)



560 561 562 563
#
# <br><br>
#
# ## Indent status information
564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582

# identTypes = read_tsv("../ident_types_by_sample.txt")
# expDesign = read_tsv("../exp_design.txt") %>% select(replicate, condition) %>% distinct

if(exists("ident_types")) {

    identType <- perBatch %>%
            map(~ select(.x, protein_ids, matches("identification_type"))) %>%
            reduce(full_join, by = "protein_ids")

    names(identType) %<>% str_replace_all(., oldNames, newNames) %>% str_replace_all( "identification_type_", "")

    ## summarize MS-MS status for all three replicates per condition (pattern: x1|x2|x3 with x1 = MS/MS, x2 = by_matching and x3 = NA)
    identSummary = identType %>% gather(sample, ident_status, -protein_ids) %>%
        left_join(expDesign, by=c("sample"="replicate")) %>%
        mutate_inplace(ident_status, str_replace_all(c("By MS/MS"="ms_by_ms", "By matching"="matching"))) %>%
        mutate_at(vars(ident_status), funs(replace(., is.na(.), "missing"))) %>%
        count(protein_ids, condition, ident_status) %>%
        spread(ident_status, n) %>%
583 584 585
        mutate_if(is_integer, funs(replace(., is.na(.), 0)))

    if (any(colnames(identSummary) != "missing")) { identSummary %<>% mutate(missing = 0) }
586

587
    identSummary %<>% mutate(ident_by_msms_match_missing=paste(ms_by_ms, matching, missing, sep="|")) %>%
588
        select(protein_ids, condition, ident_by_msms_match_missing)
589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606

    write_tsv(identSummary, path=add_prefix("ident_types_summary.txt"))
    # identSummary <- read_tsv("ident_types_summary.txt")
}


#'
#' <br><br>
#'
#' ## Abundance matrix imputation


if (na_prop > 0) {
    normData %>%
        left_join(expDesign, by=c("sample"="replicate")) %>%
        group_by(condition, protein_ids) %>%
        summarize(num_vals=n(), na_prop=(sum(is.na(intensity))/length(intensity) )%>% round(3)) %>%
    # count(num_vals, na_prop)
607
        ggplot(aes(as.factor(na_prop))) + geom_bar() + facet_wrap(~num_vals, scales="free_x") + ggtitle("na proportions by number of replicates per condition") + xlab("NA proportion")
608 609
}

610
#' In the following, missing intensities are imputated if intensities were reported for more than 60% of provided replicates. All remaining NA values will be set to zero.
611

612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627
imputeData = normData %>%
    left_join(expDesign, by=c("sample"="replicate")) %>%
    group_by(condition, protein_ids) %>%
    mutate(intensity_imp=impute_some(intensity)) %>%
    ungroup() %>% select(protein_ids, sample, intensity_imp) %>%
    spread(sample, intensity_imp)

# before impute
# msData[, 1:4] %>% arrange(protein_ids) %>% tail()
# after mean-impute of semi-complete records
# imputeData[, 1:4] %>% arrange(protein_ids) %>% tail()

print(paste0(nrow(anti_join(msData, imputeData)), " out of ", nrow(msData), " rows changed due to imputation"))

if (sample_num != ncol(imputeData)-1) {stop("ATTENTION: Final intensities matrix does not contain all input samples")}

628
#TODO: report information for each gene and condition, whether any value was imputed and add information to final limma results
629 630 631 632 633 634 635 636

# after zero-imputing of all remaining NAs
imputeData %<>% mutate_if(is.numeric, funs(replace(., is.na(.), 0)))
# imputeData
write_tsv(imputeData, path=add_prefix("intens_imputed.txt"))
# imputeData <- read_tsv(add_prefix("intens_imputed.txt"))
#  [imputeData](`r add_prefix("intens_imputed.txt")`)

637 638 639


#-----------------------------------------------------------------------------------------------------------------------
640
# save copy of the used ms_data_prep.R script
641
system("cp ${NGS_TOOLS}/ms_workflow/ms_data_prep.R ./.ngs_tools_ms_data_prep_$(date +%Y%m%d_%H%M).R")
642 643

# get R version and package infos
644
writeLines(capture.output(devtools::session_info()), ".sessionInfo.txt")
645

646 647
session::save.session(".ms_data_prep.R.dat")
# session::restore.session(".ms_data_prep.R.dat")