github output

0bb8ad6c · luppino · 89f5ac32 · 0bb8ad6c
Commit 0bb8ad6c authored 2 years ago by luppino
--- a/demag_newgenes/runDeMAG_newgenes.md
+++ b/demag_newgenes/runDeMAG_newgenes.md
+------------------------------------------------------------------------
+
+### Run DeMAG on new genes
+
+In order to show that DeMAG is generalizable we run the model on a set
+of extra 334 genes that have at least 5 benign and 5 pathogenic
+high-quality (at least 2 review stars) variants in ClinVar (version
+20220812).
+
+For the features we used:
+
+-   precomputed EVmutation scores from the
+    [webserver](https://marks.hms.harvard.edu/evmutation/)
+-   [IUPred2A](https://iupred2a.elte.hu/) disorder scores that we
+    obtained with the command line application. To note here that for
+    the UniProt id Q8NFD5, we specified Q8NFD5-1 because IUPred2A
+    considered as canonical isoform the id Q8NFD5-5 (as correctly
+    indicated in UniProt).
+-   pLDDT scores obtained from the AlphaFold pdb files. For protein
+    longer than 1800 residues we assembled AlphaFold multiple models to
+    produce an ensemble structure. Nevertheless, these models are highly
+    unreliable. (AlphaFold model is missing for UniProt id Q9NZV5).
+-   Partners score that we designed based on 3D contacts between C-alpha
+    atoms but without evolutionary coupled residues because it would
+    have required to run an alignment pipeline not optimized for
+    hundreds of genes.
+-   [PolyPhen-2](http://genetics.bwh.harvard.edu/pph2/dokuwiki/downloads)
+    was used to annotate the remaining 9 features (Nsubs, Score2, Nres,
+    Score1, phylop, dScore, NormASA, BaRE, DistQmin). Check
+    [here](http://genetics.bwh.harvard.edu/pph2/dokuwiki/appendix_a) for
+    a description of the features.
+
+``` r
+library(data.table)
+library(stringr)
+library(gbm)
+```
+
+    ## Loaded gbm 2.1.8
+
+``` r
+library(caret)
+```
+
+    ## Loading required package: lattice
+
+    ## Loading required package: ggplot2
+
+``` r
+setwd("/Users/luppino/ownCloud/GitLab/DeMAG/demag_newgenes/")
+
+new_genes   = fread("/Users/luppino/ownCloud/GitLab/DeMAG/demag_newgenes/genes_at_least_5.tsv")
+neut        = fread("clinvar_20220812.neutral.clean.pph2.output")
+dele        = fread("clinvar_20220812.deleterious.clean.pph2.output")
+
+neut$label1 = "neutral"
+dele$label1 = "deleterious"
+
+df   = rbind(neut,dele)
+df   = df[acc%in%new_genes$From,]
+
+# do not consider genes with length greater than 1800 residues because AF models are unreliable
+df   = df[length<=1800,]
+
+## attach plddt
+plddt      = fread("/Users/luppino/ownCloud/GitLab/DeMAG/demag_newgenes/genes_at_least_5_plddt.txt",col.names = c("acc","pos","plddt"))
+df         = merge(df,plddt,by=c("acc","pos"),all.x=T)
+## attach iupred
+iupred     = fread("/Users/luppino/ownCloud/GitLab/DeMAG/demag_newgenes/genes_at_least_5_iupred.txt",col.names = c("pos","aa1","iupred","acc"))
+iupred$acc = str_extract(iupred$acc,"[^_]+")
+df         = merge(df,iupred,by=c("acc","pos","aa1"),all.x=T)
+## attach partners score
+partners   = fread("/Users/luppino/ownCloud/Gitlab/DeMAG/demag_newgenes/genes_at_least_5_partners_score.txt")
+df         = merge(df,partners,by=c("pos","acc"),all.x=T)
+## evmutation
+evmut      = fread("/Users/luppino/ownCloud/Gitlab/DeMAG/demag_newgenes/genes_at_least_5_evmut.txt")
+evmut      = evmut[!duplicated(evmut[,.(id,pos,wt,subs)]),]
+df         = merge(df,evmut[,.(id,pos,wt,subs,evm_score_v2=prediction_epistatic)],by.x=c("acc","pos","aa1","aa2"),by.y=c("id","pos","wt","subs"),all.x=T)
+
+
+##load model (it is located in the models folder)
+load("/Users/luppino/ownCloud/PhD_results/R workspace/gbm.RData")
+
+## try to exclude some features to check drop in performance
+#df$partners_score = NA
+#df$evm_score_v2   = NA
+
+## DeMAG prediction
+cl.gbm        = predict(final.gbm,newdata=df,n.trees=final_bestTree,type="response")
+variants      = df[,"DeMAG_prediction":=cl.gbm]
+variants      = variants[,"DeMAG":=ifelse(DeMAG_prediction>=0.5,"deleterious","neutral")]
+
+caret::confusionMatrix(as.factor(variants$DeMAG),as.factor(variants$label1),positive="deleterious")
+```
+
+    ## Confusion Matrix and Statistics
+    ## 
+    ##              Reference
+    ## Prediction    deleterious neutral
+    ##   deleterious        4676     423
+    ##   neutral             463    2349
+    ##                                           
+    ##                Accuracy : 0.888           
+    ##                  95% CI : (0.8808, 0.8949)
+    ##     No Information Rate : 0.6496          
+    ##     P-Value [Acc > NIR] : <2e-16          
+    ##                                           
+    ##                   Kappa : 0.7548          
+    ##                                           
+    ##  Mcnemar's Test P-Value : 0.1901          
+    ##                                           
+    ##             Sensitivity : 0.9099          
+    ##             Specificity : 0.8474          
+    ##          Pos Pred Value : 0.9170          
+    ##          Neg Pred Value : 0.8353          
+    ##              Prevalence : 0.6496          
+    ##          Detection Rate : 0.5911          
+    ##    Detection Prevalence : 0.6445          
+    ##       Balanced Accuracy : 0.8787          
+    ##                                           
+    ##        'Positive' Class : deleterious     
+    ##