Commit 0100dc31 authored by luppino
training gbm model for DeMAG

parent 6dcafbf1
## libraries:
library(data.table)
library(stepPlr)
library(gbm)
library(ROCR)
library(caret)
library(stringr)
library(inTrees)
library(tidyr)
# mltools is also needed (called below as mltools::mcc)
db = fread("ACMG59_training_v4.pph.output",stringsAsFactors = T)
db = db[label!="hgmd_deleterious",]
# iupred
disorder = fread("iupred_acmg59.txt",stringsAsFactors = T)
db = merge(db,disorder[,-"aa1"],by=c("acc","pos"),all.x = T)
#evmutation
evmut = fread("evmut_variants_v2.txt",col.names = c("acc","pos","aa1","aa2","evm_score_v2"),stringsAsFactors = T)
db = merge(db,evmut,by=c("acc","pos","aa1","aa2"),all.x = T)
# evmutation for humortho_neutral variants: the score is looked up with aa1 and aa2 swapped
db[label=="humortho_neutral", evm_score_v2 := evmut[db[label=="humortho_neutral",], on = .(acc=acc, aa1=aa2, aa2=aa1, pos=pos), evm_score_v2]]
#coupling partners:
coup = fread("partners_score.txt",stringsAsFactors = T,col.names=c("pos","acc","partners","partners_score"))
db = merge(db,coup[,-3],by=c("acc","pos"),all.x = T)
## alphafold plddt
plddt = fread("alphafold_plddt.txt")
db = merge(db,plddt[,1:3],by.x=c("gene","pos"),by.y = c("id","aa1"),all.x=T)
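## db now combines the PolyPhen-2 features with the disorder (iupred), EVmutation,
## coupling-partner and AlphaFold pLDDT annotations merged above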
db = db[,.(
  acc,
  #length,
  #pos,
  aa1,
  aa2,
  #str,
  #ntlen,
  #ntpos,
  #nt1,
  #nt2,
  #dref = as.factor(dref),
  #exon=as.numeric(str_extract(exon,"[^/]+"))/as.numeric(gsub(".*/","",exon)),
  #cexon=as.numeric(str_extract(cexon,"[^/]+"))/as.numeric(gsub(".*/","",cexon)),
  #txcov=as.numeric(str_extract(txcov,"[^/]+"))/as.numeric(gsub(".*/","",txcov)),
  gerprs,
  phylop,
  trv = as.factor(trv),
  CpG = as.factor(CpG),
  #JXmin,
  #JXc,
  frame = as.factor(frame),
  dgn = as.factor(dgn),
  cdn2,
  cdn1,
  PfamHit = as.factor(ifelse(PfamHit!="NO","YES","NO")),
  site = as.factor(ifelse(site!="NO","YES","NO")),
  region = as.factor(ifelse(region!="NO","YES","NO")),
  #PHAT,
  dScore,
  Score1,
  Score2,
  Nobs,
  Nseqs,
  Nsubs,
  Nvars,
  Nres,
  DistPmin,
  DistPSNP,
  DistQmin,
  BaRE,
  #RVISraw,
  Nstr,
  Nfilt,
  #PDB_len,
  #PDB_pos,
  #PDB_idn,
  dVol,
  dProp,
  SecStr,
  #MapReg,
  NormASA,
  `B-fact`,
  #`H-bonds`,
  evm_score_v2,
  iupred,
  partners_score, #FoldX_continuous,
  plddt,
  label1)]
# dummy-code the label variable: deleterious = 1, everything else = 0
db = db[,"label1":=ifelse(label1=="deleterious",1,0)]
#all model features
db = db[,c("acc","label1","Score1","Score2","dScore","partners_score","evm_score_v2","iupred","Nsubs","BaRE","plddt","phylop","Nres","DistQmin","NormASA")]
#clinvar features
#db = db[,c("acc","label1","Score1","Score2","dScore","partners_score","evm_score_v2","DistPmin","Nsubs","BaRE","plddt","phylop","Nres","DistQmin","NormASA")]
#optionally restrict to complete cases only
#db = db[complete.cases(db),]
# without epistatic features
#db = db[,c("acc","label1", "Score1","Score2","dScore","iupred","plddt","Nsubs","BaRE","phylop","Nres","DistQmin","NormASA")]
# without structurally derived features
#db = db[,c("acc","label1", "Score1","Score2","dScore","evm_score_v2","partners_score","Nsubs","BaRE","phylop","Nres","DistQmin")]
#shuffle the column order as a sanity check that results do not depend on feature order
set.seed(1)
cols = sample(names(db))
db = db[,..cols]
## create a copy
dataset = as.data.frame(copy(db))
################################## CV for gbm ##################################################
set.seed(135)
### train and test
# search_grid= expand_grid("shrinkage"=seq(0.001,0.01,length.out = 3),
# "interaction.depth"=1:3,
# "AUC"=0,
# "bestcutoff"=0,
# "Accuracy"=0,
# "Sensitivity"=0,
# "Specificity"=0,
# "MCC"=0,
# "PPV"=0,
# "NPV"=0)
# no separate held-out split; the grouped CV below uses the full dataset
drops = c("acc","gene","label1")
train = dataset[ , !(names(dataset) %in% drops)]
cl.train = dataset[,"label1"]
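## note: train / cl.train are kept in the workspace; the grouped CV below works on `data` directly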
# training data
k = 4 # number of grouped CV folds
data = dataset
#data = dataset[complete.cases(dataset),]
folds = groupKFold(data$acc, k = k) # grouped folds: all variants of one protein (acc) stay in the same fold
sapply(folds, function(x) prop.table(table(data[x,"label1"]))) # class balance within each training fold
sapply(folds, function(x) nrow(data) - length(x)) # size of each held-out test fold
drops = c("acc","gene")
data = data[, !(names(data) %in% drops)]
statistics.test = data.frame("k" = rep(1:k),
                             "AUC" = rep(NA,k),
                             "bestcutoff" = rep(NA,k),
                             "Accuracy" = rep(NA,k),
                             "Sensitivity" = rep(NA,k),
                             "Specificity" = rep(NA,k),
                             "MCC" = rep(NA,k),
                             "PPV" = rep(NA,k),
                             "NPV" = rep(NA,k))
final_model = data.frame("AUC" = NA,
                         "bestcutoff" = NA,
                         "Accuracy" = NA,
                         "Sensitivity" = NA,
                         "Specificity" = NA,
                         "MCC" = NA,
                         "PPV" = NA,
                         "NPV" = NA)
#for (j in 1:nrow(search_grid)){
### prepare cross validation for the training data
#store err for each fold
cl.gbm.test = vector(mode = "list",k)
cl.gbm.test.c = vector(mode = "list",k)
labels.test = vector(mode = "list",k)
models = vector("list",length = k)
## partition of each fold
for (i in 1:k){
  cv.train = data[folds[[i]],]
  cv.test = data[-folds[[i]],]
  fit.gbm = gbm(formula = label1~.,
                distribution = "bernoulli",
                data = cv.train,
                n.trees = 10000,
                interaction.depth = 2,
                shrinkage = 0.001,
                n.minobsinnode = 40,
                cv.folds = 5)
  #choose the tree that minimizes the cv errors for prediction
  bestTree = gbm.perf(fit.gbm, plot.it = F, method = "cv")
  # predictions and labels for test set
  cl.gbm.test[[i]] = predict(fit.gbm, newdata = cv.test, n.trees = bestTree, type = "response")
  labels.test[[i]] = cv.test[,"label1"]
  pred = prediction(cl.gbm.test[[i]], labels.test[[i]])
  auc = performance(pred, measure = "auc")
  tpfp = performance(pred, measure = "tpr", x.measure = "fpr")
  acc = performance(pred, "acc")
  statistics.test[i,"AUC"] = auc@y.values[[1]]
  statistics.test[i,"bestcutoff"] = acc@x.values[[1]][which.max(acc@y.values[[1]])]
  statistics.test[i,"Accuracy"] = max(acc@y.values[[1]])
  cl.gbm.test.c[[i]] = rep(0, length(labels.test[[i]]))
  cl.gbm.test.c[[i]][cl.gbm.test[[i]] > round(statistics.test[i,"bestcutoff"],1)] = 1
  cm = caret::confusionMatrix(as.factor(cl.gbm.test.c[[i]]), as.factor(labels.test[[i]]), positive = "1")
  statistics.test[i,"Sensitivity"] = round(cm$byClass[[1]],2)
  statistics.test[i,"Specificity"] = round(cm$byClass[[2]],2)
  statistics.test[i,"MCC"] = round(mltools::mcc(cl.gbm.test.c[[i]], labels.test[[i]]),2)
  statistics.test[i,"PPV"] = round(cm$byClass[[3]],2)
  statistics.test[i,"NPV"] = round(cm$byClass[[4]],2)
  models[[i]] = fit.gbm
}
###
# search_grid[j,"AUC"] = round(mean(statistics.test$AUC),2)
# search_grid[j,"bestcutoff"] = round(mean(statistics.test$bestcutoff),2)
# search_grid[j,"Accuracy"] = round(mean(statistics.test$Accuracy),2)
# search_grid[j,"Sensitivity"] = round(mean(statistics.test$Sensitivity),2)
# search_grid[j,"Specificity"] = round(mean(statistics.test$Specificity),2)
# search_grid[j,"MCC"] = round(mean(statistics.test$MCC),2)
# search_grid[j,"PPV"] = round(mean(statistics.test$PPV),2)
# search_grid[j,"NPV"] = round(mean(statistics.test$NPV),2)
#}
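## optional check: average the test metrics over the k folds
round(colMeans(statistics.test[,-1]), 2)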
# ###########################################################################
# # aggregated average after CV
# ###########################################################################
final_model[2,"type"] = "CV aggregated"
pred_agg = prediction(unlist(cl.gbm.test), unlist(labels.test))
auc_agg = performance(pred_agg, measure = "auc")
tpfp_agg = performance(pred_agg, measure = "tpr", x.measure = "fpr")
final_model[2,"AUC"] = round(auc_agg@y.values[[1]],2)
cm = caret::confusionMatrix(as.factor(unlist(cl.gbm.test.c)),as.factor(unlist(labels.test)),positive = "1")
final_model[2,"Accuracy"] = round(cm$overall[[1]],2)
final_model[2,"Sensitivity"] = round(cm$byClass[[1]],2)
final_model[2,"Specificity"] = round(cm$byClass[[2]],2)
final_model[2,"MCC"] = round(mltools::mcc(as.factor(unlist(cl.gbm.test.c)),as.factor(unlist(labels.test))),2)
final_model[2,"PPV"] = round(cm$byClass[[3]],2)
final_model[2,"NPV"] = round(cm$byClass[[4]],2)
#
# ###########################################################################
#
#
final_model[1,"type"] = "test on training"
## final model (TESTING ON TRAINING)
final.gbm = gbm(formula = label1~.,
                distribution = "bernoulli",
                data = data,
                n.trees = 10000,
                interaction.depth = 2,
                shrinkage = 0.001,
                n.minobsinnode = 40,
                cv.folds = 5)
#choose the tree that minimizes the cv errors for prediction
final_bestTree = gbm.perf(final.gbm,plot.it = F, method = "cv")
final.test = predict(final.gbm,newdata=data,n.trees=final_bestTree,type="response")
final.true = data[,"label1"]
pred = prediction(final.test,final.true)
auc = performance(pred, measure = "auc")
tpfp = performance(pred, measure = "tpr", x.measure = "fpr")
acc = performance(pred,"acc")
final_model[1,"AUC"] = auc@y.values[[1]]
final_model[1,"bestcutoff"] = acc@x.values[[1]][which.max(acc@y.values[[1]])]
final_model[1,"Accuracy"] = max(acc@y.values[[1]])
cl.gbm.train = rep(0,length(final.true))
cl.gbm.train[final.test>round(final_model[1,"bestcutoff"],1)]=1
cm = caret::confusionMatrix(as.factor(cl.gbm.train),as.factor(final.true),positive = "1")
final_model[1,"Sensitivity"] = round(cm$byClass[[1]],2)
final_model[1,"Specificity"] = round(cm$byClass[[2]],2)
final_model[1,"MCC"] = round(mltools::mcc(cl.gbm.train,final.true),2)
final_model[1,"PPV"] = round(cm$byClass[[3]],2)
final_model[1,"NPV"] = round(cm$byClass[[4]],2)
#
save.image("training_without_hgmd.RData")
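## usage sketch, assuming a hypothetical data.frame `new_variants` with the same feature
## columns used to fit `final.gbm` (Score1, Score2, dScore, partners_score, evm_score_v2,
## iupred, Nsubs, BaRE, plddt, phylop, Nres, DistQmin, NormASA):
# load("training_without_hgmd.RData")
# new_scores = predict(final.gbm, newdata = new_variants, n.trees = final_bestTree, type = "response")
# new_calls  = as.integer(new_scores > round(final_model[1,"bestcutoff"],1))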