|
|
|
|
|
#' Upsample rows so that the Class x scn strata have equal size.
#'
#' Has the `function(x, y)` -> `list(x = , y = )` shape used for a custom
#' caret sampling function (it is stored as `samplingfct$func` in this file).
#'
#' The most frequent (Class, scn) combination is kept unchanged. The other
#' rows are split into three strata relative to that combination
#' (same scn / other Class, other scn / other Class, other scn / same Class)
#' and each stratum is resampled with replacement to the size of the most
#' frequent combination.
#'
#' @param x Predictors (data frame, or anything `as.data.frame()` accepts).
#'   Must contain a column named `scn`.
#' @param y Outcome values, one per row of `x`.
#' @return A list with `x` (the upsampled predictors, always a data frame,
#'   without any column whose name contains "Class") and `y` (the matching
#'   outcome values).
func <- function(x, y) {
  xup <- if (is.data.frame(x)) x else as.data.frame(x)
  xup$Class <- y

  frqtab <- data.frame(table(xup[, c("Class", "scn")]))
  # Last element of order() is the largest count; ties resolve to the later
  # row of the frequency table, exactly as before.
  frqmax <- frqtab[tail(order(frqtab$Freq), 1), ]
  target_n <- frqmax$Freq

  same_scn <- xup$scn %in% frqmax$scn
  same_class <- xup$Class %in% frqmax$Class

  # Draw `target_n` rows with replacement from the rows selected by `mask`.
  # An empty stratum cannot be resampled (sample() would abort the whole
  # resampling run), so it is skipped with a warning instead.
  resample_rows <- function(mask, label) {
    candidates <- rownames(xup)[mask]
    if (length(candidates) == 0L) {
      warning("no rows available for stratum '", label,
              "'; stratum skipped during upsampling", call. = FALSE)
      return(xup[0, , drop = FALSE])
    }
    picked <- candidates[sample.int(length(candidates), size = target_n,
                                    replace = TRUE)]
    xup[picked, , drop = FALSE]
  }

  # The three draws happen in this fixed order so the random number stream is
  # consumed in the same sequence as in the original implementation.
  scn_only <- resample_rows(same_scn & !same_class, "same scn, other Class")
  neither <- resample_rows(!same_scn & !same_class, "other scn, other Class")
  class_only <- resample_rows(!same_scn & same_class, "other scn, same Class")

  xup <- rbind(
    xup[same_scn & same_class, , drop = FALSE],
    scn_only,
    neither,
    class_only
  )

  keep_cols <- !grepl("Class", colnames(xup), fixed = TRUE)
  list(
    # drop = FALSE keeps a data frame even when only one predictor remains.
    x = xup[, keep_cols, drop = FALSE],
    y = xup$Class
  )
}
|
|
|
# Custom sampling specification, passed to caret::trainControl(sampling = ...)
# further down in this file. `func` is the upsampling function defined above.
# NOTE(review): in caret, `first = TRUE` presumably requests sampling before
# pre-processing -- confirm against the caret documentation.
samplingfct <- list(
  name = "upsampling to balance Class and scn!",
  func = func,
  first = TRUE
)
|
|
|
#' Project per-position variant counts of one gene onto the family alignment.
#'
#' Counts how often each position occurs in `variants`, then writes those
#' counts into a zero vector with one entry per alignment row. Position `k`
#' refers to the k-th row of the gene's alignment column that is not a gap
#' ("-").
#'
#' @param gene Column of `alignmentfile` holding the gene's aligned sequence.
#' @param variants Vector of positions within the gene (ungapped numbering);
#'   repeated positions are counted.
#' @param alignmentfile Alignment with one row per alignment column and one
#'   column per gene; gaps are encoded as "-".
#' @return Numeric vector of length `nrow(alignmentfile)` with the variant
#'   count at each alignment row (0 where there is no variant or a gap).
gene2familyalignment_quant <- function(gene, variants, alignmentfile) {
  # Take positions and counts straight from the table. The previous version
  # read `variant$variant` from a data frame whose column was actually named
  # `variants`, which only worked through `$` partial matching.
  counts <- table(variants)
  positions <- as.integer(names(counts))
  freqs <- as.vector(counts)

  gene_column <- alignmentfile[, gene]
  bigfamilyalignment <- rep(0, nrow(alignmentfile))

  # Nested replacement: first restrict to the non-gap rows of this gene, then
  # address the gene positions inside that restricted view.
  bigfamilyalignment[which(gene_column != "-")][positions] <- freqs

  bigfamilyalignment
}
|
|
|
#' Circular moving average.
#'
#' Applies an equal-weight linear filter of length `windowsize` to `x` with
#' `stats::filter()`, wrapping around at both ends (`circular = TRUE`), so the
#' result has no missing values at the edges.
#'
#' @param x Numeric vector to smooth.
#' @param windowsize Number of consecutive values averaged per output value.
#' @return The object returned by `stats::filter()`, same length as `x`.
ma <- function(x, windowsize) {
  weights <- rep(1 / windowsize, windowsize)
  # TRUE instead of T: T is an ordinary variable and can be rebound.
  stats::filter(x, weights, circular = TRUE)
}
|
|
|
#' Smoothed variant density along one gene for one functional category.
#'
#' Sums, per alignment row, all columns of `varonfamilyalignment` whose name
#' matches `funcycat`, keeps the rows where `gene1` has no gap ("-") in the
#' alignment, smooths them with the moving average `ma()` and returns the
#' smoothed values at the positions listed for the gene in `featuretable`.
#'
#' @param gene1 Gene name; a column of `alignmentfile` and a value of
#'   `featuretable$gene`.
#' @param funcycat Pattern (used as a regular expression by `grepl()`)
#'   selecting columns of `varonfamilyalignment`.
#' @param featuretable Table with columns `gene` and `pos`. The row filter
#'   below uses a bare column name inside `[ ]`, so this presumably has to be
#'   a data.table -- confirm against the caller.
#' @param wind Window size passed to `ma()`.
#' @param alignmentfile Family alignment with one column per gene.
#' @param varonfamilyalignment Matrix of variant counts per alignment row.
#' @return Smoothed values at the gene's positions.
vardens <- function(gene1, funcycat, featuretable, wind, alignmentfile,
                    varonfamilyalignment) {
  category_cols <- grepl(funcycat, colnames(varonfamilyalignment))
  # as.matrix() guards against a single matching column collapsing to a vector.
  category_counts <- as.matrix(varonfamilyalignment[, category_cols])
  per_row_total <- rowSums(category_counts)

  gene_has_residue <- !(as.data.frame(alignmentfile)[, gene1] %in% "-")
  smoothed <- ma(x = per_row_total[gene_has_residue], windowsize = wind)

  gene_positions <- featuretable[gene %in% gene1]$pos
  smoothed[gene_positions]
}
|
|
|
|
|
# Resampling configuration shared by the caret::train() call in this file:
# 10-fold cross-validation repeated 10 times, class probabilities enabled,
# and the custom upsampling specification `samplingfct` applied when sampling.
fitControl <- caret::trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 10,
  classProbs = TRUE,
  sampling = samplingfct
)
|
|
|
|
|
#' Summarise classification performance for the gof/lof predictions.
#'
#' Combines the output of `multiClassSummary()`, a Matthews correlation
#' coefficient from `mcc()` (with "gof" coded as 1 and everything else as 0)
#' and the `twoClassSummary()` output rounded to two digits, then returns a
#' fixed selection of metrics.
#'
#' @param out Data frame of predictions with at least the columns `obs` and
#'   `pred`, plus whatever `multiClassSummary()` / `twoClassSummary()` need
#'   (presumably class-probability columns `gof` and `lof` -- confirm).
#' @return Named numeric vector with the entries Balanced_Accuracy, Sens,
#'   Spec, AUC, Precision, Recall, F1, prAUC, Kappa and MCC.
modelperformance <- function(out) {
  class_levels <- c("gof", "lof")
  as_gof_indicator <- function(labels) ifelse(labels %in% "gof", 1, 0)

  res <- c(
    multiClassSummary(out, lev = class_levels),
    # Name the coefficient where it is created. The previous version renamed
    # element 15 afterwards, which silently mislabels the result whenever
    # multiClassSummary() does not return exactly 14 values.
    MCC = unname(mcc(
      preds = as_gof_indicator(out$pred),
      actuals = as_gof_indicator(out$obs)
    )),
    round(twoClassSummary(out, lev = class_levels), digits = 2)
  )

  res[c("Balanced_Accuracy", "Sens", "Spec", "AUC", "Precision", "Recall",
        "F1", "prAUC", "Kappa", "MCC")]
}
|
|
|
|
|
|
|
#' Train a caret model on variant-density features and score a held-out set.
#'
#' Steps, in order:
#' 1. Split `varallmod` 90/10 into training and test rows (seed 999), then
#'    split the training rows 50/50 (seed 989).
#' 2. Use the first half only to count gof and lof variants per gene and
#'    project them onto the family alignment.
#' 3. Derive four smoothed, scaled density columns (`densgof`, `densgof3aa`,
#'    `denslof`, `denslof3aa`) on `featuretable`.
#' 4. Attach those columns (matched by `protid`) to the second half and to the
#'    test rows, and train `caret::train(Class ~ .)` on the second half with
#'    the file-level `fitControl`, on a 5-worker PSOCK cluster.
#' 5. Write `training.fuNCion.csv` / `testing.fuNCion.csv` to the working
#'    directory and run the external Python script on them; its first output
#'    line is split on "=" and the second piece is read as an AUC.
#'
#' Side effects: changes the RNG version and seed of the session, writes two
#' CSV files, runs an external command, and starts (and now also stops) a
#' parallel cluster.
#'
#' @param varallmod Data frame of variants; needs at least the columns
#'   `Class`, `gene`, `pos`, `refAA`, `altAA` and `protid`.
#' @param modeltype Method name forwarded to `caret::train(method = )`.
#' @param alignmentfile Family alignment (one column per gene, "-" for gaps).
#'   NOTE(review): earlier versions ignored this argument and always read the
#'   global `famcacscn`; callers must now pass the alignment here.
#' @param featuretable Per-position feature table with columns `gene`, `pos`
#'   and `protid`; the row lookups use bare column names inside `[ ]`, so this
#'   presumably has to be a data.table -- confirm.
#' @return Unnamed list of two elements: a data frame with the columns `obs`,
#'   `gof`, `lof`, `pred`, `gene` and `auc` for the test rows, and the fitted
#'   caret model.
predictgof <- function(varallmod, modeltype, alignmentfile, featuretable) {
  suppressWarnings(RNGversion("3.5.3"))
  set.seed(999)

  inTraining <- createDataPartition(as.factor(varallmod$Class), p = .9, list = FALSE)
  trainingall <- varallmod[inTraining, ]
  testing <- varallmod[-inTraining, ]
  # Gene labels of the test rows, taken from the input itself instead of the
  # global `feat` the previous version depended on.
  test_gene <- varallmod[-inTraining, ]$gene

  set.seed(989)
  inTraining1 <- createDataPartition((trainingall$Class), p = .5, list = FALSE)
  training1 <- trainingall[inTraining1, ]
  training2 <- trainingall[-inTraining1, ]

  training1 <- training1[, c("gene", "pos", "refAA", "altAA", "Class")]

  # One column per gene holding the variant counts of `class_label` projected
  # onto the family alignment; columns are named "<gene>_<suffix>".
  align_class <- function(class_label, suffix) {
    genes <- unique(training1[training1$Class %in% class_label, ]$gene)
    aligned <- c()
    for (i in genes) {
      var1 <- training1[training1$gene %in% i & training1$Class %in% class_label, ]
      counts <- gene2familyalignment_quant(
        gene = i,
        variants = var1$pos,
        alignmentfile = alignmentfile
      )
      aligned <- cbind(aligned, counts)
    }
    colnames(aligned) <- paste(genes, suffix, sep = "_")
    aligned
  }

  familyaligned <- cbind(align_class("gof", "GOF"), align_class("lof", "LOF"))

  uniqgenemech <- unique(featuretable$gene)

  # Per-gene density vectors concatenated in gene order. lapply() + unlist()
  # always yields a plain vector; sapply() would return a matrix when every
  # gene happens to have the same number of positions.
  density_for <- function(category, window) {
    unlist(
      lapply(uniqgenemech, function(g) {
        vardens(g, category, featuretable,
                wind = window,
                alignmentfile = alignmentfile,
                varonfamilyalignment = familyaligned)
      }),
      use.names = FALSE
    )
  }

  featuretable$densgof <- density_for("GOF", 10)
  featuretable$densgof3aa <- density_for("GOF", 3)
  featuretable$denslof <- density_for("LOF", 10)
  featuretable$denslof3aa <- density_for("LOF", 3)

  featuretable$densgof <- round(scale(featuretable$densgof), 2)
  featuretable$densgof3aa <- round(scale(featuretable$densgof3aa), 2)
  featuretable$denslof <- round(scale(featuretable$denslof), 2)
  featuretable$denslof3aa <- round(scale(featuretable$denslof3aa), 2)

  training2 <- cbind(training2, as.data.frame(featuretable[match(training2$protid, protid)])[, grep("dens", colnames(featuretable))])

  # Drop the columns used to build the densities (and the id), keep the label.
  training <- training2[, !colnames(training2) %in% c(colnames(training1), "protid")]
  training$Class <- training2$Class

  testing <- cbind(testing, as.data.frame(featuretable[match(testing$protid, protid)])[, grep("dens", colnames(featuretable))])

  cl <- makePSOCKcluster(5)
  # Registered immediately so the workers are released on every exit path.
  # The previous stopCluster() call sat after return() and never ran.
  on.exit({
    stopCluster(cl)
    # Leave no backend registered that points at the stopped cluster.
    foreach::registerDoSEQ()
  }, add = TRUE)
  registerDoParallel(cl)

  set.seed(999)
  starttime <- as.character(Sys.time())
  print(c("start training at", starttime), quote = FALSE)

  gbmFit1_2 <- caret::train(Class ~ ., data = training,
                            method = modeltype,
                            trControl = fitControl,
                            verbose = FALSE)
  finishtime <- as.character(Sys.time())
  print(c("finish training at", finishtime), quote = FALSE)

  write.csv(training, file = 'training.fuNCion.csv')
  write.csv(testing, file = 'testing.fuNCion.csv')

  res <- system('/share/descartes/Users/gz2294/miniconda3/envs/RESCVE/bin/python /share/pascal/Users/gz2294/Data/DMS/Ion_Channel/funNCion/sklearn.gbm.py training.fuNCion.csv testing.fuNCion.csv',
                intern = TRUE)
  # First output line, text after the first "=".
  auc <- as.numeric(strsplit(res, '=')[[1]][2])

  # Class probabilities are computed once and reused for both columns.
  class_prob <- predict(gbmFit1_2, newdata = testing, type = "prob")
  out <- data.frame(obs = testing$Class,
                    gof = class_prob[, "gof"],
                    lof = class_prob[, "lof"],
                    pred = predict(gbmFit1_2, newdata = testing),
                    gene = test_gene,
                    auc = auc
  )

  list(out, gbmFit1_2)
}
|
|
|
|
|
|
|
|
|
#' Build density features for a caller-supplied train/test split and score
#' them with the external Python model.
#'
#' Steps, in order:
#' 1. Split `trainingall` 50/50 (seed 989).
#' 2. Use the first half only to count gof and lof variants per gene and
#'    project them onto the family alignment.
#' 3. Derive four smoothed, scaled density columns (`densgof`, `densgof3aa`,
#'    `denslof`, `denslof3aa`) on `featuretable`.
#' 4. Attach those columns (matched by `protid`) to the second half and to
#'    `testing`, write `training.fuNCion.csv` / `testing.fuNCion.csv` to the
#'    working directory and run the external Python script on them.
#'
#' Side effects: changes the RNG version and seed of the session, writes two
#' CSV files and runs an external command. No model is trained in R here.
#'
#' @param trainingall Data frame of training variants; needs at least the
#'   columns `Class`, `gene`, `pos`, `refAA`, `altAA` and `protid`.
#' @param testing Data frame of test variants; needs a `protid` column.
#' @param modeltype Accepted for signature compatibility; not used.
#' @param alignmentfile Family alignment (one column per gene, "-" for gaps).
#'   NOTE(review): earlier versions ignored this argument and always read the
#'   global `famcacscn`; callers must now pass the alignment here.
#' @param featuretable Per-position feature table with columns `gene`, `pos`
#'   and `protid`; the row lookups use bare column names inside `[ ]`, so this
#'   presumably has to be a data.table -- confirm.
#' @return Numeric value parsed from the first output line of the Python
#'   script (the text after the first "=").
predictgof_manual_split <- function(trainingall, testing, modeltype, alignmentfile, featuretable) {
  suppressWarnings(RNGversion("3.5.3"))
  # The earlier set.seed(999) here was overwritten before any random draw.
  set.seed(989)

  inTraining1 <- createDataPartition((trainingall$Class), p = .5, list = FALSE)
  training1 <- trainingall[inTraining1, ]
  training2 <- trainingall[-inTraining1, ]

  training1 <- training1[, c("gene", "pos", "refAA", "altAA", "Class")]

  # One column per gene holding the variant counts of `class_label` projected
  # onto the family alignment; columns are named "<gene>_<suffix>".
  align_class <- function(class_label, suffix) {
    genes <- unique(training1[training1$Class %in% class_label, ]$gene)
    aligned <- c()
    for (i in genes) {
      var1 <- training1[training1$gene %in% i & training1$Class %in% class_label, ]
      counts <- gene2familyalignment_quant(
        gene = i,
        variants = var1$pos,
        alignmentfile = alignmentfile
      )
      aligned <- cbind(aligned, counts)
    }
    colnames(aligned) <- paste(genes, suffix, sep = "_")
    aligned
  }

  familyaligned <- cbind(align_class("gof", "GOF"), align_class("lof", "LOF"))

  uniqgenemech <- unique(featuretable$gene)

  # Per-gene density vectors concatenated in gene order. lapply() + unlist()
  # always yields a plain vector; sapply() would return a matrix when every
  # gene happens to have the same number of positions.
  density_for <- function(category, window) {
    unlist(
      lapply(uniqgenemech, function(g) {
        vardens(g, category, featuretable,
                wind = window,
                alignmentfile = alignmentfile,
                varonfamilyalignment = familyaligned)
      }),
      use.names = FALSE
    )
  }

  featuretable$densgof <- density_for("GOF", 10)
  featuretable$densgof3aa <- density_for("GOF", 3)
  featuretable$denslof <- density_for("LOF", 10)
  featuretable$denslof3aa <- density_for("LOF", 3)

  featuretable$densgof <- round(scale(featuretable$densgof), 2)
  featuretable$densgof3aa <- round(scale(featuretable$densgof3aa), 2)
  featuretable$denslof <- round(scale(featuretable$denslof), 2)
  featuretable$denslof3aa <- round(scale(featuretable$denslof3aa), 2)

  training2 <- cbind(training2, as.data.frame(featuretable[match(training2$protid, protid)])[, grep("dens", colnames(featuretable))])

  # Drop the columns used to build the densities (and the id), keep the label.
  training <- training2[, !colnames(training2) %in% c(colnames(training1), "protid"), drop = FALSE]
  training$Class <- training2$Class

  testing <- cbind(testing, as.data.frame(featuretable[match(testing$protid, protid)])[, grep("dens", colnames(featuretable))])

  set.seed(999)
  starttime <- as.character(Sys.time())
  print(c("start training at", starttime), quote = FALSE)

  write.csv(training, file = 'training.fuNCion.csv')
  write.csv(testing, file = 'testing.fuNCion.csv')

  res <- system('/share/descartes/Users/gz2294/miniconda3/envs/RESCVE/bin/python /share/pascal/Users/gz2294/Data/DMS/Ion_Channel/funNCion/sklearn.gbm.py training.fuNCion.csv testing.fuNCion.csv',
                intern = TRUE)
  finishtime <- as.character(Sys.time())
  print(c("finish training at", finishtime), quote = FALSE)

  # First output line, text after the first "=".
  auc <- as.numeric(strsplit(res, '=')[[1]][2])

  auc
}
|
|