# File size: 12,511 Bytes
# 7718235
# upsampling function to have same numbers of scn,cac,lof,gof
#
# Custom sampling function (signature function(x, y), as used in
# `samplingfct$func`). It finds the most frequent Class x scn combination and
# upsamples, with replacement, the three complementary strata to that
# frequency so that every non-empty stratum ends up equally represented.
#
# x: predictors; a data.frame or something as.data.frame() accepts. Must
#    contain a column named `scn`.
# y: outcome vector with one entry per row of `x`.
#
# Returns list(x = <predictors without the helper column Class>, y = <outcome>).
func <- function(x, y) {
  xup <- if (is.data.frame(x)) x else as.data.frame(x)
  xup$Class <- y

  # which of the Class x scn combos is highest? -> upsample all data to that!
  frqtab <- data.frame(table(xup[, c("Class", "scn")])) # frequency table
  # order() is stable, so ties resolve to the last combination in table order
  frqmax <- frqtab[tail(order(frqtab$Freq), 1), ]

  same_scn <- xup$scn %in% frqmax$scn
  same_class <- xup$Class %in% frqmax$Class

  # Draw frqmax$Freq rows (with replacement) from the rows flagged by `keep`.
  # An empty stratum has nothing to draw from; it contributes no rows instead
  # of making sample() fail.
  upsample <- function(keep) {
    pool <- which(keep)
    if (length(pool) == 0L) {
      return(xup[0, ])
    }
    xup[pool[sample.int(length(pool), size = frqmax$Freq, replace = TRUE)], ]
  }

  # The majority stratum is kept as is; the others are resampled in a fixed
  # order so results stay reproducible for a given seed.
  xup <- rbind(
    xup[same_scn & same_class, ],
    upsample(same_scn & !same_class),
    upsample(!same_scn & !same_class),
    upsample(!same_scn & same_class)
  )

  # Remove only the helper column added above (exact name match, so predictors
  # whose names merely contain "Class" are preserved).
  list(
    x = xup[, colnames(xup) != "Class", drop = FALSE],
    y = xup$Class
  )
}
# Sampling specification passed as `sampling =` to caret::trainControl():
# a display name, the sampling function itself, and the `first` flag.
samplingfct <- list()
samplingfct$name <- "upsampling to balance Class and scn!"
samplingfct$func <- func
samplingfct$first <- TRUE
# Map the variant counts of one gene onto the family alignment.
#
# gene:          name (or index) of the column of `alignmentfile` that holds
#                this gene's aligned sequence, with "-" marking gaps.
# variants:      positions of the variants, used as indices into the gene's
#                non-gap alignment entries; a position listed k times is
#                counted k times.
# alignmentfile: alignment table with one column per gene.
#
# Returns a numeric vector with one entry per alignment row: the number of
# variants falling on that row, 0 where the gene has a gap or no variant.
gene2familyalignment_quant <- function(gene, variants, alignmentfile) {
  # Tabulate directly instead of going through a data.frame whose column had
  # to be reached via `$` partial matching ("variant" -> "variants").
  counts <- table(variants)
  positions <- as.integer(names(counts))

  # as.data.frame() makes column extraction behave the same for matrices,
  # data.frames and data.frame subclasses (same approach as in vardens()).
  gene_seq <- as.data.frame(alignmentfile)[, gene]

  aligned_counts <- rep(0, nrow(alignmentfile))
  # index first the non-gap rows of this gene, then the variant positions
  aligned_counts[which(gene_seq != "-")][positions] <- as.vector(counts)
  aligned_counts
}
# Circular moving average of `x` with equal weights 1/windowsize over a
# window of `windowsize` values (stats::filter with circular = TRUE, so the
# series wraps around at both ends instead of producing NA).
# Returns the time-series object produced by stats::filter().
ma <- function(x, windowsize) {
  stats::filter(x, rep(1 / windowsize, windowsize), circular = TRUE)
}
# Sliding-window variant density of one functional category along one gene.
#
# gene1:                gene whose positions are returned; also the column of
#                       `alignmentfile` holding its aligned sequence.
# funcycat:             pattern selecting the columns of `varonfamilyalignment`
#                       to add up (matched with grepl on the column names).
# featuretable:         table with columns `gene` and `pos`; the bracket
#                       filter below relies on data.table-style evaluation.
# wind:                 window size passed to ma().
# alignmentfile:        family alignment, "-" marks gaps.
# varonfamilyalignment: variant counts per alignment row, one column per
#                       gene/category.
#
# Returns the smoothed counts at the positions listed for `gene1` in
# `featuretable` (repeated positions yield repeated values).
vardens <- function(gene1, funcycat, featuretable, wind, alignmentfile, varonfamilyalignment)
{
  # total counts per alignment row over all columns of the requested category
  category_cols <- grepl(funcycat, colnames(varonfamilyalignment))
  counts_per_row <- rowSums(as.matrix(varonfamilyalignment[, category_cols]))

  # keep only the alignment rows where this gene has a residue
  is_gap <- as.data.frame(alignmentfile)[, gene1] %in% "-"
  counts_on_gene <- counts_per_row[!is_gap]

  # smooth along the gene, using the variants of ALL genes in the family
  smoothed <- ma(x = counts_on_gene, windowsize = wind)

  # one value per feature-table row of this gene
  gene_positions <- featuretable[gene %in% gene1]$pos
  smoothed[gene_positions]
}
# define parameters during training (caret fct)
# Repeated k-fold cross-validation: 10 folds, repeated 10 times. The custom
# sampling specification `samplingfct` is used for subsampling, and class
# probabilities are requested so probability-based metrics can be computed.
fitControl <- caret::trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 10,
  sampling = samplingfct,
  classProbs = TRUE
)
# output performance
#
# out: data.frame of predictions with at least the columns `obs` and `pred`
#      plus whatever multiClassSummary()/twoClassSummary() need for the
#      levels "gof" and "lof".
#
# Returns a named numeric vector with Balanced_Accuracy, Sens, Spec, AUC,
# Precision, Recall, F1, prAUC, Kappa and MCC. Sens and Spec come from
# twoClassSummary() and are rounded to two digits.
modelperformance <- function(out) {
  class_levels <- c("gof", "lof")

  multi_stats <- multiClassSummary(out, lev = class_levels)
  # matthews correlation coefficient, with "gof" as the positive class:
  mcc_value <- mcc(preds = ifelse(out$pred %in% "gof", 1, 0),
                   actuals = ifelse(out$obs %in% "gof", 1, 0))
  two_class_stats <- round(twoClassSummary(out, lev = class_levels), digits = 2)

  # Name the MCC entry where it is created rather than by a fixed position,
  # so the label stays correct however many statistics multiClassSummary()
  # returns.
  res <- c(multi_stats, MCC = unname(mcc_value), two_class_stats)

  res[c("Balanced_Accuracy", "Sens", "Spec", "AUC", "Precision", "Recall",
        "F1", "prAUC", "Kappa", "MCC")]
}
# training fct
#
# predictgof() and predictgof_manual_split() share the same feature
# engineering. The shared steps live in the private helpers below (leading
# dot = internal); the two entry points keep their original signatures.

# Print a progress label together with the current time.
.log_time <- function(label) {
  print(c(label, as.character(Sys.time())), quote = FALSE)
}

# Count the variants of one class on the family alignment.
# Returns a matrix with one row per alignment row and one column per gene
# that carries variants of `class_label`, named "<gene>_<suffix>". A class
# without any gene yields a zero-column matrix.
.class_counts_on_alignment <- function(training1, class_label, suffix, alignmentfile) {
  in_class <- training1$Class %in% class_label
  genes <- as.character(unique(training1[in_class, ]$gene))

  # preallocate instead of growing the matrix with cbind() in a loop
  counts <- matrix(0, nrow = nrow(alignmentfile), ncol = length(genes),
                   dimnames = list(NULL, sprintf("%s_%s", genes, suffix)))
  for (j in seq_along(genes)) {
    positions <- training1[training1$gene %in% genes[j] & in_class, ]$pos
    counts[, j] <- gene2familyalignment_quant(gene = genes[j],
                                              variants = positions,
                                              alignmentfile = alignmentfile)
  }
  counts
}

# Add the four z-scored, rounded sliding-window density columns (densgof,
# densgof3aa, denslof, denslof3aa) to `featuretable` and return it.
# NOTE(review): as before, the per-gene results are concatenated in the order
# of unique(featuretable$gene); this assumes the rows of `featuretable` are
# grouped by gene in that same order - confirm against the caller.
.add_density_features <- function(featuretable, alignmentfile, familyaligned) {
  uniqgenemech <- as.character(unique(featuretable$gene))

  density_for <- function(category, wind) {
    # lapply + unlist always yields a flat vector; sapply could simplify to a
    # matrix when all genes happen to have the same number of rows
    per_gene <- lapply(uniqgenemech, function(g) {
      vardens(g, category, featuretable, wind = wind, alignmentfile, familyaligned)
    })
    # zscore and round; as.numeric() drops the matrix shape scale() returns
    as.numeric(round(scale(unlist(per_gene, use.names = FALSE)), 2))
  }

  # diff sliding windows: 10 AA and 3 AA
  featuretable$densgof <- density_for("GOF", 10)
  featuretable$densgof3aa <- density_for("GOF", 3)
  featuretable$denslof <- density_for("LOF", 10)
  featuretable$denslof3aa <- density_for("LOF", 3)
  featuretable
}

# Look up the density columns (all columns whose name contains "dens") of
# `featuretable` for the given protein ids, in the order of `protids`.
.density_for_protids <- function(featuretable, protids) {
  dens_cols <- grep("dens", colnames(featuretable))
  rows <- match(protids, featuretable$protid)
  as.data.frame(featuretable[rows, ])[, dens_cols, drop = FALSE]
}

# Shared feature engineering:
#  1. split `trainingall` 50/50 (seed 989) into training1 / training2,
#  2. compute variant densities from training1 only,
#  3. attach them to training2 and to `testing`.
# Returns list(training = <model frame with Class as last column>,
#              testing  = <testing plus density columns>).
.prepare_density_data <- function(trainingall, testing, alignmentfile, featuretable) {
  set.seed(989) # separate two training sets, one used for calculating variant densities
  inTraining1 <- createDataPartition((trainingall$Class), p = .5, list = FALSE)
  training1 <- trainingall[inTraining1, ]
  training2 <- trainingall[-inTraining1, ]

  # calculate variant density from training1 and map on training2 ####
  training1 <- training1[, c("gene", "pos", "refAA", "altAA", "Class")]

  # variants on family alignment
  familyaligned <- cbind(
    .class_counts_on_alignment(training1, "gof", "GOF", alignmentfile),
    .class_counts_on_alignment(training1, "lof", "LOF", alignmentfile)
  )

  # variants on family alignment -> var density -> on individual genes
  featuretable <- .add_density_features(featuretable, alignmentfile, familyaligned)

  # map variant density of training1 onto training2 and testing data
  training2 <- cbind(training2, .density_for_protids(featuretable, training2$protid))

  # remove altAA etc; dropping column by column is the form that also worked
  # where the logical column selection did not
  training <- training2
  for (co in c(colnames(training1), "protid")) {
    training[, co] <- NULL
  }
  training$Class <- training2$Class

  # add vardens onto testing
  testing <- cbind(testing, .density_for_protids(featuretable, testing$protid))

  list(training = training, testing = testing)
}

# Write both data sets to CSV files in the working directory, run the external
# sklearn GBM script on them and return the number parsed from its output:
# the first output line is split on "=" and the second piece is converted
# with as.numeric().
# NOTE(review): interpreter and script paths are hard-coded absolute paths.
.run_sklearn_gbm <- function(training, testing) {
  write.csv(training, file = 'training.fuNCion.csv')
  write.csv(testing, file = 'testing.fuNCion.csv')
  res <- system('/share/descartes/Users/gz2294/miniconda3/envs/RESCVE/bin/python /share/pascal/Users/gz2294/Data/DMS/Ion_Channel/funNCion/sklearn.gbm.py training.fuNCion.csv testing.fuNCion.csv',
                intern = TRUE)
  if (length(res) == 0L) {
    stop("sklearn.gbm.py produced no output", call. = FALSE)
  }
  as.numeric(strsplit(res, '=')[[1]][2])
}

# Train and evaluate on a random 90/10 split of `varallmod`.
#
# varallmod:     variant table; needs the columns Class, gene, pos, refAA,
#                altAA and protid.
# modeltype:     caret method name passed to caret::train().
# alignmentfile: family alignment used to compute the variant densities.
# featuretable:  per-position feature table with columns gene, pos, protid.
#
# Side effects: switches the RNG to its R 3.5.3 behaviour and sets seeds,
# starts (and on exit stops) a 5-worker PSOCK cluster, writes two CSV files
# and runs the external sklearn script.
#
# Returns list(out, model): `out` is a data.frame with obs, gof, lof, pred,
# gene and auc for the test split; `model` is the fitted caret model.
predictgof <- function(varallmod, modeltype, alignmentfile, featuretable)
{
  # reproducible random splits
  suppressWarnings(RNGversion("3.5.3"))
  set.seed(999)
  # randomly split in training/testing
  inTraining <- createDataPartition(as.factor(varallmod$Class), p = .9, list = FALSE)
  trainingall <- varallmod[inTraining, ] # two training sets
  testing <- varallmod[-inTraining, ] # 1 comb and 1 test set

  prepared <- .prepare_density_data(trainingall, testing, alignmentfile, featuretable)
  training <- prepared$training
  testing <- prepared$testing

  # train ####
  cl <- makePSOCKcluster(5)
  # registered right away so the workers are released on every exit path
  on.exit(stopCluster(cl), add = TRUE)
  registerDoParallel(cl)
  set.seed(999)
  .log_time("start training at")
  gbmFit1_2 <- caret::train(Class ~ ., data = training,
                            method = modeltype,
                            trControl = fitControl,
                            verbose = FALSE)
  .log_time("finish training at")

  # compare with gbm method implemented with sklearn
  auc <- .run_sklearn_gbm(training, testing)

  probs <- predict(gbmFit1_2, newdata = testing, type = "prob")
  out <- data.frame(obs = testing$Class,
                    gof = probs[, "gof"],
                    lof = probs[, "lof"],
                    pred = predict(gbmFit1_2, newdata = testing),
                    gene = testing$gene, # genes of the test split itself
                    auc = auc)
  list(out, gbmFit1_2)
}

# training fct, modified only the training, testing data split
#
# Same feature engineering as predictgof(), but with a caller-supplied split;
# the model itself is fitted by the external sklearn script only.
#
# trainingall:   training variants (columns as `varallmod` in predictgof()).
# testing:       test variants; needs at least the column protid.
# modeltype:     unused here, kept for signature compatibility.
# alignmentfile: family alignment used to compute the variant densities.
# featuretable:  per-position feature table with columns gene, pos, protid.
#
# Side effects: switches the RNG to its R 3.5.3 behaviour and sets seeds,
# writes two CSV files and runs the external sklearn script.
#
# Returns the numeric value parsed from the script output.
predictgof_manual_split <- function(trainingall, testing, modeltype, alignmentfile, featuretable)
{
  # reproducible random splits
  suppressWarnings(RNGversion("3.5.3"))

  prepared <- .prepare_density_data(trainingall, testing, alignmentfile, featuretable)

  # train ####
  set.seed(999)
  .log_time("start training at")
  auc <- .run_sklearn_gbm(prepared$training, prepared$testing)
  .log_time("finish training at")
  auc
}
# (stray "|" left over from the page extraction)