library(ggplot2) python.path = "/share/descartes/Users/gz2294/miniconda3/envs/RESCVE/bin/python" args <- commandArgs(trailingOnly = T) # base dir for transfer learning base.dirs <- args[1] base.dirs <- strsplit(base.dirs, split = ',')[[1]] ALL.gof.lof <- read.csv('figs/ALL.csv', row.names = 1, na.strings = c('.', 'NA')) source('./prepare.biochem.R') ALL.gof.lof <- prepare.unique.id(ALL.gof.lof) biochem.cols <- c('secondary_struc', 'rsa', 'conservation.entropy', 'conservation.alt', 'conservation.ref', 'pLDDT', 'itan.gof', 'itan.lof', 'data_source') # base dir for transfer learning uniprotID.dic <- c("P21802"="FGFR2", "P15056"="BRAF", "P07949"="RET", "P04637"="TP53", "Q09428"="ABCC8", "O00555"="CACNA1A", "Q14654"="KCNJ11", "Q99250"="SCN2A", "Q14524"="SCN5A", "Q14524.clean"="SCN5A", "P21802.itan.split"="FGFR2", "P15056.itan.split"="BRAF", "P07949.itan.split"="RET", "P04637.itan.split"="TP53", "Q09428.itan.split"="ABCC8", "O00555.itan.split"="CACNA1A", "Q14654.itan.split"="KCNJ11", "Q99250.itan.split"="SCN2A", "Q14524.clean.itan.split"="SCN5A", "IPR016248"="Fibroblast Growth Factor Receptor Family", "IPR005821"="Ion Transport Domain", "IPR013518"="Potassium Channel Inwardly Rectifying Kir Cytoplasmic Domain", "IPR020635"="Tyrosine Protein Kinase Catalytic Domain" ) task.ids <- rep( c(read.csv('../scripts/gene.txt', header = F)$V1, read.csv('../scripts/gene.itan.txt', header = F)$V1, read.csv('../scripts/gene.pfams.txt', header = F)$V1 ), each=5) result <- data.frame(task.id = task.ids) result$fold <- rep(0:4, dim(result)[1]/5) result$model <- "PreMode" result$task.name <- result$task.id result$task.size.lof <- NA result$task.size.gof <- NA result$task.type <- NA for (i in 1:dim(result)[1]) { result$task.type[i] <- "Gene" tmp <- strsplit(result$task.name[i], "\\.")[[1]] tmp[1] <- uniprotID.dic[tmp[1]] if (startsWith(result$task.name[i], "IPR")) { result$task.name[i] <- paste("Domain: ", tmp, collapse = "") result$task.type[i] <- "Domain" } else { if (grepl('.IPR', result$task.name[i])) { tmp <- strsplit(result$task.name[i], "\\.")[[1]] tmp[1] <- uniprotID.dic[tmp[1]] if (tmp[2] %in% names(uniprotID.dic)) { tmp[2] <- uniprotID.dic[tmp[2]] } if (length(tmp) >= 3) { if (tmp[3] == 'self') { tmp[3] <- "(Gene Only)" result$task.type[i] <- "Gene.Gene" } else { tmp[3] <- "(Family)" result$task.type[i] <- "Gene.Domain" } } result$task.name[i] <- paste0("Gene: ", paste(tmp, collapse = ".")) } else { result$task.name[i] <- paste0("Gene: ", tmp[1]) } } task.train <- read.csv(paste0('../data.files/ICC.seed.0/', result$task.id[i], '/training.csv')) task.test <- read.csv(paste0('../data.files/ICC.seed.0/', result$task.id[i], '/testing.csv')) result$task.size.lof[i] <- sum(task.train$score==-1) + sum(task.test$score==-1) result$task.size.gof[i] <- sum(task.train$score==1) + sum(task.test$score==1) } # add baseline AUC # esm alphabets source('./AUROC.R') alphabet <- c('', '', '', '', 'L', 'A', 'G', 'V', 'S', 'E', 'R', 'T', 'I', 'D', 'P', 'K', 'Q', 'N', 'F', 'Y', 'M', 'H', 'W', 'C', 'X', 'B', 'U', 'Z', 'O', '.', '-', '', '') alphabet_2 <- c('L', 'A', 'G', 'V', 'S', 'E', 'R', 'T', 'I', 'D', 'P', 'K', 'Q', 'N', 'F', 'Y', 'M', 'H', 'W', 'C') source('./bind_rows.R') result.gof <- data.frame() for (i in 1:dim(result)[1]) { # AUC on gof variants baseline.PreModes <- c() baseline.PreModes.glm <- c() baseline.PreModes.bst <- c() baseline.PreModes.lw <- c() baseline.PreModes.lw.glm <- c() baseline.PreModes.lw.bst <- c() PreMode.tr.aucs <- c() PreMode.tr.lw.aucs <- c() PreMode.min.loss <- c() PreMode.min.loss.lw <- c() for (mod in base.dirs) { if (file.exists(paste0(mod, result$task.id[i], '/testing.fold.', result$fold[i], '.4fold.csv'))) { baseline.result <- read.csv(paste0(mod, result$task.id[i], '/testing.fold.', result$fold[i], '.4fold.csv')) baseline.auc <- plot.AUC(baseline.result$score, rowMeans(baseline.result[,paste0('logits.FOLD.', 0:3)])) if (file.exists(paste0(mod, result$task.id[i], '/training.fold.', result$fold[i], '.4fold.csv'))) { set.seed(0) baseline.train <- read.csv(paste0(mod, result$task.id[i], '/training.fold.', result$fold[i], '.4fold.csv')) baseline.train$score[baseline.train$score==-1] <- 0 myglm <- glm(score ~ logits.FOLD.0 + logits.FOLD.1 + logits.FOLD.2 + logits.FOLD.3, data=baseline.train, family = "binomial") baseline.result$logits.glm <- predict(myglm, newdata = baseline.result, type = "response") baseline.auc.glm <- plot.AUC(baseline.result$score, baseline.result$logits.glm) tr.aucs <- c(plot.AUC(baseline.train$score, baseline.train$logits.FOLD.0)$auc, plot.AUC(baseline.train$score, baseline.train$logits.FOLD.1)$auc, plot.AUC(baseline.train$score, baseline.train$logits.FOLD.2)$auc, plot.AUC(baseline.train$score, baseline.train$logits.FOLD.3)$auc) best.auc <- which.max(tr.aucs) - 1 baseline.auc.bst <- plot.AUC(baseline.result$score, baseline.result[,paste0('logits.FOLD.', best.auc)]) mean.tr.aucs <- plot.AUC(baseline.train$score, rowMeans(baseline.train[,paste0('logits.FOLD.', 0:3)]))$auc min.val.loss <- rowMeans(baseline.train[,paste0('min_loss.FOLD.', 0:3)])[1] } # compare with large.window models, if exist if (file.exists(paste0(mod, result$task.id[i], '.large.window/testing.fold.', result$fold[i], '.4fold.csv'))) { baseline.result <- read.csv(paste0(mod, result$task.id[i], '.large.window/testing.fold.', result$fold[i], '.4fold.csv')) baseline.auc.lw <- plot.AUC(baseline.result$score, rowMeans(baseline.result[,paste0('logits.FOLD.', 0:3)])) if (file.exists(paste0(mod, result$task.id[i], '.large.window/training.fold.', result$fold[i], '.4fold.csv'))) { set.seed(0) baseline.train <- read.csv(paste0(mod, result$task.id[i], '.large.window/training.fold.', result$fold[i], '.4fold.csv')) baseline.train$score[baseline.train$score==-1] <- 0 myglm <- glm(score ~ logits.FOLD.0 + logits.FOLD.1 + logits.FOLD.2 + logits.FOLD.3, data=baseline.train, family = "binomial") baseline.result$logits.glm <- predict(myglm, newdata = baseline.result, type = "response") baseline.auc.lw.glm <- plot.AUC(baseline.result$score, baseline.result$logits.glm) tr.aucs <- c(plot.AUC(baseline.train$score, baseline.train$logits.FOLD.0)$auc, plot.AUC(baseline.train$score, baseline.train$logits.FOLD.1)$auc, plot.AUC(baseline.train$score, baseline.train$logits.FOLD.2)$auc, plot.AUC(baseline.train$score, baseline.train$logits.FOLD.3)$auc) best.auc <- which.max(tr.aucs) - 1 baseline.auc.lw.bst <- plot.AUC(baseline.result$score, baseline.result[,paste0('logits.FOLD.', best.auc)]) mean.tr.aucs.lw <- plot.AUC(baseline.train$score, rowMeans(baseline.train[,paste0('logits.FOLD.', 0:3)]))$auc min.val.loss.lw <- rowMeans(baseline.train[,paste0('min_loss.FOLD.', 0:3)])[1] } } else { baseline.auc.lw <- list(auc=NA) baseline.auc.lw.glm <- list(auc=NA) baseline.auc.lw.bst <- list(auc=NA) mean.tr.aucs.lw <- NA } } else { use.large <- NA baseline.auc <- list(auc=NA) baseline.auc.glm <- list(auc=NA) baseline.auc.bst <- list(auc=NA) baseline.auc.lw <- list(auc=NA) baseline.auc.lw.glm <- list(auc=NA) baseline.auc.lw.bst <- list(auc=NA) mean.tr.aucs <- NA mean.tr.aucs.lw <- NA min.val.loss <- NA min.val.loss.lw <- NA } baseline.PreModes <- c(baseline.PreModes, baseline.auc$auc) baseline.PreModes.glm <- c(baseline.PreModes.glm, baseline.auc.glm$auc) baseline.PreModes.bst <- c(baseline.PreModes.bst, baseline.auc.bst$auc) baseline.PreModes.lw <- c(baseline.PreModes.lw, baseline.auc.lw$auc) baseline.PreModes.lw.glm <- c(baseline.PreModes.lw.glm, baseline.auc.lw.glm$auc) baseline.PreModes.lw.bst <- c(baseline.PreModes.lw.bst, baseline.auc.lw.bst$auc) PreMode.tr.aucs <- c(PreMode.tr.aucs, mean.tr.aucs) PreMode.tr.lw.aucs <- c(PreMode.tr.lw.aucs, mean.tr.aucs.lw) PreMode.min.loss <- c(PreMode.min.loss, min.val.loss) PreMode.min.loss.lw <- c(PreMode.min.loss.lw, min.val.loss.lw) } # add itan baseline.result <- read.csv(paste0('PreMode/', result$task.id[i], '/testing.fold.', result$fold[i], '.4fold.csv')) test.result <- prepare.unique.id(baseline.result) test.result[,biochem.cols] <- ALL.gof.lof[match(test.result$unique.id, ALL.gof.lof$unique.id), biochem.cols] baseline.auc.6 <- plot.AUC(test.result$score[!grepl("Itan", test.result$data_source)], 1-test.result$itan.gof[!grepl("Itan", test.result$data_source)]) # add random forest, elastic net if (file.exists(paste0('PreMode/', result$task.id[i], '/training.fold.', result$fold[i], '.4fold.csv')) & file.exists(paste0('PreMode/', result$task.id[i], '/testing.fold.', result$fold[i], '.4fold.csv'))) { gene.train <- read.csv(paste0('PreMode/', result$task.id[i], '/training.fold.', result$fold[i], '.4fold.csv')) gene.test <- read.csv(paste0('PreMode/', result$task.id[i], '/testing.fold.', result$fold[i], '.4fold.csv')) gene.train <- prepare.unique.id(gene.train) gene.test <- prepare.unique.id(gene.test) # write train and test emb to files train.label.file <- tempfile() test.label.file <- tempfile() train.biochem.file <- tempfile() test.biochem.file <- tempfile() write.csv(gene.train, file = train.label.file) write.csv(gene.test, file = test.label.file) gene.train.biochem <- prepare.biochemical(ALL.gof.lof[match(gene.train$unique.id, ALL.gof.lof$unique.id),]) gene.test.biochem <- prepare.biochemical(ALL.gof.lof[match(gene.test$unique.id, ALL.gof.lof$unique.id),]) write.csv(gene.train.biochem, file = train.biochem.file) write.csv(gene.test.biochem, file = test.biochem.file) res <- system(paste0(python.path, ' ', 'random.forest.glof.py ', train.biochem.file, ' ', train.label.file, ' ', test.biochem.file, ' ', test.label.file), intern = T) baseline.auc.5 <- list(auc=as.numeric(strsplit(res, split = '=')[[1]][2])) } else { baseline.auc.5 <- list(auc=NA) } # aggregate results to.append <- result[rep(i, length(base.dirs)*6+2), ] to.append$auc <- c(baseline.PreModes, baseline.PreModes.lw, baseline.PreModes.glm, baseline.PreModes.lw.glm, baseline.PreModes.bst, baseline.PreModes.lw.bst, baseline.auc.5$auc, baseline.auc.6$auc) to.append$model <- c(base.dirs, paste0(base.dirs, '.lw'), paste0(base.dirs, '.glm'), paste0(base.dirs, '.lw.glm'), paste0(base.dirs, '.bst'), paste0(base.dirs, '.lw.bst'), "random.forest", "Itan.1") to.append$noitan.gof <- sum(test.result$score==1 & baseline.result$data_source != "Itan") to.append$noitan.lof <- sum(test.result$score==-1 & baseline.result$data_source != "Itan") to.append$tr.auc <- c(PreMode.tr.aucs, PreMode.tr.lw.aucs, PreMode.tr.aucs, PreMode.tr.lw.aucs, PreMode.tr.aucs, PreMode.tr.lw.aucs, rep(NA, 2)) to.append$val.loss <- c(PreMode.min.loss, PreMode.min.loss.lw, PreMode.min.loss, PreMode.min.loss.lw, PreMode.min.loss, PreMode.min.loss.lw, rep(NA, 2)) result.gof <- rbind(result.gof, to.append) } saveRDS(result.gof, 'figs/fig.5.prepare.RDS')