File size: 6,680 Bytes
7718235
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
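# Compare PreMode with funNCion AUCs on ion-channel genes and plot figure 5d.
# NOTE(review): this header originally read "visualize with dssp secondary structure",
# but nothing in this script uses DSSP -- presumably left over from another script.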
library(ggplot2)
library(bio3d)
library(patchwork)
# ---- Constants ----
# `genes` are the identifiers used in PreMode file and directory names;
# `gene.names` are the matching symbols (same order) used to build protid keys.
genes <- c("Q99250", "Q14524.clean", "O00555")
gene.names <- c("SCN2A", "SCN5A", "CACNA1A")
# The 20 one-letter amino acid codes (not referenced again in this script).
aa.dict <- c(
  "L", "A", "G", "V", "S", "E", "R", "T", "I", "D",
  "P", "K", "Q", "N", "F", "Y", "M", "H", "W", "C"
)
log.dir <- "PreMode/"
folds <- 0:4
# Expected to provide plot.AUC(), which is used further down -- TODO confirm.
source("./AUROC.R")

# ---- funNCion (Heyne) feature table ----
# NOTE(review): fread() and `:=` come from data.table, which is not attached by
# any library() call in this file; presumably a sourced file attaches it -- verify.
famcacscn <- fread("./funNCion/scncacaa_familyalignedCACNA1Acantranscript.txt")
famcacscn <- as.data.frame(famcacscn)
featuretable <- fread("./funNCion/featuretable4github_revision.txt")
# Drop genomic bookkeeping columns.
featuretable[, (c("chr", "genomic_pos", "USED_REF", "STRAND", "Feature", "inpp2")) := NULL]
# Drop every variant-density feature (any column whose name contains "dens").
featuretable[, (grep("dens", colnames(featuretable))) := NULL]
# Drop the most correlated variables (as previously identified with caret
# preprocessing functions).
featuretable[, (c("H", "caccon", "SF_DEKA")) := NULL]
featuretable <- unique(featuretable)

# ---- funNCion (Heyne) training variants ----
varall <- fread("./funNCion/SupplementaryTable_S1_pathvariantsusedintraining_revision2.txt")
# Keep variants used for functional prediction that are labelled lof or gof.
varall <- varall[used_in_functional_prediction %in% 1 &
                   prd_mech_revised %in% c("lof", "gof")]
# Keep only the first record per (gene, altAA, pos) site.
varall <- varall[!duplicated(varall, by = c("gene", "altAA", "pos"))]
# Expected to provide predictgof_manual_split(), used further down -- TODO confirm.
source("./funNCion/R_functions4predicting_goflof_CACNA1SCN.R")
# Per-gene benchmark: for every gene and every cross-validation fold, compute
# the PreMode test AUC and the funNCion (gbm) test AUC and store both in
# result.plot (columns: AUC, model, fold, HGNC).
#
# Changes from the previous version: iterate with seq_along()/`folds` instead
# of 1:length()/a hard-coded 0:4, and collect the per-fold rows in a list that
# is bound once instead of growing result.plot with rbind() on each iteration.
result.plot <- data.frame()
gene.rows <- list()
# Columns holding the per-model logits that are averaged into one score.
logit.cols <- paste0('logits.FOLD.', 0:3)
for (o in seq_along(genes)) {
  for (fold in folds) {
    gene <- genes[o]
    print(gene)
    premode.yaml <- yaml::read_yaml(paste0('../scripts/PreMode/', 
                                           gene, '.5fold/', gene, '.fold.', fold, '.yaml'))
    gene.training <- read.csv(premode.yaml$data_file_train, row.names = 1)
    # Model selection: compare the standard and large-window runs by their
    # training AUC and use the test predictions of whichever run is better.
    gene.training.result <- read.csv(paste0(log.dir, gene, '/training.fold.', fold, '.4fold.csv'))
    gene.training.lw.result <- read.csv(paste0(log.dir, gene, '.large.window/training.fold.', fold, '.4fold.csv'))
    tr.auc <- plot.AUC(gene.training.result$score,
                       rowMeans(gene.training.result[, logit.cols]))$auc
    tr.lw.auc <- plot.AUC(gene.training.lw.result$score,
                          rowMeans(gene.training.lw.result[, logit.cols]))$auc
    if (tr.lw.auc > tr.auc) {
      gene.testing.result <- read.csv(paste0(log.dir, gene, '.large.window/testing.fold.', fold, '.4fold.csv'))
    } else {
      gene.testing.result <- read.csv(paste0(log.dir, gene, '/testing.fold.', fold, '.4fold.csv'))
    }
    # Build "<symbol>:<pos>:<ref>:<alt>" keys so PreMode variants can be matched
    # against the funNCion tables.
    gene.training$protid <- paste(gene.names[o], gene.training$pos.orig, gene.training$ref, gene.training$alt, sep = ":")
    gene.testing.result$protid <- paste(gene.names[o], gene.testing.result$pos.orig, gene.testing.result$ref, gene.testing.result$alt, sep = ":")
    # funNCion training set: funNCion variants that are also in this fold's
    # PreMode training split.
    varall.protid <- varall$protid[varall$protid %in% gene.training$protid]
    feat.train <- featuretable[match(varall.protid, protid)]
    feat.train$Class <- varall$prd_mech_revised[match(varall.protid, varall$protid)]
    feat.train <- feat.train[complete.cases(feat.train),]
    # funNCion test set: features for the PreMode test variants; score == -1
    # is labelled 'lof', everything else 'gof'.
    feat.test <- featuretable[match(gene.testing.result$protid, protid)]
    feat.test$Class <- 'gof'
    feat.test$Class[gene.testing.result$score==-1] <- 'lof'
    # NOTE(review): rows without a complete feature vector are dropped here, so
    # the funNCion AUC may be computed on fewer variants than the PreMode AUC.
    feat.test <- feat.test[complete.cases(feat.test),]
    
    heyne.auc <- predictgof_manual_split(trainingall = feat.train, testing=feat.test, modeltype = "gbm", featuretable = featuretable, alignmentfile = famcacscn)
    # Report the larger of auc and 1 - auc for funNCion.
    # NOTE(review): the PreMode AUC below is not symmetrized the same way.
    heyne.auc <- max(heyne.auc, 1-heyne.auc)
    premode.auc <- plot.AUC(gene.testing.result$score,
                            rowMeans(gene.testing.result[, logit.cols]))
    gene.rows[[length(gene.rows) + 1L]] <- data.frame(
      AUC=c(premode.auc$auc, heyne.auc), 
      model=c('PreMode', 'FunCion (sklearn)'),
      fold=fold,
      HGNC=paste0(gene.names[o], '\n(5 random splits)'))
  }
}
# Bind all collected rows in one step (do.call() yields NULL when nothing was
# collected, which rbind() ignores).
result.plot <- rbind(result.plot, do.call(rbind, gene.rows))

# Add results for the pooled ion-channel data set: one PreMode AUC per seed
# plus the two funNCion reference AUCs, appended to result.plot.
#
# The funNCion predictions do not depend on the seed, so they are read and
# scored once, outside the loop.
heyne.result <- read.csv('./funNCion/fuNCion.predictions.csv', row.names = 1)
heyne.auc <- plot.AUC(as.numeric(as.factor(heyne.result$obs))-1, heyne.result$gof)
# Columns holding the per-model logits that are averaged into one score.
logit.cols <- paste0('logits.FOLD.', 0:3)
all.rows <- list()
for (fold in 0:4) {
  # Model selection: compare the standard and large-window runs by their
  # training AUC and use the test predictions of whichever run is better.
  gene.training.result <- read.csv(paste0(log.dir, 'Heyne/training.seed.', fold, '.csv'))
  gene.training.lw.result <- read.csv(paste0(log.dir, 'Heyne/training.large.window.seed.', fold, '.csv'))
  tr.auc <- plot.AUC(gene.training.result$score,
                     rowMeans(gene.training.result[, logit.cols]))$auc
  tr.lw.auc <- plot.AUC(gene.training.lw.result$score,
                        rowMeans(gene.training.lw.result[, logit.cols]))$auc
  # Fix: the two branches used to be swapped, so the run with the LOWER
  # training AUC supplied the test predictions, unlike the per-gene loop.
  if (tr.lw.auc > tr.auc) {
    gene.testing.result <- read.csv(paste0(log.dir, 'Heyne/testing.large.window.seed.', fold, '.csv'))
  } else {
    gene.testing.result <- read.csv(paste0(log.dir, 'Heyne/testing.seed.', fold, '.csv'))
  }
  premode.auc <- plot.AUC(gene.testing.result$score,
                          rowMeans(gene.testing.result[, logit.cols]))
  # The funNCion values are identical for every seed; they are repeated so all
  # models contribute the same number of rows to the summary plot.
  all.rows[[length(all.rows) + 1L]] <- data.frame(
    AUC=c(premode.auc$auc, heyne.auc$auc, heyne.result$auc[1]),
    model=c('PreMode', 'FunCion (R)', 'FunCion (sklearn)'),
    fold=fold,
    HGNC='ALL Ion Channels\n(FunCion paper split)')
}
# Bind once instead of growing result.plot inside the loop.
result.plot <- rbind(result.plot, do.call(rbind, all.rows))

# Plot mean +/- SE of the AUC per task (HGNC) and model, and save it as fig. 5d.
#
# Changes from the previous version: the dodge mapping is defined once and
# shared by both summary layers; the dead label settings are removed (an x
# label that was overridden later, and a `fill` title for an aesthetic that is
# never mapped); the output directory is created if missing.
num.models <- length(unique(result.plot$model))
# Manual dodge: shift each model's summary along the HGNC axis so the models
# sit side by side, centred on the category position.
dodged.aes <- aes(
  x = as.numeric(factor(HGNC)) +
    0.4 * (as.numeric(factor(model))) / num.models -
    0.2 * (num.models + 1) / num.models,
  y = AUC, col = model
)
p <- ggplot(result.plot, aes(y = AUC, x = HGNC, col = model)) +
  # Fully transparent points: this layer maps the discrete HGNC values to x;
  # the layers below supply numeric x positions instead.
  geom_point(alpha = 0) +
  scale_color_manual(values = c("#A3A500", "#00BA38", "#F8766D")) +
  stat_summary(mapping = dodged.aes,
               fun.data = mean_se, geom = "errorbar", width = 0.2) +
  stat_summary(mapping = dodged.aes,
               fun.data = mean_se, geom = "point") +
  labs(title = 'PreMode compared to FuNCion\nin Ion Channel genes',
       x = 'task: Genetics Level Mode of Action',
       y = "AUC") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1),
        text = element_text(size = 16),
        plot.title = element_text(size = 15),
        legend.position = "bottom",
        legend.direction = "horizontal") +
  ggeasy::easy_center_title() +
  coord_flip() + guides(col = guide_legend(ncol = 2)) +
  # NOTE(review): ylim() sets scale limits, so any AUC below 0.5 would be
  # removed before the mean/SE is computed -- confirm that is intended.
  ylim(0.5, 1)
out.file <- 'figs/fig.5d.pdf'
# Create the output directory if needed so ggsave() has somewhere to write.
dir.create(dirname(out.file), showWarnings = FALSE, recursive = TRUE)
ggsave(out.file, p, height = 5, width = 6)