File size: 12,877 Bytes
7718235
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
library(ggplot2)
# Python interpreter used further down to run the random-forest baseline
# script (hard-coded to one specific conda environment).
python.path <- "/share/descartes/Users/gz2294/miniconda3/envs/RESCVE/bin/python"

# Base dirs for transfer learning: ONE command-line argument holding a
# comma-separated list. Each entry is pasted directly in front of a task id
# when result files are located, so it normally needs a trailing "/".
args <- commandArgs(trailingOnly = TRUE)
if (length(args) < 1 || is.na(args[1]) || !nzchar(args[1])) {
  # Without this check base.dirs silently becomes NA, every model path turns
  # into "NA<task id>/..." and the script produces an all-NA table.
  stop("missing argument: comma-separated list of base directories, ",
       "e.g. 'PreMode/,PreMode.other/'", call. = FALSE)
}
base.dirs <- strsplit(args[1], split = ',')[[1]]

# Variant annotation table; '.' and 'NA' are both read as missing values and
# the first CSV column becomes the row names.
ALL.gof.lof <- read.csv('figs/ALL.csv', row.names = 1, na.strings = c('.', 'NA'))
# prepare.unique.id() is not defined in this file; presumably it comes from
# the script sourced here. It is expected to add a `unique.id` column, which
# later code uses as the join key.
source('./prepare.biochem.R')
ALL.gof.lof <- prepare.unique.id(ALL.gof.lof)
# Columns of ALL.gof.lof that are copied onto the per-fold test tables.
biochem.cols <- c('secondary_struc', 'rsa', 'conservation.entropy', 
                  'conservation.alt', 'conservation.ref', 'pLDDT',
                  'itan.gof', 'itan.lof', 'data_source')
# Lookup table from task identifiers to display names. Keys are protein
# accessions (plain, ".clean" and ".itan.split" variants) mapped to gene
# symbols, plus "IPR..." domain ids mapped to domain / family names.
uniprotID.dic <- local({
  gene.symbols <- c(
    "P21802" = "FGFR2",
    "P15056" = "BRAF",
    "P07949" = "RET",
    "P04637" = "TP53",
    "Q09428" = "ABCC8",
    "O00555" = "CACNA1A",
    "Q14654" = "KCNJ11",
    "Q99250" = "SCN2A",
    "Q14524" = "SCN5A",
    "Q14524.clean" = "SCN5A"
  )
  # Every key above except the raw "Q14524" also exists with an
  # ".itan.split" suffix (SCN5A only in its ".clean" form).
  split.symbols <- gene.symbols[names(gene.symbols) != "Q14524"]
  names(split.symbols) <- paste0(names(split.symbols), ".itan.split")
  domain.names <- c(
    "IPR016248" = "Fibroblast Growth Factor Receptor Family",
    "IPR005821" = "Ion Transport Domain",
    "IPR013518" = "Potassium Channel Inwardly Rectifying Kir Cytoplasmic Domain",
    "IPR020635" = "Tyrosine Protein Kinase Catalytic Domain"
  )
  c(gene.symbols, split.symbols, domain.names)
})
# Build the task table `result`: one row per (task id, fold) with folds 0-4.
# Task ids are read from three header-less, single-column lists and each id is
# repeated five times (once per fold).
task.id.files <- c('../scripts/gene.txt',
                   '../scripts/gene.itan.txt',
                   '../scripts/gene.pfams.txt')
task.ids <- rep(
  unlist(lapply(task.id.files, function(path) {
    as.character(read.csv(path, header = FALSE, stringsAsFactors = FALSE)$V1)
  })),
  each = 5)
n.task.rows <- length(task.ids)
result <- data.frame(task.id = task.ids, stringsAsFactors = FALSE)
result$fold <- rep(0:4, n.task.rows / 5)
result$model <- rep("PreMode", n.task.rows)
result$task.name <- result$task.id
result$task.size.lof <- rep(NA_integer_, n.task.rows)
result$task.size.gof <- rep(NA_integer_, n.task.rows)
result$task.type <- rep(NA_character_, n.task.rows)

# Display name, task type and label counts depend only on the task id, so they
# are computed once per distinct id (the CSVs used to be re-read for every one
# of the five fold rows) and then mapped back onto the rows.
unique.task.ids <- unique(task.ids)
task.info <- lapply(unique.task.ids, function(id) {
  # Split "<accession>[.<part>...]" on literal dots and translate the first
  # part through uniprotID.dic (NA when the id is not in the dictionary).
  parts <- strsplit(id, "\\.")[[1]]
  parts[1] <- uniprotID.dic[parts[1]]
  type <- "Gene"
  if (startsWith(id, "IPR")) {
    # paste() with the default sep = " " gives "Domain:  <name>" (two spaces);
    # kept unchanged because downstream code may match on these labels.
    label <- paste("Domain: ", parts, collapse = "")
    type <- "Domain"
  } else if (grepl('.IPR', id, fixed = TRUE)) {
    # "<accession>.<IPR id>[.<mode>]": literal ".IPR" is matched (the pattern
    # used to be a regex in which "." matched any character).
    if (parts[2] %in% names(uniprotID.dic)) {
      parts[2] <- uniprotID.dic[parts[2]]
    }
    if (length(parts) >= 3) {
      if (parts[3] == 'self') {
        parts[3] <- "(Gene Only)"
        type <- "Gene.Gene"
      } else {
        parts[3] <- "(Family)"
        type <- "Gene.Domain"
      }
    }
    label <- paste0("Gene: ", paste(parts, collapse = "."))
  } else {
    label <- paste0("Gene: ", parts[1])
  }
  # Label counts over training + testing data: score == -1 is counted as LoF
  # and score == 1 as GoF (per the column names they are stored in).
  task.dir <- file.path('../data.files/ICC.seed.0', id)
  train.score <- read.csv(file.path(task.dir, 'training.csv'))$score
  test.score <- read.csv(file.path(task.dir, 'testing.csv'))$score
  list(name = label,
       type = type,
       lof = sum(train.score == -1) + sum(test.score == -1),
       gof = sum(train.score == 1) + sum(test.score == 1))
})
row.task <- match(result$task.id, unique.task.ids)
result$task.name <- vapply(task.info, function(x) x$name, character(1))[row.task]
result$task.size.lof <- vapply(task.info, function(x) x$lof, integer(1))[row.task]
result$task.size.gof <- vapply(task.info, function(x) x$gof, integer(1))[row.task]
result$task.type <- vapply(task.info, function(x) x$type, character(1))[row.task]
# add baseline AUC
# esm alphabets
source('./AUROC.R')
# The 20 one-letter residue codes, in the order used by the token list below.
alphabet_2 <- strsplit("LAGVSERTIDPKQNFYMHWC", split = "")[[1]]
# Full token list: four special tokens, the 20 residue letters, seven further
# symbols and two trailing special tokens.
alphabet <- c('<cls>', '<pad>', '<eos>', '<unk>',
              alphabet_2,
              strsplit("XBUZO.-", split = "")[[1]],
              '<null_1>', '<mask>')

source('./bind_rows.R')
# ---------------------------------------------------------------------------
# For every (task, fold) row of `result`, collect test AUCs for each base
# directory -- ensemble mean, GLM-stacked and best-single-sub-model variants,
# each for the normal and the ".large.window" run -- plus a random-forest
# baseline and the Itan baseline, and save everything as one long table.
# plot.AUC() is not defined in this file (presumably it comes from AUROC.R);
# it is used as returning a list with an `auc` element.
# ---------------------------------------------------------------------------

# Statistics reported when a run directory has no usable result files.
na.fold.stats <- list(found = FALSE, auc = NA_real_, auc.glm = NA_real_,
                      auc.bst = NA_real_, tr.auc = NA_real_,
                      min.loss = NA_real_)

# Score one run directory for one fold.
#   run.dir  path prefix of the run, e.g. paste0(base.dir, task.id)
#   fold     fold index used in the result file names
# Returns a list with
#   found     TRUE when the testing file exists
#   auc       test AUC of the mean of the four sub-model logits
#   auc.glm   test AUC of a logistic regression stacked on the four logits
#   auc.bst   test AUC of the sub-model with the highest training AUC
#   tr.auc    training AUC of the mean logits
#   min.loss  mean of the min_loss.FOLD.* columns, first training row
# Fields whose input file is missing stay NA; they used to keep the value of
# the previous iteration, or were undefined on the first one.
fold.stats <- function(run.dir, fold) {
  stats <- na.fold.stats
  test.file <- paste0(run.dir, '/testing.fold.', fold, '.4fold.csv')
  if (!file.exists(test.file)) {
    return(stats)
  }
  stats$found <- TRUE
  logit.cols <- paste0('logits.FOLD.', 0:3)
  test.df <- read.csv(test.file)
  stats$auc <- plot.AUC(test.df$score, rowMeans(test.df[, logit.cols]))$auc

  train.file <- paste0(run.dir, '/training.fold.', fold, '.4fold.csv')
  if (!file.exists(train.file)) {
    return(stats)
  }
  set.seed(0)
  train.df <- read.csv(train.file)
  # the binomial glm needs 0/1 labels; training scores are coded -1/1
  train.df$score[train.df$score == -1] <- 0
  stack.glm <- glm(score ~ logits.FOLD.0 + logits.FOLD.1 + logits.FOLD.2 + logits.FOLD.3,
                   data = train.df, family = "binomial")
  test.df$logits.glm <- predict(stack.glm, newdata = test.df, type = "response")
  stats$auc.glm <- plot.AUC(test.df$score, test.df$logits.glm)$auc
  sub.tr.auc <- vapply(logit.cols, function(col) {
    as.numeric(plot.AUC(train.df$score, train.df[[col]])$auc)
  }, numeric(1))
  best.col <- logit.cols[which.max(sub.tr.auc)]
  if (length(best.col) == 1) {
    stats$auc.bst <- plot.AUC(test.df$score, test.df[, best.col])$auc
  }
  stats$tr.auc <- plot.AUC(train.df$score, rowMeans(train.df[, logit.cols]))$auc
  stats$min.loss <- rowMeans(train.df[, paste0('min_loss.FOLD.', 0:3)])[[1]]
  stats
}

# Pull one numeric field out of a list of fold.stats() results.
pick.stat <- function(stats.list, field) {
  vapply(stats.list, function(s) as.numeric(s[[field]]), numeric(1))
}

# Random-forest baseline: writes labels and biochemical features of the train
# and test variants to temporary CSV files, runs random.forest.glof.py on them
# and returns the number found after the first '=' on the first output line.
random.forest.auc <- function(gene.train, gene.test) {
  train.label.file <- tempfile()
  test.label.file <- tempfile()
  train.biochem.file <- tempfile()
  test.biochem.file <- tempfile()
  # remove the temporary files even when the external call fails
  on.exit(unlink(c(train.label.file, test.label.file,
                   train.biochem.file, test.biochem.file)), add = TRUE)
  write.csv(gene.train, file = train.label.file)
  write.csv(gene.test, file = test.label.file)
  gene.train.biochem <- prepare.biochemical(
    ALL.gof.lof[match(gene.train$unique.id, ALL.gof.lof$unique.id), ])
  gene.test.biochem <- prepare.biochemical(
    ALL.gof.lof[match(gene.test$unique.id, ALL.gof.lof$unique.id), ])
  write.csv(gene.train.biochem, file = train.biochem.file)
  write.csv(gene.test.biochem, file = test.biochem.file)
  res <- system(paste(python.path,
                      'random.forest.glof.py',
                      train.biochem.file,
                      train.label.file,
                      test.biochem.file,
                      test.label.file), intern = TRUE)
  if (length(res) == 0) {
    stop("random.forest.glof.py produced no output", call. = FALSE)
  }
  as.numeric(strsplit(res, split = '=')[[1]][2])
}

n.base <- length(base.dirs)
gof.chunks <- vector("list", nrow(result))
for (i in seq_len(nrow(result))) {
  task.id <- result$task.id[i]
  fold <- result$fold[i]

  # PreMode-style models: one normal and one large-window run per base dir.
  # The large-window run is only looked at when the normal run has a testing
  # file; otherwise both are reported as NA.
  main.stats <- vector("list", n.base)
  lw.stats <- vector("list", n.base)
  for (m in seq_len(n.base)) {
    run.dir <- paste0(base.dirs[m], task.id)
    main.stats[[m]] <- fold.stats(run.dir, fold)
    if (main.stats[[m]]$found) {
      lw.stats[[m]] <- fold.stats(paste0(run.dir, '.large.window'), fold)
    } else {
      lw.stats[[m]] <- na.fold.stats
    }
  }

  # add itan: AUC of 1 - itan.gof on test variants whose data_source does not
  # mention "Itan". The PreMode/ testing file is required here (no fallback).
  premode.test.file <- paste0('PreMode/', task.id, '/testing.fold.',
                              fold, '.4fold.csv')
  premode.train.file <- paste0('PreMode/', task.id, '/training.fold.',
                               fold, '.4fold.csv')
  baseline.result <- read.csv(premode.test.file)
  test.result <- prepare.unique.id(baseline.result)
  test.result[, biochem.cols] <- ALL.gof.lof[match(test.result$unique.id,
                                                   ALL.gof.lof$unique.id),
                                             biochem.cols]
  not.itan <- !grepl("Itan", test.result$data_source)
  baseline.auc.6 <- plot.AUC(test.result$score[not.itan],
                             1 - test.result$itan.gof[not.itan])

  # add random forest baseline (needs both PreMode/ train and test files)
  if (file.exists(premode.train.file) && file.exists(premode.test.file)) {
    gene.train <- prepare.unique.id(read.csv(premode.train.file))
    # same file as baseline.result, so the copy already in memory is reused
    gene.test <- prepare.unique.id(baseline.result)
    baseline.auc.5 <- list(auc = random.forest.auc(gene.train, gene.test))
  } else {
    baseline.auc.5 <- list(auc = NA)
  }

  # aggregate results: 6 variants per base dir + random forest + Itan
  to.append <- result[rep(i, n.base * 6 + 2), ]
  to.append$auc <- c(pick.stat(main.stats, "auc"), pick.stat(lw.stats, "auc"),
                     pick.stat(main.stats, "auc.glm"), pick.stat(lw.stats, "auc.glm"),
                     pick.stat(main.stats, "auc.bst"), pick.stat(lw.stats, "auc.bst"),
                     baseline.auc.5$auc,
                     baseline.auc.6$auc)
  to.append$model <- c(base.dirs, paste0(base.dirs, '.lw'),
                       paste0(base.dirs, '.glm'), paste0(base.dirs, '.lw.glm'),
                       paste0(base.dirs, '.bst'), paste0(base.dirs, '.lw.bst'),
                       "random.forest", "Itan.1")
  # NOTE(review): these counts read data_source from the raw testing CSV with
  # an exact != "Itan" test, whereas the AUC above uses grepl("Itan", ...) on
  # the column joined from ALL.gof.lof -- confirm which one is intended.
  to.append$noitan.gof <- sum(test.result$score==1 & baseline.result$data_source != "Itan")
  to.append$noitan.lof <- sum(test.result$score==-1 & baseline.result$data_source != "Itan")
  tr.auc.block <- c(pick.stat(main.stats, "tr.auc"), pick.stat(lw.stats, "tr.auc"))
  val.loss.block <- c(pick.stat(main.stats, "min.loss"), pick.stat(lw.stats, "min.loss"))
  # training AUC / validation loss are shared by the plain, glm and bst rows
  to.append$tr.auc <- c(rep(tr.auc.block, 3), rep(NA, 2))
  to.append$val.loss <- c(rep(val.loss.block, 3), rep(NA, 2))
  gof.chunks[[i]] <- to.append
}
# bind once instead of growing the table inside the loop
result.gof <- if (length(gof.chunks) > 0) do.call(rbind, gof.chunks) else data.frame()
saveRDS(result.gof, 'figs/fig.5.prepare.RDS')