File size: 12,372 Bytes
7718235 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 |
# Visualize prediction heatmaps together with DSSP secondary-structure tracks.
# NOTE(review): this copy of the script appears to have lost some tokens and
# closing brackets (see the unclosed rbind() below); compare against the
# original file before running.
# Identifiers to plot; each one is passed to drawProteins::get_features() and
# compared with af2.seqs$uniprotID further down.
genes <- c("P60484")
# plot original scores
# NOTE(review): the rbind() call on the next line is never closed in this copy;
# its remaining arguments are missing.
original <- rbind(read.csv('../data.files/PTEN.bin/train.seed.0.csv'),
# Sequence table; its `seq` and `uniprotID` columns are read later in the script.
af2.seqs <- read.csv('genes.full.seq.csv', row.names = 1)
# Twenty single-letter amino-acid codes. Not referenced again in the visible
# part of the script.
aa.dict <- c('L', 'A', 'G', 'V', 'S', 'E', 'R', 'T', 'I', 'D',
'P', 'K', 'Q', 'N', 'F', 'Y', 'M', 'H', 'W', 'C')
# Directory the per-gene result CSVs are read from and the PDFs are written to.
log.dir <- '5genes.all.mut/PreMode/'
# Fold ids: -1 selects the '<gene>.pretrain.csv' result, 0:4 select the
# '<gene>.fold.<k>.csv' results and the matching train/test seed files.
folds <- c(-1, 0:4)
# Per-gene loop: build the secondary-structure annotation rows (secondary.df).
# NOTE(review): the closing braces of the loops in this section, and parts of
# several calls, are missing from this copy; nesting cannot be confirmed here.
for (gene in genes) {
# Fetch the feature table for this gene and flatten it to a data frame.
prot_data <- drawProteins::get_features(gene)
prot_data <- drawProteins::feature_to_dataframe(prot_data)
# Keep only the HELIX / STRAND / TURN feature rows.
secondary <- prot_data[prot_data$type %in% c("HELIX", "STRAND", "TURN"),]
secondary.df <- data.frame()
# Expand every begin:end range into one row per position, placed on the
# ".anno_secondary" row of the heatmap's y axis (the `alt` column).
for (i in 1:dim(secondary)[1]) {
sec.df <- data.frame(pos.orig = secondary$begin[i]:secondary$end[i],
alt = ".anno_secondary",
ANNO_secondary = secondary$type[i])
secondary.df <- dplyr::bind_rows(secondary.df, sec.df)
# plot the AF2 predicted secondary.df and rsa
# NOTE(review): the paste0() below is cut off after `1,`; the rest of the file
# name is missing.
gene.af2.file <- paste0("../data.files/af2.files/AF-",
gene, '-F', 1,
# NOTE(review): the dssp() call is also cut off. dssp()/read.pdb() are
# presumably from bio3d -- confirm.
dssp.res <- dssp(read.pdb(gene.af2.file),
pdb.res <- read.pdb(gene.af2.file)
# Takes the `b` column of the atom table at the C-alpha positions; presumably
# the per-residue pLDDT stored in the B-factor field of the model -- confirm.
plddt.res <- pdb.res$atom$b[pdb.res$calpha]
# NOTE(review): the object before `$helix`, `$sheet` and `$turn` is missing on
# the next three lines and the parentheses do not balance.
af2.secondary <- rbind(cbind($helix)[,1:4], type="HELIX"),
cbind($sheet), type="STRAND"),
cbind($turn), type="TURN"))
# Expand the start:end ranges of af2.secondary the same way, on the
# ".anno_af2_secondary" row.
for (i in 1:dim(af2.secondary)[1]) {
sec.df <- data.frame(pos.orig = af2.secondary$start[i]:af2.secondary$end[i],
alt = ".anno_af2_secondary",
ANNO_secondary = af2.secondary$type[i])
secondary.df <- dplyr::bind_rows(secondary.df, sec.df)
# Per-position tracks built from dssp.res$acc and plddt.res.
# NOTE(review): both data.frame() calls are truncated; the value columns
# (plotted later as ANNO_RSA and ANNO_pLDDT) are not visible here.
rsa.df <- data.frame(pos.orig=1:length(dssp.res$acc), alt = ".anno_af2_rsa",
plddt.df <- data.frame(pos.orig=1:length(plddt.res), alt = ".anno_af2_pLDDT",
# plot the domain types that only have one row of description
# NOTE(review): closing braces of the loops and if-blocks in this section are
# missing from this copy; nesting cannot be confirmed here.
# Keep features that carry a description and drop the listed feature types.
others <- prot_data[prot_data$description != "NONE",]
others <- others[!others$type %in% c("VARIANT", "MUTAGEN", "CONFLICT", "VAR_SEQ", "CHAIN"),]
# Relabel MOD_RES rows, replace the type of DOMAIN rows with their description,
# then lower-case every type.
others$type[others$type=="MOD_RES"] <- "post transl. mod."
others$type[others$type=="DOMAIN"] <- others$description[others$type=="DOMAIN"]
others$type <- tolower(others$type)
unique.df <- data.frame()
for (i in 1:dim(others)[1]) {
# One row per position in begin:end; `alt` is the type prefixed with "." and
# the fill value is the type itself.
unq.df <- data.frame(pos.orig = others$begin[i]:others$end[i],
alt = paste0(".", others$type[i]),
ANNO_domain_type = others$type[i])
unique.df <- dplyr::bind_rows(unique.df, unq.df)
# True when this row's type differs from both the next and the previous row.
# At the first/last row the i-1 / i+1 lookup falls outside the vector, so
# identical() is FALSE for that side.
# NOTE(review): as shown, the same unq.df is appended unconditionally above and
# again under this condition; check whether the first copy was meant to stay.
if(!identical(others$type[i],others$type[i+1]) && !identical(others$type[i],others$type[i-1])){
unq.df <- data.frame(pos.orig = others$begin[i]:others$end[i],
alt = paste0(".", others$type[i]),
ANNO_domain_type = others$type[i])
unique.df <- dplyr::bind_rows(unique.df, unq.df)
# plot the other domain types that have multiple kinds of descriptions
multiple.df <- data.frame()
for (i in 1:dim(others)[1]) {
# Rows whose type matches the next or previous row; here the fill value is the
# row's description rather than its type.
if(identical(others$type[i],others$type[i+1]) | identical(others$type[i],others$type[i-1])){
mult.df <- data.frame(pos.orig = others$begin[i]:others$end[i],
alt = paste0(".", others$type[i]),
ANNO_domain_type = others$description[i])
multiple.df <- dplyr::bind_rows(multiple.df, mult.df)
# Per-fold section: load each fold's predictions, accumulate the logits and
# build one heatmap per fold.
# NOTE(review): this section is badly damaged in this copy -- closing braces
# are missing and several statements have lost identifiers (flagged inline).
# Restore it from the original file before running.
gene.seq <- af2.seqs$seq[af2.seqs$uniprotID==gene]
# Labels of the form "<position>:<residue>". Not referenced again in the
# visible part of the script.
xlabs <- strsplit(gene.seq, "")[[1]]
xlabs <- paste0(1:nchar(gene.seq), ":", xlabs)
# NOTE(review): the next line chains `<- 0 <- data.frame()`; a variable name
# appears to be missing between the two statements.
assemble.logits <- 0 <- data.frame()
all.pretrain <- data.frame()
patch.plot <- list()
for (fold in folds) {
if (fold == -1) {
# Pretrain results and the pretraining table.
gene.result <- read.csv(paste0(log.dir, gene, '.pretrain.csv'), row.names = 1)
pretrain.result <- gene.result
training.file <- read.csv(paste0('../data.files/pretrain/training.csv'))[,c("HGNC", "uniprotID", "pos.orig", "ref", "alt", "score", "data_source")]
# Any non-zero score becomes 1.
training.file$score[training.file$score!=0] <- 1
# NOTE(review): the trailing `<- training.file` makes the next line an
# assignment to a subset expression; a target name appears to be missing.
training.file <- training.file[training.file$uniprotID == gene,] <- training.file
} else {
# Fold-specific results plus the matching train/test seed files; both get a
# `score` column filled with NA.
gene.result <- read.csv(paste0(log.dir, gene, '.fold.', fold, '.csv'), row.names = 1)
training.file <- read.csv(paste0('../data.files/PTEN.bin/train.seed.', fold, '.csv'))[,c("HGNC", "pos.orig", "ref", "alt", "score.1", "score.2")]
training.file$score <- NA
testing.file <- read.csv(paste0('../data.files/PTEN.bin/test.seed.', fold, '.csv'))[,c("HGNC", "pos.orig", "ref", "alt", "score.1", "score.2")]
testing.file$score <- NA
# Taken when the result has no `logits` column, or for any fold other than -1.
if (!"logits" %in% colnames(gene.result) | fold != -1) {
# Three-column matrix: pretrain logits, then this fold's logits.0 and logits.1.
logits <- cbind(pretrain.result$logits, gene.result$logits.0, gene.result$logits.1)
# Shift the fold columns up by one so logits.0 can hold the pretrain logits.
gene.result$logits.2 <- gene.result$logits.1
gene.result$logits.1 <- gene.result$logits.0
gene.result$logits.0 <- pretrain.result$logits
# Running sum over folds; divided by the fold count later in the script.
assemble.logits <- assemble.logits + logits
# NOTE(review): the next two lines have lost their variable names (chained
# `<-` after list(), empty first argument to list()).
ps <- list() <- paste0("logits.", c(0:2)) <- c('score', 'score.1', 'score.2')
data.train <- list(, training.file, training.file)
# One panel per j in 1:3: prediction tiles, training-label tiles, then the
# annotation tracks, each on its own fill scale via ggnewscale::new_scale_fill().
# NOTE(review): the `[j]` arguments to aes_string() and labs() have lost the
# vector being indexed and the argument name.
for (j in 1:3) {
ps[[j]] <- ggplot() +
geom_tile(data=gene.result, aes_string(x="pos.orig", y="alt",[j])) +
scale_fill_gradientn(colors = c("light blue", "white", "pink"), na.value = 'grey') + labs([j]) +
scale_x_continuous(breaks=seq(0, nchar(gene.seq), 50)) +
ggnewscale::new_scale_fill() +
geom_tile(data=data.train[[j]], aes_string(x="pos.orig", y="alt",[j])) +
scale_fill_gradientn(colors = c("blue", "white", "red")) +
ggnewscale::new_scale_fill() +
geom_tile(data=secondary.df, aes(x=pos.orig, y=alt, fill=ANNO_secondary, width=1)) +
ggnewscale::new_scale_fill() +
geom_tile(data=unique.df, aes(x=pos.orig, y=alt, fill=ANNO_domain_type, width=1),show.legend = F) +
ggnewscale::new_scale_fill() +
geom_tile(data=multiple.df, aes(x=pos.orig, y=alt, fill=ANNO_domain_type, width=1),show.legend = F) +
theme_bw() +
ggtitle("PTEN") + ggeasy::easy_center_title()
# Combine the three panels in one row (plot_layout() is presumably patchwork --
# confirm).
p <- ps[[1]] + ps[[2]] + ps[[3]] + plot_layout(nrow = 1)
} else {
# Single panel: `logits` tiles, training `score` tiles, annotation tracks.
p <- ggplot() +
geom_tile(data=gene.result, aes(x=pos.orig, y=alt, fill=logits)) +
scale_fill_gradientn(colors = c("light blue", "white", "pink"), na.value = 'grey') +
scale_x_continuous(breaks=seq(0, nchar(gene.seq), 50)) +
ggnewscale::new_scale_fill() +
geom_tile(data=training.file, aes(x=pos.orig, y=alt, fill=score)) +
scale_fill_gradientn(colors = c("blue", "white", "red")) +
ggnewscale::new_scale_fill() +
geom_tile(data=secondary.df, aes(x=pos.orig, y=alt, fill=ANNO_secondary, width=1)) +
ggnewscale::new_scale_fill() +
geom_tile(data=unique.df, aes(x=pos.orig, y=alt, fill=ANNO_domain_type, width=1),show.legend = F) +
ggnewscale::new_scale_fill() +
geom_tile(data=multiple.df, aes(x=pos.orig, y=alt, fill=ANNO_domain_type, width=1),show.legend = F) +
theme_bw() +
ggtitle("PTEN") + ggeasy::easy_center_title()
# Folds other than -1 store their plot at index fold + 1 of patch.plot.
# NOTE(review): the rest of the next line (`<- dplyr::bind_rows(, ...)`) has
# lost its target and first argument; the bind_rows() call two lines below it
# is cut off as well.
if (fold != -1) {
patch.plot[[fold+1]] <- p <- dplyr::bind_rows(, training.file, testing.file)
} else {
all.pretrain <- dplyr::bind_rows(all.pretrain,
# Fold-averaged section: average the accumulated logits, draw the summary
# figure and write it to disk.
# NOTE(review): closing braces are missing and several statements in this
# section have lost identifiers (flagged inline); restore from the original
# file before running.
# Divide the accumulated logits by the number of entries in `folds` minus one.
assemble.logits <- assemble.logits / (length(folds) - 1)
# A result with dimensions is split column-wise into logits.0/1/2 and the
# single `logits` column is dropped; otherwise it is stored as `logits`.
if (!is.null(dim(assemble.logits))) {
gene.result$logits.0 <- assemble.logits[,1]
gene.result$logits.1 <- assemble.logits[,2]
gene.result$logits.2 <- assemble.logits[,3]
gene.result$logits <- NULL
} else {
gene.result$logits <- assemble.logits
if (!"logits" %in% colnames(gene.result)) {
# logits.diff is logits.2 minus logits.1.
# NOTE(review): everything after that subtraction on the next line is a run of
# assignments whose targets are missing.
gene.result$logits.diff <- gene.result$logits.2 - gene.result$logits.1 <- gene.result <- <- secondary.df <- unique.df <- multiple.df
# NOTE(review): the next line is several statements fused together with the
# data-frame name missing before each `$score.diff` / `$score.1` / `$score.2`.
# The visible fragments set score.diff to 0, then 1 where score.1==0 and
# score.2==1, -1 where score.1==1 and score.2==0, and NA where both are 1.
ps <- list() <- c(paste0("logits.", c(0:2)), 'logits.diff')$score.diff <- 0$score.diff[$score.1==0 &$score.2==1] <- 1$score.diff[$score.1==1 &$score.2==0] <- -1$score.diff[$score.1==1 &$score.2==1] <- NA <- c('score', 'score.1', 'score.2', 'score.diff') <- c('Patho', 'Stability', 'Enzyme', 'Enzyme-Stability')
# One panel per j in 1:4.
# NOTE(review): the assignment targets in the if/else below are missing; the
# visible part only shows all.pretrain being used when j is 1.
for (j in 1:4) {
if (j %in% c(1)) { <- all.pretrain
} else { <-
# Layers, each on its own fill scale: prediction tiles, label tiles limited to
# c(0,1), secondary structure, rsa.df, plddt.df, then two domain tracks.
# NOTE(review): several geom_tile() calls have an empty data argument and the
# `[j]` arguments have lost the vector being indexed.
ps[[j]] <- ggplot() +
geom_tile(data=gene.result, aes_string(x="pos.orig", y="alt",[j])) + labs([j]) +
scale_fill_gradientn(colors = c("light blue", "white", "pink"), na.value = 'grey') +
scale_x_continuous(breaks=seq(0, nchar(gene.seq), 50), minor_breaks = seq(0, nchar(gene.seq), 10)) +
labs([j]) +
ggnewscale::new_scale_fill() +
geom_tile(, aes_string(x="pos.orig", y="alt",[j], width=1, height=1)) +
scale_fill_gradientn(colors = c("blue", "white", "red"), limits = c(0,1)) +
ggnewscale::new_scale_fill() +
geom_tile(, aes(x=pos.orig, y=alt, fill=ANNO_secondary, width=1, height=1)) +
ggnewscale::new_scale_fill() +
geom_tile(data=rsa.df, aes(x=pos.orig, y=alt, fill=ANNO_RSA, width=1, height=1)) +
scale_fill_gradientn(colors = c("grey", "blue")) +
ggnewscale::new_scale_fill() +
geom_tile(data=plddt.df, aes(x=pos.orig, y=alt, fill=ANNO_pLDDT, width=1, height=1)) +
scale_fill_gradientn(colors = c("orange", "yellow", "lightblue", "blue")) +
ggnewscale::new_scale_fill() +
geom_tile(, aes(x=pos.orig, y=alt, fill=ANNO_domain_type, width=1, height=1),show.legend = F) +
ggnewscale::new_scale_fill() +
geom_tile(, aes(x=pos.orig, y=alt, fill=ANNO_domain_type, width=1, height=1),show.legend = F) +
theme_bw() + theme(legend.position="bottom") +
ggtitle("PTEN") + ggeasy::easy_center_title()
# The four-panel layout is immediately overwritten by the two-panel layout
# (panels 1 and 4); only the latter reaches ggsave().
p <- ps[[1]] + ps[[2]] + ps[[3]] + ps[[4]] + plot_layout(nrow=4)
p <- ps[[1]] + ps[[4]] + plot_layout(nrow=2)
# Width is sequence length / 70, capped at 49.9 and then raised to at least 25.
ggsave(paste0(log.dir, gene, '.part.pdf'), p, width = max(25, min(nchar(gene.seq)/70, 49.9)), height = 10)
} else {
# Single-panel figure for results that carry a plain `logits` column.
# NOTE(review): scale_x_continuous() is added twice in this chain (breaks every
# 50 with minor breaks, then breaks every 100); ggplot2 is expected to keep
# only the later one -- confirm which was intended. The second geom_tile() has
# an empty data argument.
p <- ggplot() +
geom_tile(data=gene.result, aes(x=pos.orig, y=alt, fill=logits)) +
scale_x_continuous(breaks=seq(0, nchar(gene.seq), 50), minor_breaks = seq(0, nchar(gene.seq), 10)) +
scale_fill_gradientn(colors = c("light blue", "white", "pink"), na.value = 'grey') +
ggnewscale::new_scale_fill() +
geom_tile(, aes(x=pos.orig, y=alt, fill=score)) +
scale_fill_gradientn(colors = c("blue", "white", "red")) +
ggnewscale::new_scale_fill() +
geom_tile(data=secondary.df, aes(x=pos.orig, y=alt, fill=ANNO_secondary)) +
theme_bw() +
scale_x_continuous(breaks=seq(0, nchar(gene.seq), 100)) +
ggtitle("PTEN") + ggeasy::easy_center_title()
ggsave(paste0(log.dir, gene, '.pdf'), p, width = nchar(gene.seq)/50, height = 4)
# Combines the five stored per-fold plots with `/` (presumably patchwork's
# vertical stacking -- confirm). The result is not saved in the visible code.
p <- patch.plot[[1]] / patch.plot[[2]] / patch.plot[[3]] / patch.plot[[4]] / patch.plot[[5]]
# Moves the '.part.pdf' output to its figure name; the gene id and both paths
# are hard-coded here rather than derived from `gene` / `log.dir`.
system('mv 5genes.all.mut/PreMode/P60484.part.pdf figs/fig.sup.12.pdf')