File size: 4,022 Bytes
# parse dnv table to HGNC, columns:
# Reauired: VarID, Score, HGNC, aaChg
# Optional: offset
source('./parse.input.table/parse.variant.wt.sequence.R')

dnv.table.to.uniprot.by.af2.uniprotID.parallel <- function(
    dnv.table, VarID.column, Score.column,
    uniprotID.column, aaChg.column, offset.column=NA, length.column=NA, 
    match.length=FALSE, njobs=96) {
  library(doParallel)
  cl <- makeCluster(njobs)
  registerDoParallel(cl)
  if (!is.na(offset.column)) {
    offset <- as.numeric(dnv.table[,offset.column])
  } else {
    offset <- rep(0, dim(dnv.table)[1])
  }
  offset[is.na(offset)] <- 0
  if (match.length) { 
    prompt <- data.frame(VarID = dnv.table[,VarID.column],
                         aaChg = dnv.table[,aaChg.column],
                         score = dnv.table[,Score.column],
                         uniprotID = dnv.table[,uniprotID.column],
                         seq.match.length = dnv.table[,length.column])
  } else {
    prompt <- data.frame(VarID = dnv.table[,VarID.column],
                         aaChg = dnv.table[,aaChg.column],
                         score = dnv.table[,Score.column],
                         uniprotID = dnv.table[,uniprotID.column])
  }
  uniprot_ID.mapping <- read.csv('./parse.input.table/swissprot_and_human.full.seq.csv')
  # order to make sure first use longest transcript
  # uniprot_ID.mapping <- uniprot_ID.mapping[order(uniprot_ID.mapping$Length, decreasing = T),]
  result <- foreach (i = 1:dim(prompt)[1], .combine = rbind, .multicombine=TRUE) %dopar% {
    source('./parse.input.table/parse.variant.wt.sequence.R')
    substitute_res <- list(ref=NA, pos=NA, alt=NA, wt = NA,
                           sequence = NA, sequence.len = NA,
                           seq.start = NA, seq.end = NA,
                           pos.orig = NA, sequence.orig = NA,
                           wt.orig = NA, sequence.len.orig = NA)
    aaChg <- prompt$aaChg[i]
    uniprot_IDs <- prompt$uniprotID[i]
    if (grepl("-", uniprot_IDs) | !uniprot_IDs %in% uniprot_ID.mapping$uniprotID) {
      url.request <- paste0('https://rest.uniprot.org/uniprotkb/', uniprot_IDs, '.fasta')
      txt <- strsplit(RCurl::getURL(url.request), split = '\n')[[1]]
      wt.sequences <- paste(txt[2:length(txt)], collapse = '')
    } else {
      wt.sequences <- uniprot_ID.mapping$seq[match(uniprot_IDs, uniprot_ID.mapping$uniprotID)]
    }
    if (match.length) {
      if (!is.na(wt.sequences) & nchar(wt.sequences) != prompt$seq.match.length[i]) {
        wt.sequences <- NA
      }
    }
    j = 1
    while (is.na(substitute_res$sequence) & j <= length(uniprot_IDs)) {
      matched_uniprot_ID <- uniprot_IDs[j]
      wt.sequence <- wt.sequences[j]
      # parse variant
      substitute_res <- parse_one_substitute(aaChg, wt.sequence, offset[i])
      j = j + 1
    }
    tmp <- data.frame(VarID = prompt$VarID[i],
                      aaChg = prompt$aaChg[i],
                      uniprotID = matched_uniprot_ID,
                      ref = substitute_res$ref,
                      pos = substitute_res$pos,
                      alt = substitute_res$alt,
                      score = prompt$score[i],
                      sequence = substitute_res$sequence,
                      wt = substitute_res$wt,
                      sequence.len = substitute_res$sequence.len,
                      seq.start = substitute_res$seq.start,
                      seq.end = substitute_res$seq.end,
                      pos.orig = substitute_res$pos.orig,
                      sequence.orig = substitute_res$sequence.orig,
                      wt.orig = substitute_res$wt.orig,
                      sequence.len.orig = substitute_res$sequence.len.orig)
    tmp
  }
  stopCluster(cl)
  dnv.table$place.holder <- NA
  result <- dplyr::bind_cols(result, dnv.table[,!colnames(dnv.table) %in% colnames(result)])
  result$place.holder <- NULL
  result.noNA <- result[!is.na(result$sequence),]
  output <- list(result=result,
                 result.noNA=result.noNA)
  output
}