Spaces:

InstaDeepAI
/

folding-studio-demo

Running

File size: 10,206 Bytes

a3f3d91

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import multiprocessing as mp
from subprocess import Popen, PIPE
from collections import OrderedDict
from os.path import join, isfile
from .postProcessing import make_auto_mut_plot
import os
import re
import shutil
from . import logger

_name = "Auto_mut"  # Module name attached to every message sent to the logger

# Settings that control the automated mutation run - maybe they get to be parameters in the future
_score_threshold = -0.2  # A residue's A3D score has to exceed this for the residue to be considered for mutation
_energy_threshold = 0.0   # Mutations with an energy difference above this are flagged by _analyze_results
_score_diff_threshold = 5  # Largest accepted rise of the average score over the base case; +5 should accept all
                           # while 0 would already be quite restrictive
_target_mutations = ["E", "K", "D", "R"]  # Glutamic acid, lysine, aspartic acid, arginine

# The slicing in this code can be hard to read so here is a short summary:
# Chain ID and residue names come as one-letter codes while the residue number can have more than one char
# Final mutation code is <Old residue><New residue><Residue number><Chain ID>
# The auto_mutation to_exclude argument comes as <Residue number><Chain ID>
# The label built from the A3D file comes as <Old residue><Residue number><Chain ID>


def run_auto_mutation(work_dir, options, foldx_loc, distance, ph):
    """
    Run the automated mutation procedure for the job located in work_dir

    Picks candidate mutations from work_dir/A3D.csv, runs one job per candidate mutation in a process pool, writes
    Mutations_summary.csv, moves the per-mutation results into work_dir and finally draws the plot.
    If no candidate mutation is found an empty Mutations_summary.csv is left behind and nothing else is done.

    options go as follows: number of residues to mutate, number of worker processes and, optionally, the residues
    to exclude (each given as <Residue number><Chain ID>)
    foldx_loc, distance and ph are handed over unchanged (distance as a string) to every mutation job

    Any exception raised while plotting is logged and re-raised
    """
    n_mutations = options[0]
    n_processes = options[1]
    to_exclude = options[2] if len(options) > 2 else []     # the exclusion list is optional
    mutations, avg_score = _mutation_list(work_dir=work_dir, excluded_list=to_exclude, n_mutations=n_mutations)
    if not mutations:
        with open(join(work_dir, "Mutations_summary.csv"), "w"):   # leave an empty file for the server
            pass
        return

    # The pool is only started once there is work for it and it is always shut down afterwards, so no worker
    # processes are left behind - neither on success nor when one of the jobs raises
    pool = mp.Pool(n_processes)
    try:
        pool.map(_run_job, [(i, foldx_loc, work_dir, str(distance), ph) for i in mutations])
    finally:
        pool.close()
        pool.join()

    _analyze_results(work_dir=work_dir, output_file="Mutations_summary.csv", mutation_list=mutations,
                     base_avg_score=avg_score)
    _cleanup(work_dir=work_dir, mutation_list=mutations)
    try:
        _plots(work_dir=work_dir)
    except Exception:  # This is hopefully not needed but in case something happens the user will at least see a
                       # message in the log before the traceback
        logger.critical(module_name=_name, msg="It seems that all the mutation attempts failed or some other unexpected"
                                               " error arisen while trying to plot the automated mutations.")
        raise


def _mutation_list(work_dir, excluded_list, n_mutations):
    """
    Select the residues to mutate and build the list of mutation codes for them

    Residues are taken in the order given by _parse_a3dcsv. A residue is selected when its score is above
    _score_threshold and non-zero, it is not in excluded_list (entries given as <Residue number><Chain ID>) and it
    is not already one of _target_mutations. Each selected residue yields one mutation code per entry of
    _target_mutations. Selection stops once n_mutations residues were picked.

    Return a tuple: list of mutation codes (<Old residue><New residue><Residue number><Chain ID>), average score of
    the whole structure. When A3D.csv is missing or holds no residues an empty list and 0.0 are returned.
    """
    a3d_path = os.path.join(work_dir, "A3D.csv")
    scores = _parse_a3dcsv(a3d_path)
    if not scores:
        # _parse_a3dcsv gives False for an unreadable file and an empty dict for a file without residues. Neither
        # can be averaged, so report it and let the caller deal with an empty mutation list
        logger.critical(module_name=_name, msg="Couldn't read any residue scores from %s. "
                                               "No automated mutations performed." % a3d_path)
        return [], 0.0
    avg_score = sum(scores.values())/len(scores)
    mutation_list = []
    counter = 0
    for residue, value in scores.items():
        position = residue[1:]  # <Residue number><Chain ID>, the same syntax the exclusion list uses
        if value > _score_threshold and position not in excluded_list and residue[0] not in _target_mutations \
                and value != 0:
            mutation_list.extend(["%s%s%s" % (residue[0], i, position) for i in _target_mutations])
            logger.info(module_name=_name, msg="Residue number %s from chain %s and a score of %.3f (%s) selected "
                                               "for automated muatation" % (residue[1:-1], residue[-1], value,
                                                                            _aa_dict_F[residue[0]]))
            counter += 1
            if counter >= n_mutations:
                break
        elif value > _score_threshold and position in excluded_list:
            logger.info(module_name=_name, msg="Residue number %s from chain %s and a score of %.3f omitted "
                                               "from automated muatation (excluded by the user)." % (residue[1:-1], residue[-1], value))
    if not mutation_list:
        logger.critical(module_name=_name, msg="Couldn't find residues suitable for automated mutations (exceeding a "
                                               "threshold of %.2f). No automated mutations performed." % _score_threshold)
    return mutation_list, avg_score


def _parse_a3dcsv(filepath):  #TODO this is done on multiple occasions so maybe should be unified somewhere
    """
    Read an A3D csv file and return an OrderedDict of label:score type, highest score first.
    The label is built as one letter code + residue ID + chain ID (the mutation syntax).
    The first matched line (the header) is skipped. Return False if the file can't be read.
    """
    row_pattern = re.compile(r"^(.*),(.*),(.*),(.*),(.*)$", re.M)
    try:
        with open(filepath, 'r') as handle:
            content = handle.read()
    except IOError:
        return False    # The mutation likely failed this should pass the info to analyze_results
    rows = row_pattern.findall(content.replace("\r", ""))
    by_label = OrderedDict()
    for _, chain_id, residue_id, one_letter, raw_score in rows[1:]:
        by_label[one_letter + residue_id + chain_id] = float(raw_score)
    ranked = sorted(by_label.items(), key=lambda item: item[1], reverse=True)
    return OrderedDict(ranked)


def _run_job(args):
    """
    Run a single A3D job with a specific mutation
    args go as follows: mutation code, FoldX location, main job's work dir, a3d distance argument (as a string),
    pH value (anything falsy means no pH is passed on)

    The process changes its current directory to the work dir before aggrescan is started.
    A non-zero exit code of aggrescan is only reported as a warning, nothing is raised for it.
    """
    mutation, foldx, work_dir, distance, ph = args
    os.chdir(work_dir)
    cmd = ["aggrescan", "-i", "output.pdb", "-v", "4", "-w", mutation, "-m", mutation, "-f", foldx,
           "--subprocess", "--distance", distance]
    if ph:
        # Only forward the pH when one was actually given. The check used to be inverted, which dropped a
        # requested pH and put an empty value on the command line otherwise. Popen needs strings, hence str()
        cmd.extend(["-ph", str(ph)])
    logger.info(module_name=_name, msg="Mutating residue number %s from chain %s (%s) into %s "
                                       " " % (mutation[2:-1], mutation[-1], _aa_dict_F[mutation[0]],
                                              _aa_dict_F[mutation[1]]))  # converting letters into full names
    proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
    proc.communicate()  # wait for the run to finish, its output is not used here
    if proc.returncode != 0:
        logger.warning(module_name=_name, msg="Mutation %s could have failed (this can be ignored if the main program "
                                              "reports the energy difference). Simulation log for that run should be "
                                               "available at %s" % (mutation,
                                                                    os.path.join(work_dir, mutation, "Aggrescan.error")))


def _analyze_results(work_dir, output_file, mutation_list, base_avg_score):
    """
    Gather the energy difference and average score of every mutation that produced an A3D.csv, log them and write
    them to output_file (inside work_dir) sorted by energy difference, lowest first.
    Mutations without readable scores are skipped entirely.
    Return the list of mutations that exceed _energy_threshold or _score_diff_threshold; they are still listed in
    the output_file.
    """
    rejected = []
    summary = OrderedDict()
    for mutation in mutation_list:
        mutant_dir = os.path.join(work_dir, mutation)
        scores = _parse_a3dcsv(os.path.join(mutant_dir, "A3D.csv"))
        if not scores:
            continue
        with open(os.path.join(mutant_dir, "MutantEnergyDiff"), 'r') as f:
            mutation_energy = float(f.read().split()[0])    # The first token of the file is taken as the energy
        residue_scores = list(scores.values())
        avg_score = sum(residue_scores)/len(residue_scores)
        score_diff = avg_score - base_avg_score
        summary[mutation] = [mutation_energy, avg_score, score_diff]
        too_costly = mutation_energy > _energy_threshold
        if too_costly or score_diff > _score_diff_threshold:
            rejected.append(mutation)
        logger.info(module_name=_name, msg="Effect of mutation residue number %s from chain %s (%s) into %s: "
                                           "Energy difference: %.4f kcal/mol, Difference in average score from the "
                                           "base case: %.4f"
                                           "" % (mutation[2:-1], mutation[-1], _aa_dict_F[mutation[0]],
                                                 _aa_dict_F[mutation[1]], mutation_energy, score_diff))
    by_energy = sorted(summary.items(), key=lambda entry: entry[1][0])   # sort by mutation energy
    with open(os.path.join(work_dir, output_file), "w") as f:
        f.write("%s,%s,%s,%s\n" % ("Mutation", "EnergyDiff", "AvgScore", "AvgScoreDiff"))
        for mutation, (energy, average, difference) in by_energy:
            f.write("%s,%.4f,%.4f,%.4f\n" % (mutation, energy, average, difference))
    return rejected


def _cleanup(work_dir, mutation_list):
    """
    Flatten the per-mutation directories into work_dir.
    A run that left both A3D.csv and output.pdb gets them moved to <mutation>.csv and <mutation>.pdb.
    Otherwise its Aggrescan.error is moved to <mutation>.error, or a generic <mutation>.error is written if no
    error log exists. The mutation directory is removed whenever files were moved out of it.
    """
    for mutation in mutation_list:
        run_dir = join(work_dir, mutation)
        scores_file = join(run_dir, "A3D.csv")
        structure_file = join(run_dir, "output.pdb")
        if isfile(scores_file) and isfile(structure_file):
            shutil.move(scores_file, join(work_dir, "%s%s" % (mutation, ".csv")))
            shutil.move(structure_file, join(work_dir, "%s%s" % (mutation, ".pdb")))
            shutil.rmtree(run_dir)
            continue

        error_log = join(run_dir, "Aggrescan.error")
        error_target = join(work_dir, "%s%s" % (mutation, ".error"))
        if isfile(error_log):
            shutil.move(error_log, error_target)
            shutil.rmtree(run_dir)
            continue

        # Nothing usable was left by the run, so leave a generic message (the directory is not touched here)
        with open(error_target, "w") as f:
            f.write("The mutation has failed and no error log was created during the simulation. "
                    "This is unexpected and if you require further assistance please contact us or leave a bug "
                    "report on our bitbucket at "
                    "https://bitbucket.org/lcbio/aggrescan3d/issues?status=new&status=open")


def _plots(work_dir):
    """
    Create the automated mutation plot for work_dir by handing it over to make_auto_mut_plot
    """
    make_auto_mut_plot(work_dir)


# One-letter amino acid code -> full name, used to spell out residues in the log messages
# This is a copy from somewhere else, maybe should put it somewhere for imports
_aa_dict_F = {'A': 'alanine', 'R': 'arginine', 'N': 'asparagine',
              'D': 'aspartic acid', 'C': 'cysteine', 'E': 'glutamic acid',
              'Q': 'glutamine', 'G': 'glycine', 'H': 'histidine',
              'I': 'isoleucine', 'L': 'leucine', 'K': 'lysine',
              'M': 'methionine', 'F': 'phenylalanine', 'P': 'proline',
              'S': 'serine', 'T': 'threonine', 'W': 'tryptophan',
              'Y': 'tyrosine', 'V': 'valine', 'X': 'unknown'}