File size: 10,206 Bytes
a3f3d91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import multiprocessing as mp
from subprocess import Popen, PIPE
from collections import OrderedDict
from os.path import join, isfile
from .postProcessing import make_auto_mut_plot
import os
import re
import shutil
from . import logger

_name = "Auto_mut"  # Module name passed to every logger call made from this file

# Tunable settings that control the residue selection and the evaluation of the results - maybe they get to be
# parameters in the future
_score_threshold = -0.2  # A residue is considered for mutation only when its A3D score is above this value
_energy_threshold = 0.0   # A mutation with an energy difference above this value is reported as unnecessary
_score_diff_threshold = 5  # A mutation whose average score rises by more than this is reported as unnecessary;
                           # +5 should accept all mutations while 0 would already be quite restrictive
_target_mutations = ["E", "K", "D", "R"]  # Glutamic acid, lysine, aspartic acid, arginine

# The slicing in this code can be hard to read, so here is a short summary:
# Chain IDs and residue names are single letters, while the residue number can have more than one character
# The final mutation code is <Old residue><New residue><Residue number><Chain ID>
# Entries of the auto-mutation to_exclude argument come as <Residue number><Chain ID>
# Labels parsed from the A3D.csv file come as <Old residue><Residue number><Chain ID>


def run_auto_mutation(work_dir, options, foldx_loc, distance, ph):
    """
    Run the automated mutation procedure for a finished job located in work_dir.

    options is a sequence of: number of residues to mutate, number of worker processes and (optionally) a list of
    residues to exclude, each given as <Residue number><Chain ID>.
    foldx_loc, distance and ph are forwarded unchanged (distance as a string) to every single-mutation run.

    The results are summarized in <work_dir>/Mutations_summary.csv. If no residue qualifies for a mutation an empty
    summary file is left behind and nothing else is done.
    Any exception raised while plotting is logged and re-raised.
    """
    n_mutations = options[0]
    n_processes = options[1]
    to_exclude = options[2] if len(options) > 2 else []     # The exclusion list is optional

    mutations, avg_score = _mutation_list(work_dir=work_dir, excluded_list=to_exclude, n_mutations=n_mutations)
    if not mutations:
        with open(join(work_dir, "Mutations_summary.csv"), "w"):   # leave an empty file for the server
            pass
        return

    # Start the workers only when there is something to do and always release them afterwards, so no idle
    # processes are left behind (also when one of the jobs raises)
    pool = mp.Pool(n_processes)
    try:
        pool.map(_run_job, [(i, foldx_loc, work_dir, str(distance), ph) for i in mutations])
    finally:
        pool.close()
        pool.join()

    _analyze_results(work_dir=work_dir, output_file="Mutations_summary.csv", mutation_list=mutations,
                     base_avg_score=avg_score)
    _cleanup(work_dir=work_dir, mutation_list=mutations)
    try:
        _plots(work_dir=work_dir)
    except Exception:   # This is hopefully not needed but in case something happens the user will at least see a
                        # message in the log before the traceback
        logger.critical(module_name=_name, msg="It seems that all the mutation attempts failed or some other unexpected"
                                               " error arisen while trying to plot the automated mutations.")
        raise


def _mutation_list(work_dir, excluded_list, n_mutations):
    """
    Select the residues to mutate and build the list of mutation codes to run.

    The per-residue scores are read from <work_dir>/A3D.csv, highest score first. A residue is selected when its score
    is above _score_threshold and different from 0, it is not listed in excluded_list (entries formatted as
    <Residue number><Chain ID>) and it is not already one of the _target_mutations residues. For every selected
    residue one mutation code (<Old residue><New residue><Residue number><Chain ID>) is created per target residue.
    The selection stops as soon as n_mutations residues have been picked.

    Returns a tuple of (list of mutation codes, average score of the input structure). If no scores could be read
    the problem is logged and ([], 0.0) is returned.
    """
    csv_path = os.path.join(work_dir, "A3D.csv")
    scores = _parse_a3dcsv(csv_path)
    if not scores:
        # _parse_a3dcsv gives False for an unreadable file and an empty dict for a file without data rows;
        # both would otherwise crash the average calculation below
        logger.critical(module_name=_name, msg="Couldn't read any residue scores from %s. No automated mutations "
                                               "performed." % csv_path)
        return [], 0.0
    avg_score = sum(scores.values())/len(scores)
    mutation_list = []
    counter = 0
    for residue, value in scores.items():
        if value <= _score_threshold:
            continue    # Not aggregation prone enough to be considered at all
        residue_id = residue[1:]    # <Residue number><Chain ID>, the same format the exclusion list uses
        if residue_id in excluded_list:
            logger.info(module_name=_name, msg="Residue number %s from chain %s and a score of %.3f omitted "
                                               "from automated muatation (excluded by the user)." % (residue[1:-1], residue[-1], value))
        elif residue[0] not in _target_mutations and value != 0:
            mutation_list.extend(["%s%s%s" % (residue[0], i, residue_id) for i in _target_mutations])
            logger.info(module_name=_name, msg="Residue number %s from chain %s and a score of %.3f (%s) selected "
                                               "for automated muatation" % (residue[1:-1], residue[-1], value,
                                                                            _aa_dict_F[residue[0]]))
            counter += 1
            if counter >= n_mutations:
                break
    if not mutation_list:
        logger.critical(module_name=_name, msg="Couldn't find residues suitable for automated mutations (exceeding a "
                                               "threshold of %.2f). No automated mutations performed." % _score_threshold)
    return mutation_list, avg_score


def _parse_a3dcsv(filepath):  #TODO this is done on muttiple occasions so maybe should be unified somwhere
    """
    Return an OrderedDict of label:score type. The dict is sorted by score so highest is on top
    """
    pattern = re.compile(r"^(.*),(.*),(.*),(.*),(.*)$", re.M)
    scores = OrderedDict()
    try:
        with open(filepath, 'r') as f:
            data = pattern.findall(f.read().replace("\r", ""))[1:]  #
    except IOError:
        return False    # The mutation likely failed this should pass the info to analyze_results
    for line in data:
        label = line[3] + line[2] + line[1]  # One letter code + residue ID + chain ID (the mutation syntax)
        aggScore = float(line[4])
        scores[label] = aggScore
    scores = OrderedDict(sorted(list(scores.items()), key=lambda x: x[1], reverse=True))
    return scores


def _run_job(args):
    """
    Run a single A3D job with a specific mutation
    args go as follows: mutation code, FoldX location, main job's work dir, a3d distance argument (as a string),
    pH value (anything falsy, e.g. None, means that no pH is passed to the run)

    The job is started from inside work_dir. A non-zero exit code of the run is only logged as a warning.
    """
    mutation, foldx, work_dir, distance, ph = args
    os.chdir(work_dir)
    cmd = ["aggrescan", "-i", "output.pdb", "-v", "4", "-w", mutation, "-m", mutation, "-f", foldx,
           "--subprocess", "--distance", distance]
    if ph:
        # Pass the pH only when one was requested. The branches used to be swapped, which dropped a requested pH
        # and put a None into the command when none was given
        cmd.extend(["-ph", str(ph)])    # Popen needs every argument to be a string
    logger.info(module_name=_name, msg="Mutating residue number %s from chain %s (%s) into %s "
                                       " " % (mutation[2:-1], mutation[-1], _aa_dict_F[mutation[0]],
                                              _aa_dict_F[mutation[1]]))  # converting letters into full names
    proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
    proc.communicate()  # Wait for the run to finish, its output is not used here
    if proc.returncode != 0:
        logger.warning(module_name=_name, msg="Mutation %s could have failed (this can be ignored if the main program "
                                              "reports the energy difference). Simulation log for that run should be "
                                               "available at %s" % (mutation,
                                                                    os.path.join(work_dir, mutation, "Aggrescan.error")))


def _analyze_results(work_dir, output_file, mutation_list, base_avg_score):
    """
    Collect the outcome of every mutation run and write a summary to <work_dir>/<output_file>.

    For each mutation whose A3D.csv could be read, the energy difference (first value in its MutantEnergyDiff file),
    the average score and the difference to base_avg_score are logged and stored. Mutations without readable scores
    are skipped. The summary is written as CSV, sorted by energy difference in ascending order.

    Returns the list of mutations that exceed _energy_threshold or _score_diff_threshold.
    """
    rejected = []
    collected = OrderedDict()
    for code in mutation_list:
        residue_scores = _parse_a3dcsv(os.path.join(work_dir, code, "A3D.csv"))
        if not residue_scores:
            continue    # No scores for this mutation, nothing to report
        # This should be guaranteed to work given the check above
        with open(os.path.join(work_dir, code, "MutantEnergyDiff"), 'r') as handle:
            energy_diff = float(handle.read().split()[0])
        score_values = list(residue_scores.values())
        mean_score = sum(score_values)/len(score_values)
        score_shift = mean_score - base_avg_score
        collected[code] = [energy_diff, mean_score, score_shift]
        if energy_diff > _energy_threshold or score_shift > _score_diff_threshold:
            rejected.append(code)
        logger.info(module_name=_name, msg="Effect of mutation residue number %s from chain %s (%s) into %s: "
                                           "Energy difference: %.4f kcal/mol, Difference in average score from the "
                                           "base case: %.4f"
                                           "" % (code[2:-1], code[-1], _aa_dict_F[code[0]],
                                                 _aa_dict_F[code[1]], energy_diff, score_shift))

    ranked = sorted(collected.items(), key=lambda item: item[1][0])   # sort by mutation energy
    with open(os.path.join(work_dir, output_file), "w") as summary:
        summary.write("%s,%s,%s,%s\n" % ("Mutation", "EnergyDiff", "AvgScore", "AvgScoreDiff"))
        for code, values in ranked:
            summary.write("%s,%.4f,%.4f,%.4f\n" % (code, values[0], values[1], values[2]))
    return rejected


def _cleanup(work_dir, mutation_list):
    for mutation in mutation_list:
        if isfile(join(work_dir, mutation, "A3D.csv")) and isfile(join(work_dir, mutation, "output.pdb")):
            shutil.move(join(work_dir, mutation, "A3D.csv"), join(work_dir, "%s%s" % (mutation, ".csv")))
            shutil.move(join(work_dir, mutation, "output.pdb"), join(work_dir, "%s%s" % (mutation, ".pdb")))
            shutil.rmtree(join(work_dir, mutation))
        else:
            if isfile(join(work_dir, mutation, "Aggrescan.error")):
                shutil.move(join(work_dir, mutation, "Aggrescan.error"), join(work_dir, "%s%s" % (mutation, ".error")))
                shutil.rmtree(join(work_dir, mutation))
            else:
                with open(join(work_dir, "%s%s" %(mutation, ".error")), "w") as f:
                    f.write("The mutation has failed and no error log was created during the simulation. "
                            "This is unexpected and if you require further assistance please contact us or leave a bug "
                            "report on our bitbucket at "
                            "https://bitbucket.org/lcbio/aggrescan3d/issues?status=new&status=open")


def _plots(work_dir):
    """
    Create the automated mutation plot by handing work_dir over to postProcessing.make_auto_mut_plot
    """
    make_auto_mut_plot(work_dir)


# Maps one letter amino acid codes to their full names ('X' stands for an unknown residue); used in this module to
# build the log messages
# This is a copy from somewhere else, maybe should put it somewhere for imports
_aa_dict_F = {'A': 'alanine', 'R': 'arginine', 'N': 'asparagine',
              'D': 'aspartic acid', 'C': 'cysteine', 'E': 'glutamic acid',
              'Q': 'glutamine', 'G': 'glycine', 'H': 'histidine',
              'I': 'isoleucine', 'L': 'leucine', 'K': 'lysine',
              'M': 'methionine', 'F': 'phenylalanine', 'P': 'proline',
              'S': 'serine', 'T': 'threonine', 'W': 'tryptophan',
              'Y': 'tyrosine', 'V': 'valine', 'X': 'unknown'}