File size: 6,803 Bytes
b7731cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
# Copyright 2008 by Bartek Wilczynski.
# Revisions copyright 2019 by Victor Lin.
# Adapted from  Bio.MEME.Parser by Jason A. Hackney.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.
"""Module for the support of MEME motif format."""

import xml.etree.ElementTree as ET

from Bio import Seq
from Bio import motifs


def read(handle):
    """Parse the XML output of the MEME program into a meme.Record object.

    The handle is passed directly to xml.etree.ElementTree.parse. The
    returned Record carries the MEME version, data file, command line,
    alphabet and sequence ids read from the XML, and contains one Motif
    per <motif> element.

    Raises ValueError (chained to the underlying ParseError) if the input
    cannot be parsed as XML.

    Examples
    --------
    >>> from Bio.motifs import meme
    >>> with open("motifs/meme.INO_up800.classic.oops.xml") as f:
    ...     record = meme.read(f)
    >>> for motif in record:
    ...     for instance in motif.instances:
    ...         print(instance.motif_name, instance.sequence_name, instance.sequence_id, instance.strand, instance.pvalue)
    GSKGCATGTGAAA INO1 sequence_5 + 1.21e-08
    GSKGCATGTGAAA FAS1 sequence_2 - 1.87e-08
    GSKGCATGTGAAA ACC1 sequence_4 - 6.62e-08
    GSKGCATGTGAAA CHO2 sequence_1 - 1.05e-07
    GSKGCATGTGAAA CHO1 sequence_0 - 1.69e-07
    GSKGCATGTGAAA FAS2 sequence_3 - 5.62e-07
    GSKGCATGTGAAA OPI3 sequence_6 + 1.08e-06
    TTGACWCYTGCYCWG CHO2 sequence_1 + 7.2e-10
    TTGACWCYTGCYCWG OPI3 sequence_6 - 2.56e-08
    TTGACWCYTGCYCWG ACC1 sequence_4 - 1.59e-07
    TTGACWCYTGCYCWG CHO1 sequence_0 + 2.05e-07
    TTGACWCYTGCYCWG FAS1 sequence_2 + 3.85e-07
    TTGACWCYTGCYCWG FAS2 sequence_3 - 5.11e-07
    TTGACWCYTGCYCWG INO1 sequence_5 + 8.01e-07

    """
    record = Record()
    try:
        xml_tree = ET.parse(handle)
    except ET.ParseError as err:
        # Chain explicitly so the original parser diagnostics (line/column)
        # stay attached to the ValueError as its __cause__.
        raise ValueError(
            "Improper MEME XML input file. XML root tag should start with <MEME version= ..."
        ) from err
    __read_metadata(record, xml_tree)
    __read_alphabet(record, xml_tree)
    sequence_id_name_map = __get_sequence_id_name_map(xml_tree)
    # Iterating a dict yields its keys, i.e. the sequence ids.
    record.sequences = list(sequence_id_name_map)
    __read_motifs(record, xml_tree, sequence_id_name_map)
    return record


class Motif(motifs.Motif):
    """Motif subclass carrying the extra fields reported by MEME (and MAST).

    On top of the base motifs.Motif it stores the motif identifiers
    (id, alt_id, name), the motif E-value and the number of occurrences.
    """

    def __init__(self, alphabet=None, instances=None):
        """Initialize the class."""
        super().__init__(alphabet, instances)
        # Identifiers; left as None until a parser fills them in.
        self.id = None
        self.alt_id = None
        self.name = None
        # Statistics for the motif.
        self.num_occurrences = 0
        self.evalue = 0.0


class Instance(Seq.Seq):
    """A single occurrence (site) of a MEME motif plus its annotations."""

    def __init__(self, *args, **kwargs):
        """Initialize the class."""
        super().__init__(*args, **kwargs)
        # Which motif this site belongs to.
        self.motif_name = ""
        # Which input sequence the site was found in.
        self.sequence_id = ""
        self.sequence_name = ""
        # Placement and score of the site; defaults until a parser sets them.
        self.strand = 0
        self.start = 0
        self.length = 0
        self.pvalue = 1.0


class Record(list):
    """Container for the results of a MEME run.

    A meme.Record is a list of motifs that additionally stores the MEME
    version, the data file, the command line, the alphabet and the input
    sequences of the run.

    Because meme.Record inherits from list, individual motifs can be
    retrieved by index. A motif can also be looked up by its name:

    >>> from Bio import motifs
    >>> with open("motifs/meme.INO_up800.classic.oops.xml") as f:
    ...     record = motifs.parse(f, 'MEME')
    >>> motif = record[0]
    >>> print(motif.name)
    GSKGCATGTGAAA
    >>> motif = record['GSKGCATGTGAAA']
    >>> print(motif.name)
    GSKGCATGTGAAA
    """

    def __init__(self):
        """Initialize the class."""
        super().__init__()
        self.sequences = []
        self.alphabet = ""
        self.command = ""
        self.datafile = ""
        self.version = ""

    def __getitem__(self, key):
        """Return the motif at index key, or the motif named key.

        A string key selects the first motif whose name equals key; None is
        returned when no motif has that name. Any other key (integer or
        slice) is handled by regular list indexing.
        """
        if not isinstance(key, str):
            return super().__getitem__(key)
        return next((motif for motif in self if motif.name == key), None)


# Everything below is private


def __read_metadata(record, xml_tree):
    """Copy the run-level metadata from the MEME XML tree onto record.

    Sets record.version from the root element's "version" attribute,
    record.datafile from the "primary_sequences" attribute of
    <training_set>, and record.command from the text of
    <model>/<command_line>.
    """
    root = xml_tree.getroot()
    record.version = root.get("version")
    training_set = xml_tree.find("training_set")
    record.datafile = training_set.get("primary_sequences")
    model = xml_tree.find("model")
    record.command = model.find("command_line").text
    # TODO - background_frequencies, other metadata under model


def __read_alphabet(record, xml_tree):
    """Append the alphabet letters found in the MEME XML tree to record.alphabet.

    The letters are the "letter_id" attributes of the <value> elements under
    <training_set>/<letter_frequencies>/<alphabet_array>, in document order.
    """
    training_set = xml_tree.find("training_set")
    letter_frequencies = training_set.find("letter_frequencies")
    alphabet_array = letter_frequencies.find("alphabet_array")
    record.alphabet += "".join(
        entry.get("letter_id") for entry in alphabet_array.findall("value")
    )


def __get_sequence_id_name_map(xml_tree):
    """Return a dict mapping each training-set sequence id to its name.

    Both values are read from the "id" and "name" attributes of the
    <sequence> elements directly under <training_set>.
    """
    id_to_name = {}
    for sequence_node in xml_tree.find("training_set").findall("sequence"):
        id_to_name[sequence_node.get("id")] = sequence_node.get("name")
    return id_to_name


def __read_motifs(record, xml_tree, sequence_id_name_map):
    """Build a Motif for every <motif> element and append it to record.

    For each motif, every <contributing_site> is turned into an Instance
    whose sequence is assembled from the "letter_id" attributes of its
    <site>/<letter_ref> children. sequence_id_name_map is used to translate
    the site's "sequence_id" attribute into a sequence name.
    """
    motif_nodes = xml_tree.find("motifs").findall("motif")
    for motif_node in motif_nodes:
        site_nodes = motif_node.find("contributing_sites").findall(
            "contributing_site"
        )
        site_instances = []
        for site_node in site_nodes:
            residues = "".join(
                ref.get("letter_id")
                for ref in site_node.find("site").findall("letter_ref")
            )
            hit = Instance(residues)
            hit.motif_name = motif_node.get("name")
            seq_id = site_node.get("sequence_id")
            hit.sequence_id = seq_id
            hit.sequence_name = sequence_id_name_map[seq_id]
            # TODO - left flank, right flank
            # NOTE(review): the +1 presumably converts a 0-based XML position
            # into a 1-based start - confirm against the MEME XML format.
            hit.start = int(site_node.get("position")) + 1
            hit.pvalue = float(site_node.get("pvalue"))
            hit.strand = __convert_strand(site_node.get("strand"))
            hit.length = len(residues)
            site_instances.append(hit)
        motif = Motif(
            record.alphabet, motifs.Instances(site_instances, record.alphabet)
        )
        motif.id = motif_node.get("id")
        motif.name = motif_node.get("name")
        motif.alt_id = motif_node.get("alt")
        motif.length = int(motif_node.get("width"))
        motif.num_occurrences = int(motif_node.get("sites"))
        motif.evalue = float(motif_node.get("e_value"))
        # TODO - ic, re, llr, pvalue, bayes_threshold, elapsed_time
        record.append(motif)


def __convert_strand(strand):
    """Convert the strand value from the MEME XML into "+" or "-".

    "minus" maps to "-". "plus" and "none" map to "+", and so does any
    other value, including None when the strand attribute is absent.

    Default: +
    """
    # Only an explicit "minus" selects the reverse strand; everything else
    # falls back to the documented default instead of silently returning None.
    return "-" if strand == "minus" else "+"


# When this module is executed directly as a script (not imported), call
# Bio._utils.run_doctest(). The import is kept local so it only happens
# in that case.
if __name__ == "__main__":
    from Bio._utils import run_doctest

    run_doctest()