# aakash0017's picture
# Upload folder using huggingface_hub
# b7731cd
# Copyright 2008 by Bartek Wilczynski.
# Revisions copyright 2019 by Victor Lin.
# Adapted from Bio.MEME.Parser by Jason A. Hackney. All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
"""Module for the support of MEME motif format."""
import xml.etree.ElementTree as ET
from Bio import Seq
from Bio import motifs
def read(handle):
    """Parse the XML output of the MEME program into a meme.Record object.

    Parameters
    ----------
    handle
        The MEME XML input; it is passed unchanged to
        ``xml.etree.ElementTree.parse``.

    Returns
    -------
    Record
        A record holding the run metadata, the alphabet, the ids of the
        training-set sequences and one Motif per ``<motif>`` element.

    Raises
    ------
    ValueError
        If the input cannot be parsed as XML. The underlying
        ``ParseError`` is attached as ``__cause__``.

    Examples
    --------
    >>> from Bio.motifs import meme
    >>> with open("motifs/meme.INO_up800.classic.oops.xml") as f:
    ...     record = meme.read(f)
    >>> for motif in record:
    ...     for instance in motif.instances:
    ...         print(instance.motif_name, instance.sequence_name, instance.sequence_id, instance.strand, instance.pvalue)
    GSKGCATGTGAAA INO1 sequence_5 + 1.21e-08
    GSKGCATGTGAAA FAS1 sequence_2 - 1.87e-08
    GSKGCATGTGAAA ACC1 sequence_4 - 6.62e-08
    GSKGCATGTGAAA CHO2 sequence_1 - 1.05e-07
    GSKGCATGTGAAA CHO1 sequence_0 - 1.69e-07
    GSKGCATGTGAAA FAS2 sequence_3 - 5.62e-07
    GSKGCATGTGAAA OPI3 sequence_6 + 1.08e-06
    TTGACWCYTGCYCWG CHO2 sequence_1 + 7.2e-10
    TTGACWCYTGCYCWG OPI3 sequence_6 - 2.56e-08
    TTGACWCYTGCYCWG ACC1 sequence_4 - 1.59e-07
    TTGACWCYTGCYCWG CHO1 sequence_0 + 2.05e-07
    TTGACWCYTGCYCWG FAS1 sequence_2 + 3.85e-07
    TTGACWCYTGCYCWG FAS2 sequence_3 - 5.11e-07
    TTGACWCYTGCYCWG INO1 sequence_5 + 8.01e-07

    """
    # Parse first: nothing else is worth building if the XML is malformed.
    try:
        xml_tree = ET.parse(handle)
    except ET.ParseError as err:
        # Chain the original error so the parser's line/column details
        # stay visible in the traceback.
        raise ValueError(
            "Improper MEME XML input file. XML root tag should start with <MEME version= ..."
        ) from err

    record = Record()
    __read_metadata(record, xml_tree)
    __read_alphabet(record, xml_tree)
    sequence_id_name_map = __get_sequence_id_name_map(xml_tree)
    # The record stores the sequence ids (the keys of the map), not the names.
    record.sequences = list(sequence_id_name_map)
    __read_motifs(record, xml_tree, sequence_id_name_map)
    return record
class Motif(motifs.Motif):
    """MEME-flavoured motif used when parsing MEME (and MAST) output.

    On top of the base motif it carries the data MEME reports per motif:
    the motif name and identifiers, its E-value and its number of
    occurrences.
    """

    def __init__(self, alphabet=None, instances=None):
        """Initialize the class."""
        super().__init__(alphabet, instances)
        # Placeholder defaults for the MEME-specific attributes.
        self.evalue = 0.0
        self.num_occurrences = 0
        self.name = self.id = self.alt_id = None
class Instance(Seq.Seq):
    """One occurrence of a MEME motif, together with the data describing it."""

    def __init__(self, *args, **kwds):
        """Initialize the class."""
        super().__init__(*args, **kwds)
        # Placeholder defaults for the site annotations.
        self.sequence_name = self.sequence_id = ""
        self.start = 0
        self.pvalue = 1.0
        self.strand = 0
        self.length = 0
        self.motif_name = ""
class Record(list):
    """Container for the results of a MEME run.

    A meme.Record is a list of motifs that additionally stores the MEME
    version, the data file, the command line, the alphabet and the
    sequences of the run.

    Because meme.Record inherits from list, individual motifs can be
    accessed by their index. Alternatively, a motif can be looked up by
    its name; a name that matches no motif yields None:

    >>> from Bio import motifs
    >>> with open("motifs/meme.INO_up800.classic.oops.xml") as f:
    ...     record = motifs.parse(f, 'MEME')
    >>> motif = record[0]
    >>> print(motif.name)
    GSKGCATGTGAAA
    >>> motif = record['GSKGCATGTGAAA']
    >>> print(motif.name)
    GSKGCATGTGAAA

    """

    def __init__(self):
        """Initialize the class."""
        self.version = ""
        self.datafile = ""
        self.command = ""
        self.alphabet = ""
        self.sequences = []

    def __getitem__(self, key):
        """Return the motif at index key, or the motif named key."""
        # Anything that is not a string (int, slice, ...) is ordinary
        # list indexing.
        if not isinstance(key, str):
            return list.__getitem__(self, key)
        # String keys select the first motif with that name; None if absent.
        return next((motif for motif in self if motif.name == key), None)
# Everything below is private
def __read_metadata(record, xml_tree):
    """Copy the version, training-set file and command line onto record."""
    root = xml_tree.getroot()
    record.version = root.get("version")

    training_set = xml_tree.find("training_set")
    record.datafile = training_set.get("primary_sequences")

    model = xml_tree.find("model")
    record.command = model.find("command_line").text
    # TODO - background_frequencies, other metadata under model
def __read_alphabet(record, xml_tree):
    """Append each letter id of the training set's alphabet array to record.alphabet."""
    training_set = xml_tree.find("training_set")
    letter_frequencies = training_set.find("letter_frequencies")
    alphabet_array = letter_frequencies.find("alphabet_array")
    for entry in alphabet_array.findall("value"):
        letter = entry.get("letter_id")
        record.alphabet = record.alphabet + letter
def __get_sequence_id_name_map(xml_tree):
    """Return a dict mapping each training-set sequence id to its name."""
    id_to_name = {}
    for sequence_node in xml_tree.find("training_set").findall("sequence"):
        id_to_name[sequence_node.get("id")] = sequence_node.get("name")
    return id_to_name
def __read_motifs(record, xml_tree, sequence_id_name_map):
    """Build a Motif from every <motif> element and append it to record.

    Each contributing site of a motif becomes an Instance whose sequence
    is spelled out by the letter ids of the site; sequence_id_name_map is
    used to translate the site's sequence id into a sequence name.
    """
    for motif_node in xml_tree.find("motifs").findall("motif"):
        # The motif name is the same for every site, so look it up once.
        motif_name = motif_node.get("name")
        site_nodes = motif_node.find("contributing_sites").findall(
            "contributing_site"
        )

        sites = []
        for site_node in site_nodes:
            sequence = "".join(
                letter_ref.get("letter_id")
                for letter_ref in site_node.find("site").findall("letter_ref")
            )
            site = Instance(sequence)
            site.motif_name = motif_name
            site.sequence_id = site_node.get("sequence_id")
            site.sequence_name = sequence_id_name_map[site.sequence_id]
            # TODO - left flank, right flank
            # The stored start is the XML position shifted up by one.
            site.start = int(site_node.get("position")) + 1
            site.pvalue = float(site_node.get("pvalue"))
            site.strand = __convert_strand(site_node.get("strand"))
            site.length = len(sequence)
            sites.append(site)

        motif = Motif(record.alphabet, motifs.Instances(sites, record.alphabet))
        motif.id = motif_node.get("id")
        motif.name = motif_name
        motif.alt_id = motif_node.get("alt")
        motif.length = int(motif_node.get("width"))
        motif.num_occurrences = int(motif_node.get("sites"))
        motif.evalue = float(motif_node.get("e_value"))
        # TODO - ic, re, llr, pvalue, bayes_threshold, elapsed_time
        record.append(motif)
def __convert_strand(strand):
    """Convert the strand value of a MEME XML site to "+" or "-".

    "minus" maps to "-". Every other value maps to the default "+":
    "plus", "none", and also None (strand attribute absent) or any
    unrecognised string, which previously fell through and returned None.
    """
    return "-" if strand == "minus" else "+"
if __name__ == "__main__":
    # Script entry point: call Biopython's run_doctest helper
    # (presumably runs the doctests embedded in this module - confirm).
    from Bio._utils import run_doctest
    run_doctest()