Spaces:

aakash0017
/

DrVai-Rag-Testing

No application file

App Files Files Community

DrVai-Rag-Testing / myenv /lib /python3.10 /site-packages /Bio /motifs /meme.py

aakash0017

Upload folder using huggingface_hub

b7731cd over 1 year ago

raw

history blame contribute delete

6.8 kB

	# Copyright 2008 by Bartek Wilczynski.
	# Revisions copyright 2019 by Victor Lin.
	# Adapted from Bio.MEME.Parser by Jason A. Hackney. All rights reserved.
	# This code is part of the Biopython distribution and governed by its
	# license. Please see the LICENSE file that should have been included
	# as part of this package.
	"""Module for the support of MEME motif format."""

	import xml.etree.ElementTree as ET

	from Bio import Seq
	from Bio import motifs


	def read(handle):
	    """Parse the XML output of the MEME program into a meme.Record object.

	    The input is parsed with xml.etree.ElementTree. The run metadata, the
	    alphabet, the ids of the training-set sequences and the motifs found
	    in the XML tree are then stored in a newly created Record.

	    Arguments:
	     - handle - source of the MEME XML output, passed unchanged to
	       ``xml.etree.ElementTree.parse``.

	    Returns the populated Record.

	    Raises ValueError if the input cannot be parsed as XML; the underlying
	    ParseError is attached as the cause of the ValueError.

	    Examples
	    --------
	    >>> from Bio.motifs import meme
	    >>> with open("motifs/meme.INO_up800.classic.oops.xml") as f:
	    ...     record = meme.read(f)
	    >>> for motif in record:
	    ...     for instance in motif.instances:
	    ...         print(instance.motif_name, instance.sequence_name, instance.sequence_id, instance.strand, instance.pvalue)
	    GSKGCATGTGAAA INO1 sequence_5 + 1.21e-08
	    GSKGCATGTGAAA FAS1 sequence_2 - 1.87e-08
	    GSKGCATGTGAAA ACC1 sequence_4 - 6.62e-08
	    GSKGCATGTGAAA CHO2 sequence_1 - 1.05e-07
	    GSKGCATGTGAAA CHO1 sequence_0 - 1.69e-07
	    GSKGCATGTGAAA FAS2 sequence_3 - 5.62e-07
	    GSKGCATGTGAAA OPI3 sequence_6 + 1.08e-06
	    TTGACWCYTGCYCWG CHO2 sequence_1 + 7.2e-10
	    TTGACWCYTGCYCWG OPI3 sequence_6 - 2.56e-08
	    TTGACWCYTGCYCWG ACC1 sequence_4 - 1.59e-07
	    TTGACWCYTGCYCWG CHO1 sequence_0 + 2.05e-07
	    TTGACWCYTGCYCWG FAS1 sequence_2 + 3.85e-07
	    TTGACWCYTGCYCWG FAS2 sequence_3 - 5.11e-07
	    TTGACWCYTGCYCWG INO1 sequence_5 + 8.01e-07

	    """
	    record = Record()
	    try:
	        xml_tree = ET.parse(handle)
	    except ET.ParseError as err:
	        # Chain the parser error explicitly so its details (message and
	        # position) stay attached to the ValueError seen by the caller.
	        raise ValueError(
	            "Improper MEME XML input file. XML root tag should start with <MEME version= ..."
	        ) from err
	    __read_metadata(record, xml_tree)
	    __read_alphabet(record, xml_tree)
	    sequence_id_name_map = __get_sequence_id_name_map(xml_tree)
	    # Iterating a dict yields its keys, i.e. the sequence ids.
	    record.sequences = list(sequence_id_name_map)
	    __read_motifs(record, xml_tree, sequence_id_name_map)
	    return record


	class Motif(motifs.Motif):
	    """Motif subclass used when parsing MEME (and MAST) output.

	    On top of the generic motif it carries data specific to MEME motifs:
	    the motif name and identifiers, the E-value of the motif and its
	    number of occurrences.
	    """

	    def __init__(self, alphabet=None, instances=None):
	        """Initialize the base motif, then reset the MEME-specific fields."""
	        super().__init__(alphabet, instances)
	        # Identification of the motif.
	        self.id = None
	        self.alt_id = None
	        self.name = None
	        # Statistics of the motif.
	        self.num_occurrences = 0
	        self.evalue = 0.0


	class Instance(Seq.Seq):
	    """A single instance of a MEME motif, together with its annotations."""

	    def __init__(self, args, *kwds):
	        """Build the underlying Seq and give every annotation its default.

	        ``args`` and any further positional values collected in ``kwds``
	        are forwarded unchanged to the Seq constructor.
	        """
	        super().__init__(args, *kwds)
	        # Which motif this instance belongs to.
	        self.motif_name = ""
	        # Which sequence the instance was found in.
	        self.sequence_id = ""
	        self.sequence_name = ""
	        # Where and how well it matches.
	        self.strand = 0
	        self.start = 0
	        self.length = 0
	        self.pvalue = 1.0


	class Record(list):
	    """Container for the results of a MEME run.

	    A meme.Record is a list of motifs that additionally stores a few
	    attributes describing the run: version, datafile, command, alphabet
	    and sequences.

	    Since meme.Record inherits from list, individual motifs can be
	    accessed by their index. A motif can also be looked up by its name:

	    >>> from Bio import motifs
	    >>> with open("motifs/meme.INO_up800.classic.oops.xml") as f:
	    ...     record = motifs.parse(f, 'MEME')
	    >>> motif = record[0]
	    >>> print(motif.name)
	    GSKGCATGTGAAA
	    >>> motif = record['GSKGCATGTGAAA']
	    >>> print(motif.name)
	    GSKGCATGTGAAA
	    """

	    def __init__(self):
	        """Start with empty run metadata."""
	        self.sequences = []
	        self.alphabet = ""
	        self.command = ""
	        self.datafile = ""
	        self.version = ""

	    def __getitem__(self, key):
	        """Return the motif named key (str) or stored at list index key.

	        A str key is compared against the ``name`` of each stored motif and
	        the first match is returned; None is returned when no motif has
	        that name. Any other key is handled by the plain list lookup.
	        """
	        if not isinstance(key, str):
	            return list.__getitem__(self, key)
	        return next((entry for entry in self if entry.name == key), None)


	# Everything below is private


	def __read_metadata(record, xml_tree):
	    """Copy the version, data file and command line from the XML tree to record.

	    The version is the "version" attribute of the root element, the data
	    file is the "primary_sequences" attribute of <training_set>, and the
	    command is the text of <command_line> inside <model>.
	    """
	    root = xml_tree.getroot()
	    record.version = root.get("version")

	    training_set = xml_tree.find("training_set")
	    record.datafile = training_set.get("primary_sequences")

	    model = xml_tree.find("model")
	    record.command = model.find("command_line").text
	    # TODO - background_frequencies, other metadata under model


	def __read_alphabet(record, xml_tree):
	    """Append the letters of the alphabet listed in the XML tree to record.alphabet.

	    The letters are the "letter_id" attributes of the <value> elements found
	    under training_set / letter_frequencies / alphabet_array, in document
	    order.
	    """
	    training_set = xml_tree.find("training_set")
	    frequencies = training_set.find("letter_frequencies")
	    alphabet_array = frequencies.find("alphabet_array")
	    record.alphabet += "".join(
	        entry.get("letter_id") for entry in alphabet_array.findall("value")
	    )


	def __get_sequence_id_name_map(xml_tree):
	    """Return a dict mapping the "id" of each training-set sequence to its "name".

	    One entry is made per <sequence> element found under <training_set>.
	    """
	    training_set = xml_tree.find("training_set")
	    id_to_name = {}
	    for sequence_node in training_set.findall("sequence"):
	        id_to_name[sequence_node.get("id")] = sequence_node.get("name")
	    return id_to_name


	def __read_motifs(record, xml_tree, sequence_id_name_map):
	    """Append one Motif to record for every <motif> element under <motifs>.

	    Every <contributing_site> of a motif becomes an Instance whose sequence
	    is spelled by the "letter_id" attributes of the site's <letter_ref>
	    elements. sequence_id_name_map is used to translate the "sequence_id"
	    of a site into the sequence name.
	    """
	    for motif_node in xml_tree.find("motifs").findall("motif"):
	        motif_name = motif_node.get("name")
	        site_nodes = motif_node.find("contributing_sites").findall(
	            "contributing_site"
	        )
	        sites = []
	        for site_node in site_nodes:
	            sequence = "".join(
	                ref.get("letter_id")
	                for ref in site_node.find("site").findall("letter_ref")
	            )
	            site = Instance(sequence)
	            site.motif_name = motif_name
	            sequence_id = site_node.get("sequence_id")
	            site.sequence_id = sequence_id
	            site.sequence_name = sequence_id_name_map[sequence_id]
	            # TODO - left flank, right flank
	            # The stored start is the XML "position" value plus one.
	            site.start = int(site_node.get("position")) + 1
	            site.pvalue = float(site_node.get("pvalue"))
	            site.strand = __convert_strand(site_node.get("strand"))
	            site.length = len(sequence)
	            sites.append(site)

	        alphabet = record.alphabet
	        motif = Motif(alphabet, motifs.Instances(sites, alphabet))
	        motif.id = motif_node.get("id")
	        motif.name = motif_name
	        motif.alt_id = motif_node.get("alt")
	        motif.length = int(motif_node.get("width"))
	        motif.num_occurrences = int(motif_node.get("sites"))
	        motif.evalue = float(motif_node.get("e_value"))
	        # TODO - ic, re, llr, pvalue, bayes_threshold, elapsed_time
	        record.append(motif)


	def __convert_strand(strand):
	    """Convert strand (+/-) from XML if present.

	    Arguments:
	     - strand - value of the "strand" attribute of a site ("plus", "minus"
	       or "none"), or None when the attribute is absent.

	    Returns "-" for "minus" and "+" for every other value.

	    Default: +
	    """
	    if strand == "minus":
	        return "-"
	    # "plus", "none", a missing attribute (None) and any unrecognised value
	    # all map to the documented default, rather than falling through and
	    # implicitly returning None.
	    return "+"


	if __name__ == "__main__":
	    # Only when this file is executed as a script: call Biopython's
	    # run_doctest helper (presumably runs the doctests in the docstrings
	    # of this module - confirm against Bio._utils).
	    from Bio._utils import run_doctest

	    run_doctest()