Spaces:
No application file
No application file
# Copyright (C) 2009 by Eric Talevich ([email protected])
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Classes corresponding to phyloXML elements.

See Also
--------
Official specification:
    http://phyloxml.org/
Journal article:
    Han and Zmasek (2009), https://doi.org/10.1186/1471-2105-10-356

"""
import re | |
import warnings | |
from Bio.Align import Alignment, MultipleSeqAlignment | |
from Bio.Seq import Seq | |
from Bio.SeqFeature import SeqFeature, SimpleLocation | |
from Bio.SeqRecord import SeqRecord | |
from Bio import BiopythonWarning | |
from Bio.Phylo import BaseTree | |
class PhyloXMLWarning(BiopythonWarning):
    """Warning category for data that violates the phyloXML specification.

    Issued through ``warnings.warn`` by this module's string validation
    helper; it is a warning, not an error, so processing continues.
    """
def _check_str(text, testfunc): | |
"""Check a string using testfunc, and warn if there's no match (PRIVATE).""" | |
if text is not None and not testfunc(text): | |
warnings.warn( | |
f"String {text} doesn't match the given regexp", | |
PhyloXMLWarning, | |
stacklevel=2, | |
) | |
# Core elements
class PhyloElement(BaseTree.TreeElement):
    """Common base class shared by all PhyloXML objects in this module.

    Defines no behaviour of its own beyond ``BaseTree.TreeElement``.
    """
class Phyloxml(PhyloElement):
    """Root node of the PhyloXML document.

    Holds any number of Phylogeny elements, optionally followed by elements
    from other namespaces.

    :Parameters:
        attributes : dict
            (XML namespace definitions)
        phylogenies : list
            The phylogenetic trees
        other : list
            Arbitrary non-phyloXML elements, if any

    """

    def __init__(self, attributes, phylogenies=None, other=None):
        """Store the namespace attributes, trees and foreign elements.

        The standard phyloXML namespace declarations are always present;
        entries in ``attributes`` are layered on top and win on conflict.
        """
        merged = {
            # standard
            "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",
            "xmlns": "http://www.phyloxml.org",
            "xsi:schemaLocation": "http://www.phyloxml.org http://www.phyloxml.org/1.10/phyloxml.xsd",
        }
        if attributes:
            merged.update(attributes)
        self.attributes = merged
        self.phylogenies = phylogenies or []
        self.other = other or []

    def __getitem__(self, index):
        """Look up a phylogeny by position, slice, or name.

        A string selects the first tree whose ``name`` equals it; an int or
        slice indexes the underlying list. Any other type, or an unknown
        name, raises ``KeyError``.
        """
        if isinstance(index, str):
            for candidate in self.phylogenies:
                if candidate.name == index:
                    return candidate
            raise KeyError(f"no phylogeny found with name {index!r}")
        if isinstance(index, (int, slice)):
            return self.phylogenies[index]
        raise KeyError(f"can't use {type(index)} as an index")

    def __iter__(self):
        """Yield the contained phylogenetic trees in order."""
        return iter(self.phylogenies)

    def __len__(self):
        """Count the contained phylogenetic trees."""
        return len(self.phylogenies)

    def __str__(self):
        """Render as the class name wrapping the string form of each tree."""
        listing = ",\n".join(str(tree) for tree in self.phylogenies)
        return f"{self.__class__.__name__}([{listing}])"
class Other(PhyloElement):
    """Container for non-phyloXML elements in the tree.

    An Other object normally carries either a 'value' or a non-empty list of
    'children', not both -- but nothing here enforces that.

    :Parameters:
        tag : string
            local tag for the XML node
        namespace : string
            XML namespace for the node -- should not be the default phyloXML
            namespace.
        attributes : dict of strings
            attributes on the XML node
        value : string
            text contained directly within this XML node
        children : list
            child nodes, if any (also ``Other`` instances)

    """

    def __init__(self, tag, namespace=None, attributes=None, value=None, children=None):
        """Record the node's tag, namespace, attributes, text and children.

        Falsy ``attributes`` / ``children`` are replaced by a fresh empty
        dict / list.
        """
        self.tag, self.namespace = tag, namespace
        self.value = value
        self.attributes = attributes or {}
        self.children = children or []

    def __iter__(self):
        """Yield the child nodes of this element (possibly none)."""
        return iter(self.children)
class Phylogeny(PhyloElement, BaseTree.Tree):
    """A phylogenetic tree.

    :Parameters:
        root : Clade
            the root node/clade of this tree
        rooted : bool
            True if this tree is rooted
        rerootable : bool
            True if this tree is rerootable
        branch_length_unit : string
            unit for branch_length values on clades
        type : string
            value of the ``type`` attribute; stored as given, not validated
        name : string
            identifier for this tree, not required to be unique
        id : Id
            unique identifier for this tree
        description : string
            plain-text description
        date : Date
            date for the root node of this tree
        confidences : list
            Confidence objects for this tree
        clade_relations : list
            CladeRelation objects
        sequence_relations : list
            SequenceRelation objects
        properties : list
            Property objects
        other : list
            non-phyloXML elements (type ``Other``)

    """

    def __init__(
        self,
        root=None,
        rooted=True,
        rerootable=None,
        branch_length_unit=None,
        type=None,
        # Child nodes
        name=None,
        id=None,
        description=None,
        date=None,
        # Collections
        confidences=None,
        clade_relations=None,
        sequence_relations=None,
        properties=None,
        other=None,
    ):
        """Initialize values for phylogenetic tree object.

        Collection arguments that are falsy (``None`` or empty) are replaced
        by a fresh empty list.
        """
        # Kept as an assertion so callers still see AssertionError on misuse.
        assert isinstance(rooted, bool)
        self.root = root
        self.rooted = rooted
        self.rerootable = rerootable
        self.branch_length_unit = branch_length_unit
        self.type = type
        self.name = name
        self.id = id
        self.description = description
        self.date = date
        self.confidences = confidences or []
        self.clade_relations = clade_relations or []
        self.sequence_relations = sequence_relations or []
        self.properties = properties or []
        self.other = other or []

    @classmethod
    def from_tree(cls, tree, **kwargs):
        """Create a new Phylogeny given a Tree (from Newick/Nexus or BaseTree).

        The root clade is converted with ``Clade.from_clade``; ``rooted`` and
        ``name`` are copied, and a non-``None`` ``tree.id`` is wrapped in an
        ``Id`` holding its string form.

        Keyword arguments are the usual ``Phylogeny`` constructor parameters.
        They are written straight into the instance dictionary, so names
        backed by a property (e.g. ``confidence``) do not go through their
        setter; pass ``confidences`` instead.
        """
        phy = cls(
            root=Clade.from_clade(tree.root),
            rooted=tree.rooted,
            name=tree.name,
            id=Id(str(tree.id)) if tree.id is not None else None,
        )
        phy.__dict__.update(kwargs)
        return phy

    @classmethod
    def from_clade(cls, clade, **kwargs):
        """Create a new Phylogeny given a Newick or BaseTree Clade object.

        The clade is converted with ``Clade.from_clade`` and wrapped via
        ``Clade.to_phylogeny``; keyword arguments are forwarded to that call
        and end up as attributes of the resulting phylogeny.
        """
        return Clade.from_clade(clade).to_phylogeny(**kwargs)

    def as_phyloxml(self):
        """Return this tree, a PhyloXML-compatible Phylogeny object.

        Overrides the ``BaseTree`` method.
        """
        return self

    def to_phyloxml_container(self, **kwargs):
        """Create a new Phyloxml object containing just this phylogeny.

        Keyword arguments become the container's ``attributes`` mapping.
        """
        return Phyloxml(kwargs, phylogenies=[self])

    def _aligned_sequences(self):
        """Iterate over the aligned Sequence objects in this tree (PRIVATE).

        A sequence qualifies when it has a ``mol_seq`` whose ``is_aligned``
        flag is truthy. Sequences without a ``mol_seq`` are skipped rather
        than raising ``AttributeError``.
        """

        def is_aligned_seq(elem):
            return (
                isinstance(elem, Sequence)
                and elem.mol_seq is not None
                and bool(elem.mol_seq.is_aligned)
            )

        return self._filter_search(is_aligned_seq, "preorder", True)

    def to_alignment(self):
        """Construct a MultipleSeqAlignment from the aligned sequences in this tree."""
        records = (seq.to_seqrecord() for seq in self._aligned_sequences())
        return MultipleSeqAlignment(records)

    @property
    def alignment(self):
        """Construct an Alignment object from the aligned sequences in this tree.

        Each record keeps the ungapped sequence (``-`` removed); the gapped
        strings are used only to infer the alignment coordinates. With no
        aligned sequences the coordinates are ``None``.
        """
        records = []
        lines = []
        for seq in self._aligned_sequences():
            record = seq.to_seqrecord()
            lines.append(str(record.seq))
            record.seq = record.seq.replace("-", "")
            records.append(record)
        coordinates = Alignment.infer_coordinates(lines) if lines else None
        return Alignment(records, coordinates)

    # Singular property for plural attribute
    def _get_confidence(self):
        """Equivalent to self.confidences[0] if there is only 1 value (PRIVATE).

        Returns ``None`` when the list is empty and raises ``AttributeError``
        when it holds more than one value.

        See Also: ``Clade.confidence``, ``Clade.taxonomy``

        """
        if len(self.confidences) == 0:
            return None
        if len(self.confidences) > 1:
            raise AttributeError(
                "more than 1 confidence value available; use Phylogeny.confidences"
            )
        return self.confidences[0]

    def _set_confidence(self, value):
        """Set the single confidence value (PRIVATE).

        ``None`` clears the list; a bare number is wrapped in ``Confidence``.
        Raises ``ValueError`` for any other type, or when more than one
        confidence is already stored.
        """
        if value is None:
            # Special case: mirror the behavior of _get_confidence
            self.confidences = []
            return
        if isinstance(value, (float, int)):
            value = Confidence(value)
        elif not isinstance(value, Confidence):
            raise ValueError("value must be a number or Confidence instance")
        if len(self.confidences) == 0:
            self.confidences.append(value)
        elif len(self.confidences) == 1:
            self.confidences[0] = value
        else:
            raise ValueError(
                "multiple confidence values already exist; "
                "use Phylogeny.confidences instead"
            )

    def _del_confidence(self):
        """Remove all confidence values (PRIVATE)."""
        self.confidences = []

    confidence = property(_get_confidence, _set_confidence, _del_confidence)
class Clade(PhyloElement, BaseTree.Clade):
    """Describes a branch of the current phylogenetic tree.

    Used recursively, describes the topology of a phylogenetic tree.

    Both ``color`` and ``width`` elements should be interpreted by client code
    as applying to the whole clade, including all descendants, unless
    overwritten in sub-clades. This module doesn't automatically assign these
    attributes to sub-clades to achieve this cascade -- and neither should you.

    :Parameters:
        branch_length
            parent branch length of this clade
        id_source
            link other elements to a clade (on the xml-level)
        name : string
            short label for this clade
        confidences : list of Confidence objects
            used to indicate the support for a clade/parent branch.
        width : float
            branch width for this clade (including branch from parent)
        color : BranchColor
            color used for graphical display of this clade
        node_id
            unique identifier for the root node of this clade
        taxonomies : list
            Taxonomy objects
        sequences : list
            Sequence objects
        events : Events
            describe such events as gene-duplications at the root node/parent
            branch of this clade
        binary_characters : BinaryCharacters
            binary characters
        distributions : list of Distribution objects
            distribution(s) of this clade
        date : Date
            a date for the root node of this clade
        references : list
            Reference objects
        properties : list
            Property objects
        clades : list Clade objects
            Sub-clades
        other : list of Other objects
            non-phyloXML objects

    """

    def __init__(
        self,
        # Attributes
        branch_length=None,
        id_source=None,
        # Child nodes
        name=None,
        width=None,
        color=None,
        node_id=None,
        events=None,
        binary_characters=None,
        date=None,
        # Collections
        confidences=None,
        taxonomies=None,
        sequences=None,
        distributions=None,
        references=None,
        properties=None,
        clades=None,
        other=None,
    ):
        """Initialize value for the Clade object.

        Collection arguments that are falsy (``None`` or empty) are replaced
        by a fresh empty list.
        """
        self.branch_length = branch_length
        self.id_source = id_source
        self.name = name
        self.width = width
        self.color = color
        self.node_id = node_id
        self.events = events
        self.binary_characters = binary_characters
        self.date = date
        self.confidences = confidences or []
        self.taxonomies = taxonomies or []
        self.sequences = sequences or []
        self.distributions = distributions or []
        self.references = references or []
        self.properties = properties or []
        self.clades = clades or []
        self.other = other or []

    @classmethod
    def from_clade(cls, clade, **kwargs):
        """Create a new PhyloXML Clade from a Newick or BaseTree Clade object.

        Branch length, name, confidence, width and color are copied, and the
        sub-clades are converted recursively (without ``kwargs``).

        Keyword arguments are the usual PhyloXML Clade constructor parameters.
        They are written straight into the instance dictionary of the
        top-level clade only, so property-backed names (``confidence``,
        ``taxonomy``) do not go through their setters.
        """
        new_clade = cls(branch_length=clade.branch_length, name=clade.name)
        new_clade.clades = [cls.from_clade(c) for c in clade]
        new_clade.confidence = clade.confidence
        new_clade.width = clade.width
        new_clade.color = (
            BranchColor(clade.color.red, clade.color.green, clade.color.blue)
            if clade.color
            else None
        )
        new_clade.__dict__.update(kwargs)
        return new_clade

    def to_phylogeny(self, **kwargs):
        """Create a new phylogeny containing just this clade.

        The phylogeny takes this clade as root and shares its ``date``;
        keyword arguments are written into the phylogeny's instance
        dictionary.
        """
        phy = Phylogeny(root=self, date=self.date)
        phy.__dict__.update(kwargs)
        return phy

    # Shortcuts for list attributes that are usually only 1 item
    # NB: Duplicated from Phylogeny class
    def _get_confidence(self):
        """Return the single confidence value, or None if there is none (PRIVATE).

        Raises ``AttributeError`` when more than one value is stored.
        """
        if len(self.confidences) == 0:
            return None
        if len(self.confidences) > 1:
            raise AttributeError(
                "more than 1 confidence value available; use Clade.confidences"
            )
        return self.confidences[0]

    def _set_confidence(self, value):
        """Set the confidence value (PRIVATE).

        ``None`` clears the list; a bare number is wrapped in ``Confidence``.
        Raises ``ValueError`` for any other type, or when more than one
        confidence is already stored.
        """
        if value is None:
            # Special case: mirror the behavior of _get_confidence
            self.confidences = []
            return
        if isinstance(value, (float, int)):
            value = Confidence(value)
        elif not isinstance(value, Confidence):
            raise ValueError("value must be a number or Confidence instance")
        if len(self.confidences) == 0:
            self.confidences.append(value)
        elif len(self.confidences) == 1:
            self.confidences[0] = value
        else:
            raise ValueError(
                "multiple confidence values already exist; "
                "use Clade.confidences instead"
            )

    def _del_confidence(self):
        """Delete confidences values (PRIVATE)."""
        self.confidences = []

    confidence = property(_get_confidence, _set_confidence, _del_confidence)

    def _get_taxonomy(self):
        """Return the single taxonomy, or None if there is none (PRIVATE).

        Raises ``AttributeError`` when more than one taxonomy is stored.
        """
        if len(self.taxonomies) == 0:
            return None
        if len(self.taxonomies) > 1:
            raise AttributeError(
                "more than 1 taxonomy value available; use Clade.taxonomies"
            )
        return self.taxonomies[0]

    def _set_taxonomy(self, value):
        """Set a taxonomy for the clade (PRIVATE).

        Raises ``ValueError`` if ``value`` is not a ``Taxonomy`` or if more
        than one taxonomy is already stored.
        """
        if not isinstance(value, Taxonomy):
            raise ValueError("assigned value must be a Taxonomy instance")
        if len(self.taxonomies) == 0:
            self.taxonomies.append(value)
        elif len(self.taxonomies) == 1:
            self.taxonomies[0] = value
        else:
            raise ValueError(
                "multiple taxonomy values already exist; "
                "use Clade.taxonomies instead"
            )

    taxonomy = property(_get_taxonomy, _set_taxonomy)
# PhyloXML wrapper for a special BaseTree attribute
class BranchColor(PhyloElement, BaseTree.BranchColor):
    """PhyloXML flavour of ``BaseTree.BranchColor``: a tree branch's color."""

    def __init__(self, *args, **kwargs):
        """Forward every argument unchanged to ``BaseTree.BranchColor``."""
        BaseTree.BranchColor.__init__(self, *args, **kwargs)
# PhyloXML-specific complex types
class Accession(PhyloElement):
    """Captures the local part in a sequence identifier.

    Example: for ``UniProtKB:P17304`` the ``value`` attribute is 'P17304' and
    the ``source`` attribute is 'UniProtKB'.
    """

    def __init__(self, value, source):
        """Store the accession ``value`` together with its ``source``."""
        self.value, self.source = value, source

    def __str__(self):
        """Format the accession as ``source:value``."""
        return "{}:{}".format(self.source, self.value)
class Annotation(PhyloElement):
    """The annotation of a molecular sequence.

    It is recommended to annotate by using the optional 'ref' attribute.

    :Parameters:
        ref : string
            reference string, e.g. 'GO:0008270',
            'KEGG:Tetrachloroethene degradation', 'EC:1.1.1.1'
        source : string
            plain-text source for this annotation
        evidence : str
            describe evidence as free text (e.g. 'experimental')
        desc : string
            free text description
        confidence : Confidence
            state the type and value of support (type Confidence)
        properties : list
            typed and referenced annotations from external resources
        uri : Uri
            link

    """

    # Expected shape of ``ref``: "<prefix>:<identifier>".
    re_ref = re.compile(r"[a-zA-Z0-9_]+:[a-zA-Z0-9_\.\-\s]+")

    def __init__(
        self,
        # Attributes
        ref=None,
        source=None,
        evidence=None,
        type=None,
        # Child nodes
        desc=None,
        confidence=None,
        uri=None,
        # Collection
        properties=None,
    ):
        """Validate ``ref`` (warning only) and store every field.

        A falsy ``properties`` is replaced by a fresh empty list.
        """
        _check_str(ref, self.re_ref.match)
        # XML attributes
        self.ref, self.source = ref, source
        self.evidence, self.type = evidence, type
        # child elements
        self.desc, self.confidence, self.uri = desc, confidence, uri
        # collection
        self.properties = properties or []
class BinaryCharacters(PhyloElement):
    """Binary characters at the root of a clade.

    Records the names and/or counts of binary characters that are present,
    gained, and lost at the root of a clade.
    """

    def __init__(
        self,
        # Attributes
        type=None,
        gained_count=None,
        lost_count=None,
        present_count=None,
        absent_count=None,
        # Child nodes (flattened into collections)
        gained=None,
        lost=None,
        present=None,
        absent=None,
    ):
        """Store the type, the four counts and the four character lists.

        Each falsy list argument is replaced by a fresh empty list.
        """
        self.type = type
        # counts are stored exactly as given
        self.gained_count, self.lost_count = gained_count, lost_count
        self.present_count, self.absent_count = present_count, absent_count
        # name collections
        self.gained = gained or []
        self.lost = lost or []
        self.present = present or []
        self.absent = absent or []
class CladeRelation(PhyloElement):
    """Expresses a typed relationship between two clades.

    One possible use is describing multiple parents of a clade.

    :type id_ref_0: str
    :type id_ref_1: str
    :type distance: str
    :type type: str
    :type confidence: Confidence
    """

    def __init__(self, type, id_ref_0, id_ref_1, distance=None, confidence=None):
        """Store the relation type, both clade references and optional extras."""
        self.type = type
        self.id_ref_0, self.id_ref_1 = id_ref_0, id_ref_1
        self.distance, self.confidence = distance, confidence
class Confidence(float, PhyloElement):
    """A general purpose confidence element.

    For example, this can be used to express the bootstrap support value of a
    clade (in which case the ``type`` attribute is 'bootstrap').

    Instances are floats, so they can be used directly in arithmetic and
    comparisons; ``type`` is carried along as an extra attribute.

    :Parameters:
        value : float
            confidence value
        type : string
            label for the type of confidence, e.g. 'bootstrap'

    """

    def __new__(cls, value, type="unknown"):
        """Create and return a Confidence object with the specified value and type.

        ``__new__`` rather than ``__init__`` is used because float is
        immutable: the numeric value must be fixed at construction time.
        """
        obj = super().__new__(cls, value)
        obj.type = type
        return obj

    @property
    def value(self):
        """Return the float value of the Confidence object.

        Read-only attribute-style access, matching the ``value`` attribute of
        the other element classes in this module.
        """
        return float(self)
class Date(PhyloElement):
    """A date associated with a clade/node.

    The date may be numerical (the 'value' element) and/or free text (the
    'desc' element, e.g. 'Silurian'). When a numerical value is given, using
    the 'unit' attribute as well is recommended.

    :Parameters:
        unit : string
            type of numerical value (e.g. 'mya' for 'million years ago')
        value : float
            the date value
        desc : string
            plain-text description of the date
        minimum : float
            lower bound on the date value
        maximum : float
            upper bound on the date value

    """

    def __init__(self, value=None, unit=None, desc=None, minimum=None, maximum=None):
        """Store the value, unit, description and bounds as given."""
        self.value, self.unit, self.desc = value, unit, desc
        self.minimum, self.maximum = minimum, maximum

    def __str__(self):
        """Give the most human-readable form available.

        Preference order: "<value> <unit>" when a unit and a value are both
        set, then the description, then the bare class name.
        """
        if self.unit and self.value is not None:
            text = f"{self.value} {self.unit}"
        elif self.desc is not None:
            text = self.desc
        else:
            text = self.__class__.__name__
        return text
class Distribution(PhyloElement):
    """Geographic distribution of the items of a clade (species, sequences).

    Meant for phylogeographic applications.

    :Parameters:
        desc : string
            free-text description of the location
        points : list of ``Point`` objects
            coordinates (similar to the 'Point' element in Google's KML format)
        polygons : list of ``Polygon`` objects
            coordinate sets defining geographic regions

    """

    def __init__(self, desc=None, points=None, polygons=None):
        """Store the description plus point and polygon lists.

        Falsy ``points`` / ``polygons`` are replaced by fresh empty lists.
        """
        self.desc = desc
        self.points, self.polygons = points or [], polygons or []
class DomainArchitecture(PhyloElement):
    """Domain architecture of a protein.

    :Parameters:
        length : int
            total length of the protein sequence
        domains : list ProteinDomain objects
            the domains within this protein

    """

    def __init__(self, length=None, domains=None):
        """Initialize values of the DomainArchitecture object.

        A falsy ``domains`` is replaced by a fresh empty list, as for every
        other collection attribute in this module, so the attribute can
        always be iterated.
        """
        self.length = length
        self.domains = domains or []
class Events(PhyloElement):
    """Events at the root node of a clade (e.g. one gene duplication).

    All attributes are set to None by default, but this object can also be
    treated as a dictionary, in which case None values are treated as missing
    keys and deleting a key resets that attribute's value back to None.

    The dictionary view covers instance attributes only: methods and class
    attributes such as ``ok_type`` are never keys.
    """

    ok_type = {
        "transfer",
        "fusion",
        "speciation_or_duplication",
        "other",
        "mixed",
        "unassigned",
    }

    def __init__(
        self,
        type=None,
        duplications=None,
        speciations=None,
        losses=None,
        confidence=None,
    ):
        """Initialize values of the Events object.

        ``type`` is checked against ``ok_type``; a mismatch only warns.
        """
        _check_str(type, self.ok_type.__contains__)
        self.type = type
        self.duplications = duplications
        self.speciations = speciations
        self.losses = losses
        self.confidence = confidence

    def items(self):
        """Return (name, value) pairs for every attribute that is not None."""
        return [(k, v) for k, v in self.__dict__.items() if v is not None]

    def keys(self):
        """Return the names of every attribute that is not None."""
        return [k for k, v in self.__dict__.items() if v is not None]

    def values(self):
        """Return the values of every attribute that is not None."""
        return [v for v in self.__dict__.values() if v is not None]

    def __len__(self):
        """Return the number of attributes that are not None."""
        return sum(1 for v in self.__dict__.values() if v is not None)

    def __getitem__(self, key):
        """Get the value stored under ``key``.

        Raises ``KeyError`` if ``key`` is not an instance attribute or its
        value is None.
        """
        # Read the instance dict, as keys()/items() do, so that methods and
        # class attributes are not mistaken for stored events.
        try:
            val = self.__dict__[key]
        except KeyError:
            raise KeyError(key) from None
        if val is None:
            raise KeyError(f"{key!r} has not been set in this object")
        return val

    def __setitem__(self, key, val):
        """Set the attribute named ``key`` to ``val``."""
        setattr(self, key, val)

    def __delitem__(self, key):
        """Reset the attribute named ``key`` back to None."""
        setattr(self, key, None)

    def __iter__(self):
        """Iterate over the names of attributes that are not None."""
        return iter(self.keys())

    def __contains__(self, key):
        """Return True if ``key`` is an instance attribute that is not None."""
        return self.__dict__.get(key) is not None
class Id(PhyloElement):
    """A general-purpose identifier element.

    Allows to indicate the provider (or authority) of an identifier, e.g. NCBI,
    along with the value itself.
    """

    def __init__(self, value, provider=None):
        """Initialize values for the identifier object."""
        self.value = value
        self.provider = provider

    def __str__(self):
        """Return identifier as a string.

        The form is ``provider:value`` when a provider is set, otherwise just
        the value. The value is always converted with ``str`` because
        ``__str__`` must return a string even if a non-string value (e.g. an
        int) was stored.
        """
        if self.provider is not None:
            return f"{self.provider}:{self.value}"
        return str(self.value)
class MolSeq(PhyloElement):
    """Store a molecular sequence.

    :Parameters:
        value : string
            the sequence itself
        is_aligned : bool
            True if this sequence is aligned with the others (usually meaning
            all aligned seqs are the same length and gaps may be present)

    """

    # Characters accepted at the start of a sequence string.
    re_value = re.compile(r"[a-zA-Z\.\-\?\*_]+")

    def __init__(self, value, is_aligned=None):
        """Validate ``value`` (warning only) and store both fields."""
        _check_str(value, self.re_value.match)
        self.value, self.is_aligned = value, is_aligned

    def __str__(self):
        """Give back the stored sequence value unchanged."""
        return self.value
class Point(PhyloElement):
    """Geographic coordinates of a point, with an optional altitude.

    Used by element 'Distribution'.

    :Parameters:
        geodetic_datum : string, required
            the geodetic datum (also called 'map datum'). For example, Google's
            KML uses 'WGS84'.
        lat : numeric
            latitude
        long : numeric
            longitude
        alt : numeric
            altitude
        alt_unit : string
            unit for the altitude (e.g. 'meter')

    """

    def __init__(self, geodetic_datum, lat, long, alt=None, alt_unit=None):
        """Store the datum, the coordinates and the optional altitude data."""
        self.geodetic_datum = geodetic_datum
        self.lat, self.long = lat, long
        self.alt, self.alt_unit = alt, alt_unit
class Polygon(PhyloElement):
    """A polygon defined by a list of 'Points' (used by element 'Distribution').

    :param points: list of 3 or more points representing vertices.
    """

    def __init__(self, points=None):
        """Store the vertex list; a falsy argument becomes a fresh empty list."""
        self.points = points or []

    def __str__(self):
        """Render as the class name wrapping the string form of each point."""
        vertices = ",\n".join(str(point) for point in self.points)
        return f"{self.__class__.__name__}([{vertices}])"
class Property(PhyloElement):
    """A typed and referenced property from an external resources.

    Can be attached to ``Phylogeny``, ``Clade``, and ``Annotation`` objects.

    :Parameters:
        value : string
            the value of the property
        ref : string
            reference to an external resource, e.g. "NOAA:depth"
        applies_to : string
            indicates the item to which a property applies to (e.g. 'node' for
            the parent node of a clade, 'parent_branch' for the parent branch of
            a clade, or just 'clade').
        datatype : string
            the type of a property; limited to xsd-datatypes
            (e.g. 'xsd:string', 'xsd:boolean', 'xsd:integer', 'xsd:decimal',
            'xsd:float', 'xsd:double', 'xsd:date', 'xsd:anyURI').
        unit : string (optional)
            the unit of the property, e.g. "METRIC:m"
        id_ref : Id (optional)
            allows to attached a property specifically to one element (on the
            xml-level)

    """

    # Expected shape of ``ref`` and ``unit``: "<prefix>:<identifier>".
    re_ref = re.compile(r"[a-zA-Z0-9_]+:[a-zA-Z0-9_\.\-\s]+")
    # Accepted values for ``applies_to``.
    ok_applies_to = {
        "phylogeny",
        "clade",
        "node",
        "annotation",
        "parent_branch",
        "other",
    }
    # Accepted values for ``datatype``.
    ok_datatype = {
        "xsd:string",
        "xsd:boolean",
        "xsd:decimal",
        "xsd:float",
        "xsd:double",
        "xsd:duration",
        "xsd:dateTime",
        "xsd:time",
        "xsd:date",
        "xsd:gYearMonth",
        "xsd:gYear",
        "xsd:gMonthDay",
        "xsd:gDay",
        "xsd:gMonth",
        "xsd:hexBinary",
        "xsd:base64Binary",
        "xsd:anyURI",
        "xsd:normalizedString",
        "xsd:token",
        "xsd:integer",
        "xsd:nonPositiveInteger",
        "xsd:negativeInteger",
        "xsd:long",
        "xsd:int",
        "xsd:short",
        "xsd:byte",
        "xsd:nonNegativeInteger",
        "xsd:unsignedLong",
        "xsd:unsignedInt",
        "xsd:unsignedShort",
        "xsd:unsignedByte",
        "xsd:positiveInteger",
    }

    def __init__(self, value, ref, applies_to, datatype, unit=None, id_ref=None):
        """Validate the string fields (warnings only), then store everything.

        Checks run in the order ref, applies_to, datatype, unit; ``None``
        values are not checked.
        """
        _check_str(ref, self.re_ref.match)
        _check_str(applies_to, self.ok_applies_to.__contains__)
        _check_str(datatype, self.ok_datatype.__contains__)
        _check_str(unit, self.re_ref.match)
        # required fields
        self.value, self.ref = value, ref
        self.applies_to, self.datatype = applies_to, datatype
        # optional fields
        self.unit, self.id_ref = unit, id_ref
class ProteinDomain(PhyloElement):
    """Represents an individual domain in a domain architecture.

    The locations use 0-based indexing, as most Python objects including
    SeqFeature do, rather than the usual biological convention starting at 1.
    This means the start and end attributes can be used directly as slice
    indexes on Seq objects.

    :Parameters:
        start : non-negative integer
            start of the domain on the sequence, using 0-based indexing
        end : non-negative integer
            end of the domain on the sequence
        confidence : float
            can be used to store e.g. E-values
        id : string
            unique identifier/name

    """

    def __init__(self, value, start, end, confidence=None, id=None):
        """Initialize value for a ProteinDomain object."""
        self.value = value
        self.start = start
        self.end = end
        self.confidence = confidence
        self.id = id

    @classmethod
    def from_seqfeature(cls, feat):
        """Create ProteinDomain object from SeqFeature.

        The feature's ``id`` becomes the domain ``value``; start and end come
        from ``feat.location``; ``confidence`` is read from the qualifiers
        (``None`` if absent). Built through ``cls`` so subclasses get
        instances of their own type.
        """
        return cls(
            feat.id,
            feat.location.start,
            feat.location.end,
            confidence=feat.qualifiers.get("confidence"),
        )

    def to_seqfeature(self):
        """Create a SeqFeature from the ProteinDomain Object.

        The domain ``value`` becomes the feature ``id``. The confidence is
        copied into ``qualifiers["confidence"]`` whenever the attribute
        exists, even if it is ``None``.
        """
        feat = SeqFeature(location=SimpleLocation(self.start, self.end), id=self.value)
        try:
            confidence = self.confidence
        except AttributeError:
            # Attribute was removed after construction: leave qualifier unset.
            pass
        else:
            feat.qualifiers["confidence"] = confidence
        return feat
class Reference(PhyloElement):
    """Literature reference for a clade.

    NB: Prefer the ``doi`` attribute over the free-text ``desc`` element
    whenever possible.
    """

    # Expected shape of ``doi``: "<prefix>/<suffix>".
    re_doi = re.compile(r"[a-zA-Z0-9_\.]+/[a-zA-Z0-9_\.]+")

    def __init__(self, doi=None, desc=None):
        """Validate ``doi`` (warning only) and store both fields."""
        _check_str(doi, self.re_doi.match)
        self.doi, self.desc = doi, desc
class Sequence(PhyloElement): | |
"""A molecular sequence (Protein, DNA, RNA) associated with a node. | |
One intended use for ``id_ref`` is to link a sequence to a taxonomy (via the | |
taxonomy's ``id_source``) in case of multiple sequences and taxonomies per | |
node. | |
:Parameters: | |
type : {'dna', 'rna', 'protein'} | |
type of molecule this sequence represents | |
id_ref : string | |
reference to another resource | |
id_source : string | |
source for the reference | |
symbol : string | |
short symbol of the sequence, e.g. 'ACTM' (max. 10 chars) | |
accession : Accession | |
accession code for this sequence. | |
name : string | |
full name of the sequence, e.g. 'muscle Actin' | |
location | |
location of a sequence on a genome/chromosome. | |
mol_seq : MolSeq | |
the molecular sequence itself | |
uri : Uri | |
link | |
annotations : list of Annotation objects | |
annotations on this sequence | |
domain_architecture : DomainArchitecture | |
protein domains on this sequence | |
other : list of Other objects | |
non-phyloXML elements | |
""" | |
types = {"dna", "rna", "protein"} | |
re_symbol = re.compile(r"\S{1,10}") | |
def __init__(
    self,
    # Attributes
    type=None,
    id_ref=None,
    id_source=None,
    # Child nodes
    symbol=None,
    accession=None,
    name=None,
    location=None,
    mol_seq=None,
    uri=None,
    domain_architecture=None,
    # Collections
    annotations=None,
    other=None,
):
    """Validate ``type`` and ``symbol`` (warnings only), then store all fields.

    Falsy ``annotations`` / ``other`` are replaced by fresh empty lists.
    """
    _check_str(type, self.types.__contains__)
    _check_str(symbol, self.re_symbol.match)
    # XML attributes
    self.type = type
    self.id_ref, self.id_source = id_ref, id_source
    # child elements
    self.symbol, self.accession, self.name = symbol, accession, name
    self.location, self.mol_seq, self.uri = location, mol_seq, uri
    self.domain_architecture = domain_architecture
    # collections
    self.annotations = annotations or []
    self.other = other or []
def from_seqrecord(cls, record, is_aligned=None): | |
"""Create a new PhyloXML Sequence from a SeqRecord object.""" | |
if is_aligned is None: | |
is_aligned = "-" in record.seq | |
params = { | |
"accession": Accession(record.id, ""), | |
"symbol": record.name, | |
"name": record.description, | |
"mol_seq": MolSeq(str(record.seq), is_aligned), | |
} | |
molecule_type = record.annotations.get("molecule_type") | |
if molecule_type is not None: | |
if "DNA" in molecule_type: | |
params["type"] = "dna" | |
elif "RNA" in molecule_type: | |
params["type"] = "rna" | |
elif "protein" in molecule_type: | |
params["type"] = "protein" | |
# Unpack record.annotations | |
for key in ("id_ref", "id_source", "location"): | |
if key in record.annotations: | |
params[key] = record.annotations[key] | |
if isinstance(record.annotations.get("uri"), dict): | |
params["uri"] = Uri(**record.annotations["uri"]) | |
# Build a Sequence.annotation object | |
if record.annotations.get("annotations"): | |
params["annotations"] = [] | |
for annot in record.annotations["annotations"]: | |
ann_args = {} | |
for key in ("ref", "source", "evidence", "type", "desc"): | |
if key in annot: | |
ann_args[key] = annot[key] | |
if isinstance(annot.get("confidence"), list): | |
ann_args["confidence"] = Confidence(*annot["confidence"]) | |
if isinstance(annot.get("properties"), list): | |
ann_args["properties"] = [ | |
Property(**prop) | |
for prop in annot["properties"] | |
if isinstance(prop, dict) | |
] | |
params["annotations"].append(Annotation(**ann_args)) | |
# Unpack record.features | |
if record.features: | |
params["domain_architecture"] = DomainArchitecture( | |
length=len(record.seq), | |
domains=[ | |
ProteinDomain.from_seqfeature(feat) for feat in record.features | |
], | |
) | |
return Sequence(**params) | |
def to_seqrecord(self): | |
"""Create a SeqRecord object from this Sequence instance. | |
The seqrecord.annotations dictionary is packed like so:: | |
{ # Sequence attributes with no SeqRecord equivalent: | |
'id_ref': self.id_ref, | |
'id_source': self.id_source, | |
'location': self.location, | |
'uri': { 'value': self.uri.value, | |
'desc': self.uri.desc, | |
'type': self.uri.type }, | |
# Sequence.annotations attribute (list of Annotations) | |
'annotations': [{'ref': ann.ref, | |
'source': ann.source, | |
'evidence': ann.evidence, | |
'type': ann.type, | |
'confidence': [ann.confidence.value, | |
ann.confidence.type], | |
'properties': [{'value': prop.value, | |
'ref': prop.ref, | |
'applies_to': prop.applies_to, | |
'datatype': prop.datatype, | |
'unit': prop.unit, | |
'id_ref': prop.id_ref} | |
for prop in ann.properties], | |
} for ann in self.annotations], | |
} | |
""" | |
def clean_dict(dct): | |
"""Remove None-valued items from a dictionary.""" | |
return {key: val for key, val in dct.items() if val is not None} | |
seqrec = SeqRecord( | |
Seq(self.mol_seq.value), | |
**clean_dict( | |
{ | |
"id": str(self.accession), | |
"name": self.symbol, | |
"description": self.name, | |
# 'dbxrefs': None, | |
} | |
), | |
) | |
if self.domain_architecture: | |
seqrec.features = [ | |
dom.to_seqfeature() for dom in self.domain_architecture.domains | |
] | |
# Sequence attributes with no SeqRecord equivalent | |
if self.type == "dna": | |
molecule_type = "DNA" | |
elif self.type == "rna": | |
molecule_type = "RNA" | |
elif self.type == "protein": | |
molecule_type = "protein" | |
else: | |
molecule_type = None | |
seqrec.annotations = clean_dict( | |
{ | |
"id_ref": self.id_ref, | |
"id_source": self.id_source, | |
"location": self.location, | |
"uri": self.uri | |
and clean_dict( | |
{ | |
"value": self.uri.value, | |
"desc": self.uri.desc, | |
"type": self.uri.type, | |
} | |
), | |
"molecule_type": molecule_type, | |
"annotations": self.annotations | |
and [ | |
clean_dict( | |
{ | |
"ref": ann.ref, | |
"source": ann.source, | |
"evidence": ann.evidence, | |
"type": ann.type, | |
"confidence": ann.confidence | |
and [ann.confidence.value, ann.confidence.type], | |
"properties": [ | |
clean_dict( | |
{ | |
"value": prop.value, | |
"ref": prop.ref, | |
"applies_to": prop.applies_to, | |
"datatype": prop.datatype, | |
"unit": prop.unit, | |
"id_ref": prop.id_ref, | |
} | |
) | |
for prop in ann.properties | |
], | |
} | |
) | |
for ann in self.annotations | |
], | |
} | |
) | |
return seqrec | |
class SequenceRelation(PhyloElement):
    """A typed relationship between two sequences.

    A typical use is recording an orthology between two sequences, in which
    case ``type`` is 'orthology'.

    :Parameters:
        id_ref_0 : Id
            first sequence reference identifier
        id_ref_1 : Id
            second sequence reference identifier
        distance : float
            distance between the two sequences
        type : restricted string
            the kind of relationship; see ``ok_type`` for the accepted values
        confidence : Confidence
            confidence value for this relation

    """

    # Relationship names accepted without a warning.
    # fmt: off
    ok_type = {
        "orthology", "one_to_one_orthology", "super_orthology",
        "paralogy", "ultra_paralogy",
        "xenology",
        "unknown", "other",
    }
    # fmt: on

    def __init__(self, type, id_ref_0, id_ref_1, distance=None, confidence=None):
        """Store the relation, warning if ``type`` is not in ``ok_type``."""
        _check_str(type, lambda text: text in self.ok_type)
        self.distance = distance
        self.type = type
        self.id_ref_0 = id_ref_0
        self.id_ref_1 = id_ref_1
        self.confidence = confidence
class Taxonomy(PhyloElement):
    """Taxonomic information for a clade.

    :Parameters:
        id_source : Id
            link other elements to a taxonomy (on the XML level)
        id : Id
            unique identifier of a taxon, e.g. Id('6500',
            provider='ncbi_taxonomy') for the California sea hare
        code : restricted string
            UniProt/Swiss-Prot style organism code, e.g. 'APLCA' for the
            California sea hare 'Aplysia californica'
        scientific_name : string
            the standard scientific name for this organism, e.g. 'Aplysia
            californica' for the California sea hare
        authority : string
            the authority, such as 'J. G. Cooper, 1863', associated with the
            'scientific_name'
        common_names : list of strings
            common names for this organism
        synonyms : list of strings
            synonyms for this taxon
        rank : restricted string
            taxonomic rank; see ``ok_rank`` for the accepted values
        uri : Uri
            link
        other : list of Other objects
            non-phyloXML elements

    """

    # NOTE(review): applied with ``.match``, which anchors only at the start of
    # the string, so codes longer than 10 characters are not flagged.
    re_code = re.compile(r"[a-zA-Z0-9_]{2,10}")
    # Rank names accepted without a warning, grouped by level.
    # fmt: off
    ok_rank = {
        "domain",
        "kingdom", "subkingdom", "branch", "infrakingdom",
        "superphylum", "phylum", "subphylum", "infraphylum", "microphylum",
        "superdivision", "division", "subdivision", "infradivision",
        "superclass", "class", "subclass", "infraclass",
        "superlegion", "legion", "sublegion", "infralegion",
        "supercohort", "cohort", "subcohort", "infracohort",
        "superorder", "order", "suborder",
        "superfamily", "family", "subfamily",
        "supertribe", "tribe", "subtribe", "infratribe",
        "genus", "subgenus",
        "superspecies", "species", "subspecies",
        "variety", "subvariety",
        "form", "subform",
        "cultivar",
        "unknown", "other",
    }
    # fmt: on

    def __init__(
        self,
        # Attributes
        id_source=None,
        # Child nodes
        id=None,
        code=None,
        scientific_name=None,
        authority=None,
        rank=None,
        uri=None,
        # Collections
        common_names=None,
        synonyms=None,
        other=None,
    ):
        """Store the taxonomy fields.

        ``code`` and ``rank`` are passed through ``_check_str``, which warns
        (it does not raise) when a non-None value fails its check. The three
        collection arguments default to new empty lists.
        """
        _check_str(code, self.re_code.match)
        _check_str(rank, lambda text: text in self.ok_rank)
        self.id_source = id_source
        self.id = id
        self.code = code
        self.scientific_name = scientific_name
        self.authority = authority
        self.rank = rank
        self.uri = uri
        self.common_names = common_names if common_names else []
        self.synonyms = synonyms if synonyms else []
        self.other = other if other else []

    def __str__(self):
        """Return the first identifying attribute that is set.

        The code, scientific name and rank are tried in that order, then the
        string form of ``id``; the class name is the last resort.
        """
        # Attributes are looked up one at a time so that a hit returns
        # before the later ones are touched.
        for attr_name in ("code", "scientific_name", "rank"):
            label = getattr(self, attr_name)
            if label is not None:
                return label
        if self.id is not None:
            return str(self.id)
        return self.__class__.__name__
class Uri(PhyloElement):
    """A uniform resource identifier.

    This is generally expected to be a URL -- for instance a link to an image
    on a website, where ``type`` might be 'image' and ``desc`` might be
    'image of a California sea hare'.

    :Parameters:
        value
            the identifier itself
        desc
            description of the linked resource
        type
            kind of resource being linked, e.g. 'image'

    """

    def __init__(self, value, desc=None, type=None):
        """Store the identifier with its optional description and type."""
        self.value = value
        self.desc = desc
        self.type = type

    def __str__(self):
        """Return the URI value, or this object's repr if the value is empty."""
        return self.value or repr(self)