Spaces:
No application file
No application file
# Copyright (C) 2009 by Eric Talevich ([email protected]) | |
# | |
# This file is part of the Biopython distribution and governed by your | |
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
# Please see the LICENSE file that should have been included as part of this | |
# package. | |
"""Base classes for Bio.Phylo objects. | |
All object representations for phylogenetic trees should derive from these base | |
classes in order to use the common methods defined on them. | |
""" | |
import collections | |
import copy | |
import itertools | |
import random | |
import re | |
import warnings | |
# General tree-traversal algorithms | |
def _level_traverse(root, get_children): | |
"""Traverse a tree in breadth-first (level) order (PRIVATE).""" | |
Q = collections.deque([root]) | |
while Q: | |
v = Q.popleft() | |
yield v | |
Q.extend(get_children(v)) | |
def _preorder_traverse(root, get_children): | |
"""Traverse a tree in depth-first pre-order (parent before children) (PRIVATE).""" | |
def dfs(elem): | |
yield elem | |
for v in get_children(elem): | |
yield from dfs(v) | |
yield from dfs(root) | |
def _postorder_traverse(root, get_children): | |
"""Traverse a tree in depth-first post-order (children before parent) (PRIVATE).""" | |
def dfs(elem): | |
for v in get_children(elem): | |
yield from dfs(v) | |
yield elem | |
yield from dfs(root) | |
def _sorted_attrs(elem): | |
"""Get a flat list of elem's attributes, sorted for consistency (PRIVATE).""" | |
singles = [] | |
lists = [] | |
# Sort attributes for consistent results | |
for attrname, child in sorted(elem.__dict__.items(), key=lambda kv: kv[0]): | |
if child is None: | |
continue | |
if isinstance(child, list): | |
lists.extend(child) | |
else: | |
singles.append(child) | |
return (x for x in singles + lists if isinstance(x, TreeElement)) | |
# Factory functions to generalize searching for clades/nodes | |
def _identity_matcher(target): | |
"""Match a node to the target object by identity (PRIVATE).""" | |
def match(node): | |
return node is target | |
return match | |
def _class_matcher(target_cls): | |
"""Match a node if it's an instance of the given class (PRIVATE).""" | |
def match(node): | |
return isinstance(node, target_cls) | |
return match | |
def _string_matcher(target): | |
def match(node): | |
if isinstance(node, (Clade, Tree)): | |
# Avoid triggering specialized or recursive magic methods | |
return node.name == target | |
return str(node) == target | |
return match | |
def _attribute_matcher(kwargs): | |
"""Match a node by specified attribute values (PRIVATE). | |
``terminal`` is a special case: True restricts the search to external (leaf) | |
nodes, False restricts to internal nodes, and None allows all tree elements | |
to be searched, including phyloXML annotations. | |
Otherwise, for a tree element to match the specification (i.e. for the | |
function produced by ``_attribute_matcher`` to return True when given a tree | |
element), it must have each of the attributes specified by the keys and | |
match each of the corresponding values -- think 'and', not 'or', for | |
multiple keys. | |
""" | |
def match(node): | |
if "terminal" in kwargs: | |
# Special case: restrict to internal/external/any nodes | |
kwa_copy = kwargs.copy() | |
pattern = kwa_copy.pop("terminal") | |
if pattern is not None and ( | |
not hasattr(node, "is_terminal") or node.is_terminal() != pattern | |
): | |
return False | |
else: | |
kwa_copy = kwargs | |
for key, pattern in kwa_copy.items(): | |
# Nodes must match all other specified attributes | |
if not hasattr(node, key): | |
return False | |
target = getattr(node, key) | |
if isinstance(pattern, str): | |
return isinstance(target, str) and re.match(pattern + "$", target) | |
if isinstance(pattern, bool): | |
return pattern == bool(target) | |
if isinstance(pattern, int): | |
return pattern == target | |
if pattern is None: | |
return target is None | |
raise TypeError(f"invalid query type: {type(pattern)}") | |
return True | |
return match | |
def _function_matcher(matcher_func): | |
"""Safer attribute lookup -- returns False instead of raising an error (PRIVATE).""" | |
def match(node): | |
try: | |
return matcher_func(node) | |
except (LookupError, AttributeError, ValueError, TypeError): | |
return False | |
return match | |
def _object_matcher(obj): | |
"""Retrieve a matcher function by passing an arbitrary object (PRIVATE). | |
Passing a ``TreeElement`` such as a ``Clade`` or ``Tree`` instance returns | |
an identity matcher, passing a type such as the ``PhyloXML.Taxonomy`` class | |
returns a class matcher, and passing a dictionary returns an attribute | |
matcher. | |
The resulting 'match' function returns True when given an object matching | |
the specification (identity, type or attribute values), otherwise False. | |
This is useful for writing functions that search the tree, and probably | |
shouldn't be used directly by the end user. | |
""" | |
if isinstance(obj, TreeElement): | |
return _identity_matcher(obj) | |
if isinstance(obj, type): | |
return _class_matcher(obj) | |
if isinstance(obj, str): | |
return _string_matcher(obj) | |
if isinstance(obj, dict): | |
return _attribute_matcher(obj) | |
if callable(obj): | |
return _function_matcher(obj) | |
raise ValueError(f"{obj} (type {type(obj)}) is not a valid type for comparison.") | |
def _combine_matchers(target, kwargs, require_spec): | |
"""Merge target specifications with keyword arguments (PRIVATE). | |
Dispatch the components to the various matcher functions, then merge into a | |
single boolean function. | |
""" | |
if not target: | |
if not kwargs: | |
if require_spec: | |
raise ValueError( | |
"you must specify a target object or keyword arguments." | |
) | |
return lambda x: True | |
return _attribute_matcher(kwargs) | |
match_obj = _object_matcher(target) | |
if not kwargs: | |
return match_obj | |
match_kwargs = _attribute_matcher(kwargs) | |
return lambda x: match_obj(x) and match_kwargs(x) | |
def _combine_args(first, *rest): | |
"""Convert ``[targets]`` or ``*targets`` arguments to a single iterable (PRIVATE). | |
This helps other functions work like the built-in functions ``max`` and | |
``min``. | |
""" | |
# Background: is_monophyletic takes a single list or iterable (like the | |
# same method in Bio.Nexus.Trees); root_with_outgroup and common_ancestor | |
# take separate arguments. This mismatch was in the initial release and I | |
# didn't notice the inconsistency until after Biopython 1.55. I can think | |
# of cases where either style is more convenient, so let's support both | |
# (for backward compatibility and consistency between methods). | |
if hasattr(first, "__iter__") and not ( | |
isinstance(first, TreeElement) | |
or isinstance(first, type) | |
or isinstance(first, str) | |
or isinstance(first, dict) | |
): | |
# terminals is an iterable of targets | |
if rest: | |
raise ValueError( | |
"Arguments must be either a single list of " | |
"targets, or separately specified targets " | |
"(e.g. foo(t1, t2, t3)), but not both." | |
) | |
return first | |
# terminals is a single target -- wrap in a container | |
return itertools.chain([first], rest) | |
# Class definitions | |
class TreeElement: | |
"""Base class for all Bio.Phylo classes.""" | |
def __repr__(self): | |
"""Show this object's constructor with its primitive arguments.""" | |
def pair_as_kwarg_string(key, val): | |
if isinstance(val, str): | |
val = val[:57] + "..." if len(val) > 60 else val | |
return f"{key}='{val}'" | |
return f"{key}={val}" | |
return "%s(%s)" % ( | |
self.__class__.__name__, | |
", ".join( | |
pair_as_kwarg_string(key, val) | |
for key, val in sorted(self.__dict__.items()) | |
if val is not None and type(val) in (str, int, float, bool, str) | |
), | |
) | |
__str__ = __repr__ | |
class TreeMixin: | |
"""Methods for Tree- and Clade-based classes. | |
This lets ``Tree`` and ``Clade`` support the same traversal and searching | |
operations without requiring Clade to inherit from Tree, so Clade isn't | |
required to have all of Tree's attributes -- just ``root`` (a Clade | |
instance) and ``is_terminal``. | |
""" | |
# Traversal methods | |
def _filter_search(self, filter_func, order, follow_attrs): | |
"""Perform a BFS or DFS traversal through all elements in this tree (PRIVATE). | |
:returns: generator of all elements for which ``filter_func`` is True. | |
""" | |
order_opts = { | |
"preorder": _preorder_traverse, | |
"postorder": _postorder_traverse, | |
"level": _level_traverse, | |
} | |
try: | |
order_func = order_opts[order] | |
except KeyError: | |
raise ValueError( | |
f"Invalid order '{order}'; must be one of: {tuple(order_opts)}" | |
) from None | |
if follow_attrs: | |
get_children = _sorted_attrs | |
root = self | |
else: | |
get_children = lambda elem: elem.clades # noqa: E731 | |
root = self.root | |
return filter(filter_func, order_func(root, get_children)) | |
def find_any(self, *args, **kwargs): | |
"""Return the first element found by find_elements(), or None. | |
This is also useful for checking whether any matching element exists in | |
the tree, and can be used in a conditional expression. | |
""" | |
hits = self.find_elements(*args, **kwargs) | |
try: | |
return next(hits) | |
except StopIteration: | |
return None | |
def find_elements(self, target=None, terminal=None, order="preorder", **kwargs): | |
"""Find all tree elements matching the given attributes. | |
The arbitrary keyword arguments indicate the attribute name of the | |
sub-element and the value to match: string, integer or boolean. Strings | |
are evaluated as regular expression matches; integers are compared | |
directly for equality, and booleans evaluate the attribute's truth value | |
(True or False) before comparing. To handle nonzero floats, search with | |
a boolean argument, then filter the result manually. | |
If no keyword arguments are given, then just the class type is used for | |
matching. | |
The result is an iterable through all matching objects, by depth-first | |
search. (Not necessarily the same order as the elements appear in the | |
source file!) | |
:Parameters: | |
target : TreeElement instance, type, dict, or callable | |
Specifies the characteristics to search for. (The default, | |
TreeElement, matches any standard Bio.Phylo type.) | |
terminal : bool | |
A boolean value to select for or against terminal nodes (a.k.a. | |
leaf nodes). True searches for only terminal nodes, False | |
excludes terminal nodes, and the default, None, searches both | |
terminal and non-terminal nodes, as well as any tree elements | |
lacking the ``is_terminal`` method. | |
order : {'preorder', 'postorder', 'level'} | |
Tree traversal order: 'preorder' (default) is depth-first | |
search, 'postorder' is DFS with child nodes preceding parents, | |
and 'level' is breadth-first search. | |
Examples | |
-------- | |
>>> from Bio import Phylo | |
>>> phx = Phylo.PhyloXMLIO.read('PhyloXML/phyloxml_examples.xml') | |
>>> matches = phx.phylogenies[5].find_elements(code='OCTVU') | |
>>> next(matches) | |
Taxonomy(code='OCTVU', scientific_name='Octopus vulgaris') | |
""" | |
if terminal is not None: | |
kwargs["terminal"] = terminal | |
is_matching_elem = _combine_matchers(target, kwargs, False) | |
return self._filter_search(is_matching_elem, order, True) | |
def find_clades(self, target=None, terminal=None, order="preorder", **kwargs): | |
"""Find each clade containing a matching element. | |
That is, find each element as with find_elements(), but return the | |
corresponding clade object. (This is usually what you want.) | |
:returns: an iterable through all matching objects, searching | |
depth-first (preorder) by default. | |
""" | |
def match_attrs(elem): | |
orig_clades = elem.__dict__.pop("clades") | |
found = elem.find_any(target, **kwargs) | |
elem.clades = orig_clades | |
return found is not None | |
if terminal is None: | |
is_matching_elem = match_attrs | |
else: | |
def is_matching_elem(elem): | |
return (elem.is_terminal() == terminal) and match_attrs(elem) | |
return self._filter_search(is_matching_elem, order, False) | |
def get_path(self, target=None, **kwargs): | |
"""List the clades directly between this root and the given target. | |
:returns: list of all clade objects along this path, ending with the | |
given target, but excluding the root clade. | |
""" | |
# Only one path will work -- ignore weights and visits | |
path = [] | |
match = _combine_matchers(target, kwargs, True) | |
def check_in_path(v): | |
if match(v): | |
path.append(v) | |
return True | |
elif v.is_terminal(): | |
return False | |
for child in v: | |
if check_in_path(child): | |
path.append(v) | |
return True | |
return False | |
if not check_in_path(self.root): | |
return None | |
return path[-2::-1] | |
def get_nonterminals(self, order="preorder"): | |
"""Get a list of all of this tree's nonterminal (internal) nodes.""" | |
return list(self.find_clades(terminal=False, order=order)) | |
def get_terminals(self, order="preorder"): | |
"""Get a list of all of this tree's terminal (leaf) nodes.""" | |
return list(self.find_clades(terminal=True, order=order)) | |
def trace(self, start, finish): | |
"""List of all clade object between two targets in this tree. | |
Excluding ``start``, including ``finish``. | |
""" | |
mrca = self.common_ancestor(start, finish) | |
fromstart = mrca.get_path(start)[-2::-1] | |
to = mrca.get_path(finish) | |
return fromstart + [mrca] + to | |
# Information methods | |
def common_ancestor(self, targets, *more_targets): | |
"""Most recent common ancestor (clade) of all the given targets. | |
Edge cases: | |
- If no target is given, returns self.root | |
- If 1 target is given, returns the target | |
- If any target is not found in this tree, raises a ValueError | |
""" | |
paths = [self.get_path(t) for t in _combine_args(targets, *more_targets)] | |
# Validation -- otherwise izip throws a spooky error below | |
for p, t in zip(paths, targets): | |
if p is None: | |
raise ValueError(f"target {t!r} is not in this tree") | |
mrca = self.root | |
for level in zip(*paths): | |
ref = level[0] | |
for other in level[1:]: | |
if ref is not other: | |
break | |
else: | |
mrca = ref | |
if ref is not mrca: | |
break | |
return mrca | |
def count_terminals(self): | |
"""Count the number of terminal (leaf) nodes within this tree.""" | |
return sum(1 for clade in self.find_clades(terminal=True)) | |
def depths(self, unit_branch_lengths=False): # noqa: D402 | |
"""Create a mapping of tree clades to depths (by branch length). | |
:Parameters: | |
unit_branch_lengths : bool | |
If True, count only the number of branches (levels in the tree). | |
By default the distance is the cumulative branch length leading | |
to the clade. | |
:returns: dict of {clade: depth}, where keys are all of the Clade | |
instances in the tree, and values are the distance from the root to | |
each clade (including terminals). | |
""" # noqa: D402 | |
if unit_branch_lengths: | |
depth_of = lambda c: 1 # noqa: E731 | |
else: | |
depth_of = lambda c: c.branch_length or 0 # noqa: E731 | |
depths = {} | |
def update_depths(node, curr_depth): | |
depths[node] = curr_depth | |
for child in node.clades: | |
new_depth = curr_depth + depth_of(child) | |
update_depths(child, new_depth) | |
update_depths(self.root, self.root.branch_length or 0) | |
return depths | |
def distance(self, target1, target2=None): | |
"""Calculate the sum of the branch lengths between two targets. | |
If only one target is specified, the other is the root of this tree. | |
""" | |
if target2 is None: | |
return sum( | |
n.branch_length | |
for n in self.get_path(target1) | |
if n.branch_length is not None | |
) | |
mrca = self.common_ancestor(target1, target2) | |
return mrca.distance(target1) + mrca.distance(target2) | |
def is_bifurcating(self): | |
"""Return True if tree downstream of node is strictly bifurcating. | |
I.e., all nodes have either 2 or 0 children (internal or external, | |
respectively). The root may have 3 descendents and still be considered | |
part of a bifurcating tree, because it has no ancestor. | |
""" | |
# Root can be trifurcating | |
if isinstance(self, Tree) and len(self.root) == 3: | |
return ( | |
self.root.clades[0].is_bifurcating() | |
and self.root.clades[1].is_bifurcating() | |
and self.root.clades[2].is_bifurcating() | |
) | |
if len(self.root) == 2: | |
return ( | |
self.root.clades[0].is_bifurcating() | |
and self.root.clades[1].is_bifurcating() | |
) | |
if len(self.root) == 0: | |
return True | |
return False | |
def is_monophyletic(self, terminals, *more_terminals): | |
"""MRCA of terminals if they comprise a complete subclade, or False. | |
I.e., there exists a clade such that its terminals are the same set as | |
the given targets. | |
The given targets must be terminals of the tree. | |
To match both ``Bio.Nexus.Trees`` and the other multi-target methods in | |
Bio.Phylo, arguments to this method can be specified either of two ways: | |
(i) as a single list of targets, or (ii) separately specified targets, | |
e.g. is_monophyletic(t1, t2, t3) -- but not both. | |
For convenience, this method returns the common ancestor (MCRA) of the | |
targets if they are monophyletic (instead of the value True), and False | |
otherwise. | |
:returns: common ancestor if terminals are monophyletic, otherwise False. | |
""" | |
target_set = set(_combine_args(terminals, *more_terminals)) | |
current = self.root | |
while True: | |
if set(current.get_terminals()) == target_set: | |
return current | |
# Try a narrower subclade | |
for subclade in current.clades: | |
if set(subclade.get_terminals()).issuperset(target_set): | |
current = subclade | |
break | |
else: | |
return False | |
def is_parent_of(self, target=None, **kwargs): | |
"""Check if target is a descendent of this tree. | |
Not required to be a direct descendent. | |
To check only direct descendents of a clade, simply use list membership | |
testing: ``if subclade in clade: ...`` | |
""" | |
return self.get_path(target, **kwargs) is not None | |
def is_preterminal(self): | |
"""Check if all direct descendents are terminal.""" | |
if self.root.is_terminal(): | |
return False | |
for clade in self.root.clades: | |
if not clade.is_terminal(): | |
return False | |
return True | |
def total_branch_length(self): | |
"""Calculate the sum of all the branch lengths in this tree.""" | |
return sum(node.branch_length for node in self.find_clades(branch_length=True)) | |
# Tree manipulation methods | |
def collapse(self, target=None, **kwargs): | |
"""Delete target from the tree, relinking its children to its parent. | |
:returns: the parent clade. | |
""" | |
path = self.get_path(target, **kwargs) | |
if not path: | |
raise ValueError("couldn't collapse %s in this tree" % (target or kwargs)) | |
if len(path) == 1: | |
parent = self.root | |
else: | |
parent = path[-2] | |
popped = parent.clades.pop(parent.clades.index(path[-1])) | |
extra_length = popped.branch_length or 0 | |
for child in popped: | |
child.branch_length += extra_length | |
parent.clades.extend(popped.clades) | |
return parent | |
def collapse_all(self, target=None, **kwargs): | |
"""Collapse all the descendents of this tree, leaving only terminals. | |
Total branch lengths are preserved, i.e. the distance to each terminal | |
stays the same. | |
For example, this will safely collapse nodes with poor bootstrap | |
support: | |
>>> from Bio import Phylo | |
>>> tree = Phylo.read('PhyloXML/apaf.xml', 'phyloxml') | |
>>> print("Total branch length %0.2f" % tree.total_branch_length()) | |
Total branch length 20.44 | |
>>> tree.collapse_all(lambda c: c.confidence is not None and c.confidence < 70) | |
>>> print("Total branch length %0.2f" % tree.total_branch_length()) | |
Total branch length 21.37 | |
This implementation avoids strange side-effects by using level-order | |
traversal and testing all clade properties (versus the target | |
specification) up front. In particular, if a clade meets the target | |
specification in the original tree, it will be collapsed. For example, | |
if the condition is: | |
>>> from Bio import Phylo | |
>>> tree = Phylo.read('PhyloXML/apaf.xml', 'phyloxml') | |
>>> print("Total branch length %0.2f" % tree.total_branch_length()) | |
Total branch length 20.44 | |
>>> tree.collapse_all(lambda c: c.branch_length < 0.1) | |
>>> print("Total branch length %0.2f" % tree.total_branch_length()) | |
Total branch length 21.13 | |
Collapsing a clade's parent node adds the parent's branch length to the | |
child, so during the execution of collapse_all, a clade's branch_length | |
may increase. In this implementation, clades are collapsed according to | |
their properties in the original tree, not the properties when tree | |
traversal reaches the clade. (It's easier to debug.) If you want the | |
other behavior (incremental testing), modifying the source code of this | |
function is straightforward. | |
""" | |
# Read the iterable into a list to protect against in-place changes | |
matches = list(self.find_clades(target, False, "level", **kwargs)) | |
if not matches: | |
# No matching nodes to collapse | |
return | |
# Skip the root node -- it can't be collapsed | |
if matches[0] == self.root: | |
matches.pop(0) | |
for clade in matches: | |
self.collapse(clade) | |
def ladderize(self, reverse=False): | |
"""Sort clades in-place according to the number of terminal nodes. | |
Deepest clades are last by default. Use ``reverse=True`` to sort clades | |
deepest-to-shallowest. | |
""" | |
self.root.clades.sort(key=lambda c: c.count_terminals(), reverse=reverse) | |
for subclade in self.root.clades: | |
subclade.ladderize(reverse=reverse) | |
def prune(self, target=None, **kwargs): | |
"""Prunes a terminal clade from the tree. | |
If taxon is from a bifurcation, the connecting node will be collapsed | |
and its branch length added to remaining terminal node. This might be no | |
longer be a meaningful value. | |
:returns: parent clade of the pruned target | |
""" | |
if "terminal" in kwargs and kwargs["terminal"]: | |
raise ValueError("target must be terminal") | |
path = self.get_path(target, terminal=True, **kwargs) | |
if not path: | |
raise ValueError("can't find a matching target below this root") | |
if len(path) == 1: | |
parent = self.root | |
else: | |
parent = path[-2] | |
parent.clades.remove(path[-1]) | |
if len(parent) == 1: | |
# We deleted a branch from a bifurcation | |
if parent == self.root: | |
# If we're at the root, move the root upwards | |
# NB: This loses the length of the original branch | |
newroot = parent.clades[0] | |
newroot.branch_length = None | |
parent = self.root = newroot | |
else: | |
# If we're not at the root, collapse this parent | |
child = parent.clades[0] | |
if child.branch_length is not None: | |
child.branch_length += parent.branch_length or 0.0 | |
if len(path) < 3: | |
grandparent = self.root | |
else: | |
grandparent = path[-3] | |
# Replace parent with child at the same place in grandparent | |
index = grandparent.clades.index(parent) | |
grandparent.clades.pop(index) | |
grandparent.clades.insert(index, child) | |
parent = grandparent | |
return parent | |
def split(self, n=2, branch_length=1.0): | |
"""Generate n (default 2) new descendants. | |
In a species tree, this is a speciation event. | |
New clades have the given branch_length and the same name as this | |
clade's root plus an integer suffix (counting from 0). For example, | |
splitting a clade named "A" produces sub-clades named "A0" and "A1". | |
If the clade has no name, the prefix "n" is used for child nodes, e.g. | |
"n0" and "n1". | |
""" | |
clade_cls = type(self.root) | |
base_name = self.root.name or "n" | |
for i in range(n): | |
clade = clade_cls(name=base_name + str(i), branch_length=branch_length) | |
self.root.clades.append(clade) | |
class Tree(TreeElement, TreeMixin): | |
"""A phylogenetic tree, containing global info for the phylogeny. | |
The structure and node-specific data is accessible through the 'root' | |
clade attached to the Tree instance. | |
:Parameters: | |
root : Clade | |
The starting node of the tree. If the tree is rooted, this will | |
usually be the root node. | |
rooted : bool | |
Whether or not the tree is rooted. By default, a tree is assumed to | |
be rooted. | |
id : str | |
The identifier of the tree, if there is one. | |
name : str | |
The name of the tree, in essence a label. | |
""" | |
def __init__(self, root=None, rooted=True, id=None, name=None): | |
"""Initialize parameter for phylogenetic tree.""" | |
self.root = root or Clade() | |
self.rooted = rooted | |
self.id = id | |
self.name = name | |
def from_clade(cls, clade, **kwargs): | |
"""Create a new Tree object given a clade. | |
Keyword arguments are the usual ``Tree`` constructor parameters. | |
""" | |
root = copy.deepcopy(clade) | |
return cls(root, **kwargs) | |
def randomized(cls, taxa, branch_length=1.0, branch_stdev=None): | |
"""Create a randomized bifurcating tree given a list of taxa. | |
:param taxa: Either an integer specifying the number of taxa to create | |
(automatically named taxon#), or an iterable of taxon names, as | |
strings. | |
:returns: a tree of the same type as this class. | |
""" | |
if isinstance(taxa, int): | |
taxa = [f"taxon{i + 1}" for i in range(taxa)] | |
elif hasattr(taxa, "__iter__"): | |
taxa = list(taxa) | |
else: | |
raise TypeError( | |
"taxa argument must be integer (# taxa) or iterable of taxon names." | |
) | |
rtree = cls() | |
terminals = [rtree.root] | |
while len(terminals) < len(taxa): | |
newsplit = random.choice(terminals) | |
newsplit.split(branch_length=branch_length) | |
newterms = newsplit.clades | |
if branch_stdev: | |
# Add some noise to the branch lengths | |
for nt in newterms: | |
nt.branch_length = max(0, random.gauss(branch_length, branch_stdev)) | |
terminals.remove(newsplit) | |
terminals.extend(newterms) | |
# Distribute taxon labels randomly | |
random.shuffle(taxa) | |
for node, name in zip(terminals, taxa): | |
node.name = name | |
return rtree | |
def clade(self): | |
"""Return first clade in this tree (not itself).""" | |
return self.root | |
def as_phyloxml(self, **kwargs): | |
"""Convert this tree to a PhyloXML-compatible Phylogeny. | |
This lets you use the additional annotation types PhyloXML defines, and | |
save this information when you write this tree as 'phyloxml'. | |
""" | |
from Bio.Phylo.PhyloXML import Phylogeny | |
return Phylogeny.from_tree(self, **kwargs) | |
def root_with_outgroup( | |
self, outgroup_targets, *more_targets, outgroup_branch_length=None | |
): | |
"""Reroot this tree with the outgroup clade containing outgroup_targets. | |
Operates in-place. | |
Edge cases: | |
- If ``outgroup == self.root``, no change | |
- If outgroup is terminal, create new bifurcating root node with a | |
0-length branch to the outgroup | |
- If outgroup is internal, use the given outgroup node as the new | |
trifurcating root, keeping branches the same | |
- If the original root was bifurcating, drop it from the tree, | |
preserving total branch lengths | |
:param outgroup_branch_length: length of the branch leading to the | |
outgroup after rerooting. If not specified (None), then: | |
- If the outgroup is an internal node (not a single terminal taxon), | |
then use that node as the new root. | |
- Otherwise, create a new root node as the parent of the outgroup. | |
""" | |
# This raises a ValueError if any target is not in this tree | |
# Otherwise, common_ancestor guarantees outgroup is in this tree | |
outgroup = self.common_ancestor(outgroup_targets, *more_targets) | |
outgroup_path = self.get_path(outgroup) | |
if len(outgroup_path) == 0: | |
# Outgroup is the current root -- no change | |
return | |
prev_blen = outgroup.branch_length or 0.0 | |
if outgroup.is_terminal() or outgroup_branch_length is not None: | |
# Create a new root with a 0-length branch to the outgroup | |
outgroup.branch_length = outgroup_branch_length or 0.0 | |
new_root = self.root.__class__( | |
branch_length=self.root.branch_length, clades=[outgroup] | |
) | |
# The first branch reversal (see the upcoming loop) is modified | |
if len(outgroup_path) == 1: | |
# No nodes between the original root and outgroup to rearrange. | |
# Most of the code below will be skipped, but we still need | |
# 'new_parent' pointing at the new root. | |
new_parent = new_root | |
else: | |
parent = outgroup_path.pop(-2) | |
# First iteration of reversing the path to the outgroup | |
parent.clades.pop(parent.clades.index(outgroup)) | |
(prev_blen, parent.branch_length) = ( | |
parent.branch_length, | |
prev_blen - outgroup.branch_length, | |
) | |
new_root.clades.insert(0, parent) | |
new_parent = parent | |
else: | |
# Use the given outgroup node as the new (trifurcating) root | |
new_root = outgroup | |
new_root.branch_length = self.root.branch_length | |
new_parent = new_root | |
# Tracing the outgroup lineage backwards, reattach the subclades under a | |
# new root clade. Reverse the branches directly above the outgroup in | |
# the tree, but keep the descendants of those clades as they are. | |
for parent in outgroup_path[-2::-1]: | |
parent.clades.pop(parent.clades.index(new_parent)) | |
prev_blen, parent.branch_length = parent.branch_length, prev_blen | |
new_parent.clades.insert(0, parent) | |
new_parent = parent | |
# Finally, handle the original root according to number of descendents | |
old_root = self.root | |
if outgroup in old_root.clades: | |
assert len(outgroup_path) == 1 | |
old_root.clades.pop(old_root.clades.index(outgroup)) | |
else: | |
old_root.clades.pop(old_root.clades.index(new_parent)) | |
if len(old_root) == 1: | |
# Delete the old bifurcating root & add branch lengths | |
ingroup = old_root.clades[0] | |
if ingroup.branch_length: | |
ingroup.branch_length += prev_blen | |
else: | |
ingroup.branch_length = prev_blen | |
new_parent.clades.insert(0, ingroup) | |
# ENH: If annotations are attached to old_root, do... something. | |
else: | |
# Keep the old trifurcating/polytomous root as an internal node | |
old_root.branch_length = prev_blen | |
new_parent.clades.insert(0, old_root) | |
self.root = new_root | |
self.rooted = True | |
def root_at_midpoint(self): | |
"""Root the tree at the midpoint of the two most distant taxa. | |
This operates in-place, leaving a bifurcating root. The topology of the | |
tree is otherwise retained, though no guarantees are made about the | |
stability of clade/node/taxon ordering. | |
""" | |
# Identify the largest pairwise distance | |
max_distance = 0.0 | |
tips = self.get_terminals() | |
for tip in tips: | |
self.root_with_outgroup(tip) | |
new_max = max(self.depths().items(), key=lambda nd: nd[1]) | |
if new_max[1] > max_distance: | |
tip1 = tip | |
tip2 = new_max[0] | |
max_distance = new_max[1] | |
self.root_with_outgroup(tip1) | |
# Depth to go from the ingroup tip toward the outgroup tip | |
root_remainder = 0.5 * (max_distance - (self.root.branch_length or 0)) | |
assert root_remainder >= 0 | |
# Identify the midpoint and reroot there. | |
# Trace the path to the outgroup tip until all of the root depth has | |
# been traveled/accounted for. | |
for node in self.get_path(tip2): | |
root_remainder -= node.branch_length | |
if root_remainder < 0: | |
outgroup_node = node | |
outgroup_branch_length = -root_remainder | |
break | |
else: | |
raise ValueError("Somehow, failed to find the midpoint!") | |
self.root_with_outgroup( | |
outgroup_node, outgroup_branch_length=outgroup_branch_length | |
) | |
# Method assumed by TreeMixin | |
def is_terminal(self): | |
"""Check if the root of this tree is terminal.""" | |
return not self.root.clades | |
# Convention from SeqRecord and Alignment classes | |
def __format__(self, format_spec): | |
"""Serialize the tree as a string in the specified file format. | |
This method supports Python's ``format`` built-in function. | |
:param format_spec: a lower-case string supported by ``Bio.Phylo.write`` | |
as an output file format. | |
""" | |
if format_spec: | |
from io import StringIO | |
from Bio.Phylo import _io | |
handle = StringIO() | |
_io.write([self], handle, format_spec) | |
return handle.getvalue() | |
else: | |
# Follow python convention and default to using __str__ | |
return str(self) | |
def format(self, fmt=None): | |
"""Serialize the tree as a string in the specified file format. | |
:param fmt: a lower-case string supported by ``Bio.Phylo.write`` | |
as an output file format. | |
""" | |
return self.__format__(fmt) | |
# Pretty-printer for the entire tree hierarchy | |
def __str__(self): | |
"""Return a string representation of the entire tree. | |
Serialize each sub-clade recursively using ``repr`` to create a summary | |
of the object structure. | |
""" | |
TAB = " " | |
textlines = [] | |
def print_tree(obj, indent): | |
"""Recursively serialize sub-elements. | |
This closes over textlines and modifies it in-place. | |
""" | |
if isinstance(obj, (Tree, Clade)): | |
# Avoid infinite recursion or special formatting from str() | |
objstr = repr(obj) | |
else: | |
objstr = str(obj) | |
textlines.append(TAB * indent + objstr) | |
indent += 1 | |
for attr in obj.__dict__: | |
child = getattr(obj, attr) | |
if isinstance(child, TreeElement): | |
print_tree(child, indent) | |
elif isinstance(child, list): | |
for elem in child: | |
if isinstance(elem, TreeElement): | |
print_tree(elem, indent) | |
print_tree(self, 0) | |
return "\n".join(textlines) | |
class Clade(TreeElement, TreeMixin): | |
"""A recursively defined sub-tree. | |
:Parameters: | |
branch_length : str | |
The length of the branch leading to the root node of this clade. | |
name : str | |
The clade's name (a label). | |
clades : list | |
Sub-trees rooted directly under this tree's root. | |
confidence : number | |
Support. | |
color : BranchColor | |
The display color of the branch and descendents. | |
width : number | |
The display width of the branch and descendents. | |
""" | |
def __init__( | |
self, | |
branch_length=None, | |
name=None, | |
clades=None, | |
confidence=None, | |
color=None, | |
width=None, | |
): | |
"""Define parameters for the Clade tree.""" | |
self.branch_length = branch_length | |
self.name = name | |
self.clades = clades or [] | |
self.confidence = confidence | |
self.color = color | |
self.width = width | |
def root(self): | |
"""Allow TreeMixin methods to traverse clades properly.""" | |
return self | |
def is_terminal(self): | |
"""Check if this is a terminal (leaf) node.""" | |
return not self.clades | |
# Sequence-type behavior methods | |
def __getitem__(self, index): | |
"""Get clades by index (integer or slice).""" | |
if isinstance(index, (int, slice)): | |
return self.clades[index] | |
ref = self | |
for idx in index: | |
ref = ref[idx] | |
return ref | |
def __iter__(self): | |
"""Iterate through this tree's direct descendent clades (sub-trees).""" | |
return iter(self.clades) | |
def __len__(self): | |
"""Return the number of clades directly under the root.""" | |
return len(self.clades) | |
def __bool__(self): | |
"""Boolean value of an instance of this class (True). | |
NB: If this method is not defined, but ``__len__`` is, then the object | |
is considered true if the result of ``__len__()`` is nonzero. We want | |
Clade instances to always be considered True. | |
""" | |
return True | |
def __str__(self): | |
"""Return name of the class instance.""" | |
if self.name: | |
return self.name[:37] + "..." if len(self.name) > 40 else self.name | |
return self.__class__.__name__ | |
# Syntax sugar for setting the branch color | |
def _get_color(self): | |
return self._color | |
def _set_color(self, arg): | |
if arg is None or isinstance(arg, BranchColor): | |
self._color = arg | |
elif isinstance(arg, str): | |
if arg in BranchColor.color_names: | |
# Known color name | |
self._color = BranchColor.from_name(arg) | |
elif arg.startswith("#") and len(arg) == 7: | |
# HTML-style hex string | |
self._color = BranchColor.from_hex(arg) | |
else: | |
raise ValueError(f"invalid color string {arg}") | |
elif hasattr(arg, "__iter__") and len(arg) == 3: | |
# RGB triplet | |
self._color = BranchColor(*arg) | |
else: | |
raise ValueError(f"invalid color value {arg}") | |
color = property(_get_color, _set_color, doc="Branch color.") | |
class BranchColor: | |
"""Indicates the color of a clade when rendered graphically. | |
The color should be interpreted by client code (e.g. visualization | |
programs) as applying to the whole clade, unless overwritten by the | |
color(s) of sub-clades. | |
Color values must be integers from 0 to 255. | |
""" | |
color_names = { | |
"red": (255, 0, 0), | |
"r": (255, 0, 0), | |
"yellow": (255, 255, 0), | |
"y": (255, 255, 0), | |
"green": (0, 128, 0), | |
"g": (0, 128, 0), | |
"cyan": (0, 255, 255), | |
"c": (0, 255, 255), | |
"blue": (0, 0, 255), | |
"b": (0, 0, 255), | |
"magenta": (255, 0, 255), | |
"m": (255, 0, 255), | |
"black": (0, 0, 0), | |
"k": (0, 0, 0), | |
"white": (255, 255, 255), | |
"w": (255, 255, 255), | |
# Names standardized in HTML/CSS spec | |
# http://w3schools.com/html/html_colornames.asp | |
"maroon": (128, 0, 0), | |
"olive": (128, 128, 0), | |
"lime": (0, 255, 0), | |
"aqua": (0, 255, 255), | |
"teal": (0, 128, 128), | |
"navy": (0, 0, 128), | |
"fuchsia": (255, 0, 255), | |
"purple": (128, 0, 128), | |
"silver": (192, 192, 192), | |
"gray": (128, 128, 128), | |
# More definitions from matplotlib/gcolor2 | |
"grey": (128, 128, 128), | |
"pink": (255, 192, 203), | |
"salmon": (250, 128, 114), | |
"orange": (255, 165, 0), | |
"gold": (255, 215, 0), | |
"tan": (210, 180, 140), | |
"brown": (165, 42, 42), | |
} | |
def __init__(self, red, green, blue): | |
"""Initialize BranchColor for a tree.""" | |
for color in (red, green, blue): | |
assert ( | |
isinstance(color, int) and 0 <= color <= 255 | |
), "Color values must be integers between 0 and 255." | |
self.red = red | |
self.green = green | |
self.blue = blue | |
def from_hex(cls, hexstr): | |
"""Construct a BranchColor object from a hexadecimal string. | |
The string format is the same style used in HTML and CSS, such as | |
'#FF8000' for an RGB value of (255, 128, 0). | |
""" | |
assert ( | |
isinstance(hexstr, str) and hexstr.startswith("#") and len(hexstr) == 7 | |
), "need a 24-bit hexadecimal string, e.g. #000000" | |
RGB = hexstr[1:3], hexstr[3:5], hexstr[5:] | |
return cls(*(int("0x" + cc, base=16) for cc in RGB)) | |
def from_name(cls, colorname): | |
"""Construct a BranchColor object by the color's name.""" | |
return cls(*cls.color_names[colorname]) | |
def to_hex(self): | |
"""Return a 24-bit hexadecimal RGB representation of this color. | |
The returned string is suitable for use in HTML/CSS, as a color | |
parameter in matplotlib, and perhaps other situations. | |
Examples | |
-------- | |
>>> bc = BranchColor(12, 200, 100) | |
>>> bc.to_hex() | |
'#0cc864' | |
""" | |
return f"#{self.red:02x}{self.green:02x}{self.blue:02x}" | |
def to_rgb(self): | |
"""Return a tuple of RGB values (0 to 255) representing this color. | |
Examples | |
-------- | |
>>> bc = BranchColor(255, 165, 0) | |
>>> bc.to_rgb() | |
(255, 165, 0) | |
""" | |
return (self.red, self.green, self.blue) | |
def __repr__(self): | |
"""Preserve the standard RGB order when representing this object.""" | |
return "%s(red=%d, green=%d, blue=%d)" % ( | |
self.__class__.__name__, | |
self.red, | |
self.green, | |
self.blue, | |
) | |
def __str__(self): | |
"""Show the color's RGB values.""" | |
return "(%d, %d, %d)" % (self.red, self.green, self.blue) | |