Spaces:
No application file
No application file
# Copyright (C) 2009 by Eric Talevich ([email protected]) | |
# Based on Bio.Nexus, copyright 2005-2008 by Frank Kauff & Cymon J. Cox. | |
# All rights reserved. | |
# | |
# This file is part of the Biopython distribution and governed by your | |
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
# Please see the LICENSE file that should have been included as part of this | |
# package. | |
"""I/O function wrappers for the Newick file format. | |
See: http://evolution.genetics.washington.edu/phylip/newick_doc.html | |
""" | |
import re | |
from io import StringIO | |
from Bio.Phylo import Newick | |
class NewickError(Exception):
    """Signal that building a Newick object from its input cannot continue."""
# Token grammar as (regex source, token name) pairs.  The order is
# significant: the combined pattern built below is an alternation, and the
# regex engine tries its branches from left to right.
tokens = [
    (r"\(", "open parens"),
    (r"\)", "close parens"),
    (r"[^\s\(\)\[\]\'\:\;\,]+", "unquoted node label"),
    (r"\:\ ?[+-]?[0-9]*\.?[0-9]+([eE][+-]?[0-9]+)?", "edge length"),
    (r"\,", "comma"),
    (r"\[(\\.|[^\]])*\]", "comment"),
    (r"\'(\\.|[^\'])*\'", "quoted node label"),
    (r"\;", "semicolon"),
    (r"\n", "newline"),
]

# One compiled pattern matching any single token: every regex source joined
# with "|" and wrapped in a capturing group.
tokenizer = re.compile("(%s)" % "|".join(pattern for pattern, _ in tokens))

# Token name -> compiled pattern for that token alone.
token_dict = {label: re.compile(pattern) for pattern, label in tokens}
# --------------------------------------------------------- | |
# Public API | |
def parse(handle, **kwargs):
    """Iterate over the trees in a Newick file handle.

    The handle is wrapped in a ``Parser`` and every keyword argument is
    forwarded unchanged to ``Parser.parse``.

    :returns: generator of Bio.Phylo.Newick.Tree objects.
    """
    parser = Parser(handle)
    return parser.parse(**kwargs)
def write(trees, handle, plain=False, **kwargs):
    """Write trees in Newick format to the given file handle.

    The trees are wrapped in a ``Writer``; ``plain`` and any further keyword
    arguments are forwarded to ``Writer.write``.

    :returns: number of trees written.
    """
    writer = Writer(trees)
    return writer.write(handle, plain=plain, **kwargs)
# --------------------------------------------------------- | |
# Input | |
def _parse_confidence(text):
    """Convert a support-value string to a number (PRIVATE).

    :param text: string that may hold a numeric support value.
    :returns: an ``int`` when ``text`` consists only of decimal digits,
        otherwise a ``float`` when ``float(text)`` succeeds, otherwise
        ``None``.
    """
    # str.isdecimal() rather than str.isdigit(): isdigit() is also true for
    # characters such as superscript digits, which int() rejects with a
    # ValueError that would escape from here.
    if text.isdecimal():
        return int(text)
        # NB: Could make this more consistent by treating as a percentage
        # return int(text) / 100.
    try:
        return float(text)
        # NB: This should be in [0.0, 1.0], but who knows what people will do
        # assert 0 <= current_clade.confidence <= 1
    except ValueError:
        return None
def _format_comment(text):
    """Wrap text in square brackets, backslash-escaping any brackets inside it."""
    escaped = text.replace("[", "\\[")
    escaped = escaped.replace("]", "\\]")
    return f"[{escaped}]"
def _get_comment(clade):
    """Return the clade's comment as a bracketed string, or "" if it has none.

    An object without a ``comment`` attribute, or with a falsy one, yields
    the empty string; otherwise the comment is converted with ``str()`` and
    passed through ``_format_comment``.
    """
    comment = getattr(clade, "comment", None)
    if not comment:
        return ""
    return _format_comment(str(comment))
class Parser:
    """Parse a Newick tree given a file handle.

    Based on the parser in ``Bio.Nexus.Trees``.
    """

    def __init__(self, handle):
        """Initialize file handle for the Newick Tree.

        :param handle: readable file-like object opened in text mode.
        :raises ValueError: if a zero-length read from ``handle`` does not
            return the empty ``str`` (e.g. a binary handle returns ``b""``).
        """
        # A zero-length read consumes no data but reveals the handle's mode.
        if handle.read(0) != "":
            raise ValueError("Newick files must be opened in text mode") from None
        self.handle = handle

    @classmethod
    def from_string(cls, treetext):
        """Instantiate the Newick Tree class from the given string.

        This is an alternate constructor: the first parameter is the class,
        so it has to be a classmethod for ``Parser.from_string(text)`` to
        work.

        :param treetext: Newick text; it is wrapped in a ``StringIO`` handle.
        :returns: a new instance of ``cls`` reading from that handle.
        """
        return cls(StringIO(treetext))

    def parse(
        self, values_are_confidence=False, comments_are_confidence=False, rooted=False
    ):
        """Parse the text stream this object was initialized with.

        This is a generator; the options are stored on the instance when
        iteration starts.

        :param values_are_confidence: store ``:value`` tokens as clade
            confidence instead of branch length.
        :param comments_are_confidence: additionally try to read each
            ``[comment]`` as a numeric confidence value.
        :param rooted: passed as ``rooted`` to every ``Newick.Tree`` built.
        :returns: yields one tree per ``;``-terminated chunk of input, plus
            one for any trailing text without a terminal ``;``.
        """
        self.values_are_confidence = values_are_confidence
        self.comments_are_confidence = comments_are_confidence
        self.rooted = rooted
        buf = ""
        for line in self.handle:
            # Trailing whitespace (including the newline) is dropped, so a
            # tree spread over several lines is joined without separators.
            buf += line.rstrip()
            if buf.endswith(";"):
                yield self._parse_tree(buf)
                buf = ""
        if buf:
            # Last tree is missing a terminal ';' character -- that's OK
            yield self._parse_tree(buf)

    def _parse_tree(self, text):
        """Parse the text representation into a Tree object (PRIVATE).

        :param text: Newick text of a single tree.
        :returns: ``Newick.Tree`` whose root is the outermost clade parsed.
        :raises NewickError: on a ``)`` with no enclosing clade, on unequal
            counts of ``(`` and ``)``, or on any token after the ``;``.
        """
        tokens = re.finditer(tokenizer, text.strip())

        new_clade = self.new_clade
        root_clade = new_clade()

        current_clade = root_clade
        lp_count = 0
        rp_count = 0
        for match in tokens:
            token = match.group()

            if token.startswith("'"):
                # quoted label; add characters to clade name
                current_clade.name = token[1:-1]

            elif token.startswith("["):
                # comment
                current_clade.comment = token[1:-1]
                if self.comments_are_confidence:
                    # Try to use this comment as a numeric support value
                    current_clade.confidence = _parse_confidence(current_clade.comment)

            elif token == "(":
                # start a new clade, which is a child of the current clade
                current_clade = new_clade(current_clade)
                lp_count += 1

            elif token == ",":
                # if the current clade is the root, then the external parentheses
                # are missing and a new root should be created
                if current_clade is root_clade:
                    root_clade = new_clade()
                    current_clade.parent = root_clade
                # start a new child clade at the same level as the current clade
                parent = self.process_clade(current_clade)
                current_clade = new_clade(parent)

            elif token == ")":
                # done adding children for this parent clade
                parent = self.process_clade(current_clade)
                if parent is None:
                    raise NewickError("Parenthesis mismatch.")
                current_clade = parent
                rp_count += 1

            elif token == ";":
                break

            elif token.startswith(":"):
                # branch length or confidence
                value = float(token[1:])
                if self.values_are_confidence:
                    current_clade.confidence = value
                else:
                    current_clade.branch_length = value

            elif token == "\n":
                pass

            else:
                # unquoted node label
                current_clade.name = token

        if lp_count != rp_count:
            raise NewickError(
                f"Mismatch, {lp_count} open vs {rp_count} close parentheses."
            )

        # if ; token broke out of for loop, there should be no remaining tokens
        next_token = next(tokens, None)
        if next_token is not None:
            raise NewickError(
                f"Text after semicolon in Newick tree: {next_token.group()}"
            )

        # Finish the last open clade (attaching it to its parent, if it has
        # one), then the root itself.
        self.process_clade(current_clade)
        self.process_clade(root_clade)
        return Newick.Tree(root=root_clade, rooted=self.rooted)

    def new_clade(self, parent=None):
        """Return new Newick.Clade, optionally with temporary reference to parent.

        The ``parent`` attribute is only set when a parent is given; it is
        removed again by ``process_clade``.
        """
        clade = Newick.Clade()
        # Explicit None test so the result never depends on how a clade
        # object evaluates in a boolean context.
        if parent is not None:
            clade.parent = parent
        return clade

    def process_clade(self, clade):
        """Remove node's parent and return it. Final processing of parsed clade.

        When neither confidence option is active, a clade that has a name,
        has child clades and has no confidence yet gets its name
        reinterpreted as a confidence value; if that succeeds the name is
        cleared.

        :returns: the parent the clade was appended to, or ``None`` when the
            clade carries no temporary ``parent`` attribute.
        """
        if (
            (clade.name)
            and not (self.values_are_confidence or self.comments_are_confidence)
            and (clade.confidence is None)
            and (clade.clades)
        ):
            clade.confidence = _parse_confidence(clade.name)
            if clade.confidence is not None:
                clade.name = None

        try:
            parent = clade.parent
        except AttributeError:
            # No temporary parent reference: nothing to attach to.
            return None
        parent.clades.append(clade)
        del clade.parent
        return parent
# --------------------------------------------------------- | |
# Output | |
class Writer:
    """Based on the writer in Bio.Nexus.Trees (str, to_string)."""

    def __init__(self, trees):
        """Initialize parameter for Tree Writer object.

        :param trees: iterable of tree objects to serialize.
        """
        self.trees = trees

    def write(self, handle, **kwargs):
        """Write this instance's trees to a file handle.

        Each tree string is followed by a newline. Keyword arguments are
        forwarded to ``to_strings``.

        :returns: number of trees written.
        """
        count = 0
        for treestr in self.to_strings(**kwargs):
            handle.write(treestr + "\n")
            count += 1
        return count

    def to_strings(
        self,
        confidence_as_branch_length=False,
        branch_length_only=False,
        plain=False,
        plain_newick=True,
        ladderize=None,
        max_confidence=1.0,
        format_confidence="%1.2f",
        format_branch_length="%1.5f",
    ):
        """Return an iterable of PAUP-compatible tree lines.

        :param confidence_as_branch_length: write confidence values in the
            branch-length position and ignore the real branch lengths.
        :param branch_length_only: write branch lengths and ignore
            confidence values.
        :param plain: write neither value, only labels and comments; it is
            overridden by either of the two options above.
        :param plain_newick: when false, prefix each tree with
            ``tree <name> =`` and optional ``[&W...]`` / ``[&R]`` tags.
        :param ladderize: ``"left"``/``"LEFT"`` or ``"right"``/``"RIGHT"``
            calls ``tree.ladderize`` on each tree before it is written; any
            other value leaves the tree as it is.
        :param max_confidence: value written for terminal clades when
            ``confidence_as_branch_length`` is set.
        :param format_confidence: %-format string for confidence values.
        :param format_branch_length: %-format string for branch lengths.
        """
        # If there's a conflict in the arguments, we override plain=True
        if confidence_as_branch_length or branch_length_only:
            plain = False
        make_info_string = self._info_factory(
            plain,
            confidence_as_branch_length,
            branch_length_only,
            max_confidence,
            format_confidence,
            format_branch_length,
        )

        def newickize(clade):
            """Convert a node tree to a Newick tree string, recursively."""
            label = clade.name or ""
            if label:
                # Quote the label unless all of it is a valid unquoted label.
                if not token_dict["unquoted node label"].fullmatch(label):
                    label = "'%s'" % label.replace("\\", "\\\\").replace("'", "\\'")

            if clade.is_terminal():  # terminal
                return label + make_info_string(clade, terminal=True)
            subtrees = (newickize(sub) for sub in clade)
            return f"({','.join(subtrees)}){label + make_info_string(clade)}"

        # Convert each tree to a string
        for tree in self.trees:
            if ladderize in ("left", "LEFT", "right", "RIGHT"):
                # Nexus compatibility shim, kind of
                tree.ladderize(reverse=(ladderize in ("right", "RIGHT")))
            rawtree = newickize(tree.root) + ";"
            if plain_newick:
                yield rawtree
                continue
            # Nexus-style (?) notation before the raw Newick tree
            treeline = ["tree", (tree.name or "a_tree"), "="]
            if tree.weight != 1:
                treeline.append(f"[&W{round(float(tree.weight), 3)}]")
            if tree.rooted:
                treeline.append("[&R]")
            treeline.append(rawtree)
            yield " ".join(treeline)

    def _info_factory(
        self,
        plain,
        confidence_as_branch_length,
        branch_length_only,
        max_confidence,
        format_confidence,
        format_branch_length,
    ):
        """Return a function that creates a nicely formatted node tag (PRIVATE).

        The returned function takes ``(clade, terminal=False)`` and returns
        the text that follows the clade's label: an optional confidence
        and/or ``:value`` part, then the clade's bracketed comment, if any.
        The first true flag among ``plain``, ``confidence_as_branch_length``
        and ``branch_length_only`` selects the mode; if none is set, both
        confidence and branch length are written.
        """
        if plain:
            # Plain tree only. That's easy.
            def make_info_string(clade, terminal=False):
                return _get_comment(clade)

        elif confidence_as_branch_length:
            # Support as branchlengths (eg. PAUP), ignore actual branchlengths
            def make_info_string(clade, terminal=False):
                if terminal:
                    # terminal branches have 100% support
                    return (":" + format_confidence % max_confidence) + _get_comment(
                        clade
                    )
                confidence = getattr(clade, "confidence", None)
                if confidence is None:
                    # No support value on this clade (typically the root):
                    # write no value instead of failing on "%f" % None.
                    return _get_comment(clade)
                return (":" + format_confidence % confidence) + _get_comment(clade)

        elif branch_length_only:
            # write only branchlengths, ignore support
            def make_info_string(clade, terminal=False):
                # A missing branch length is written as 0.0, the same
                # fallback the default mode below uses.
                return (
                    ":" + format_branch_length % (clade.branch_length or 0.0)
                ) + _get_comment(clade)

        else:
            # write support and branchlengths (e.g. .con tree of mrbayes)
            def make_info_string(clade, terminal=False):
                if terminal or getattr(clade, "confidence", None) is None:
                    return (":" + format_branch_length) % (
                        clade.branch_length or 0.0
                    ) + _get_comment(clade)
                return (format_confidence + ":" + format_branch_length) % (
                    clade.confidence,
                    clade.branch_length or 0.0,
                ) + _get_comment(clade)

        return make_info_string