# Copyright 2003 by Bartek Wilczynski.  All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Parsing TRANSFAC files."""

from Bio import motifs

# Separator between the key column and the value column of a TRANSFAC line.
# Built by repetition so the width stays explicit: exactly two spaces.
_KEY_VALUE_SEPARATOR = " " * 2

# Gap used in the "P0" header row and before the consensus letter of a matrix
# row, so the header letters line up with the 6-character-wide count columns.
_COLUMN_GAP = " " * 6

# Order in which write() emits the annotation keys; each inner tuple is one
# section, and every non-empty section is followed by an "XX" line.
_SECTIONS = (
    ("AC", "AS"),  # Accession
    ("ID",),  # ID
    ("DT", "CO"),  # Date, copyright
    ("NA",),  # Name
    ("DE",),  # Short factor description
    ("TY",),  # Type
    ("OS", "OC"),  # Organism
    ("HP", "HC"),  # Superfamilies, subfamilies
    ("BF",),  # Binding factors
    ("P0",),  # Frequency matrix
    ("BA",),  # Statistical basis
    ("BS",),  # Factor binding sites
    ("CC",),  # Comments
    ("DR",),  # External databases
    ("OV", "PV"),  # Versions
)

# Order in which write() emits the fields of one reference.
_REFERENCE_KEY_ORDER = ("RN", "RX", "RA", "RT", "RL")


class Motif(motifs.Motif, dict):
    """Store the information for one TRANSFAC motif.

    This class inherits from the Bio.motifs.Motif base class, as well
    as from a Python dictionary. All motif information found by the parser
    is stored as attributes of the base class when possible; see the
    Bio.motifs.Motif base class for a description of these attributes. All
    other information associated with the motif is stored as (key, value)
    pairs in the dictionary, where the key is the two-letter field as found
    in the TRANSFAC file. References are an exception: These are stored in
    the .references attribute.

    These fields are commonly found in TRANSFAC files::

        AC:    Accession number
        AS:    Accession numbers, secondary
        BA:    Statistical basis
        BF:    Binding factors
        BS:    Factor binding sites underlying the matrix
               [sequence; SITE accession number; start position for matrix
               sequence; length of sequence used; number of gaps inserted;
               strand orientation.]
        CC:    Comments
        CO:    Copyright notice
        DE:    Short factor description
        DR:    External databases
               [database name: database accession number]
        DT:    Date created/updated
        HC:    Subfamilies
        HP:    Superfamilies
        ID:    Identifier
        NA:    Name of the binding factor
        OC:    Taxonomic classification
        OS:    Species/Taxon
        OV:    Older version
        PV:    Preferred version
        TY:    Type
        XX:    Empty line; these are not stored in the Record.

    References are stored in an .references attribute, which is a list of
    dictionaries with the following keys::

        RN:    Reference number
        RA:    Reference authors
        RL:    Reference data
        RT:    Reference title
        RX:    PubMed ID

    For more information, see the TRANSFAC documentation.
    """

    # These keys can occur multiple times for one motif
    multiple_value_keys = {"BF", "OV", "HP", "BS", "HC", "DT", "DR"}

    # These keys occur for references
    reference_keys = {"RX", "RA", "RT", "RL"}


class Record(list):
    """Store the information in a TRANSFAC matrix table.

    The record inherits from a list containing the individual motifs.

    Attributes:
     - version - The version number, corresponding to the 'VV' field
       in the TRANSFAC file;

    """

    def __init__(self):
        """Initialize the class."""
        super().__init__()
        self.version = None

    def __str__(self):
        """Turn the TRANSFAC matrix into a string."""
        return write(self)


def _split_line(line):
    """Split a stripped, non-empty line into a ``(key, value)`` pair.

    ``value`` is None when the line consists of a key only (such as "XX"
    or "//"); otherwise it is the remainder of the line, stripped.
    """
    fields = line.split(None, 1)
    if len(fields) == 1:
        return fields[0], None
    return fields[0], fields[1].strip()


def _check_separator(line):
    """Raise ValueError unless the line contains the two-space separator."""
    if _KEY_VALUE_SEPARATOR not in line:
        raise ValueError(
            "A TRANSFAC motif line should have 2 "
            "spaces between key and value columns: "
            f'"{line}"'
        )


def _read_matrix(handle, strict):
    """Read the matrix rows that follow a "P0"/"PO" header line.

    Rows are consumed from ``handle`` until a line whose key is not an
    integer is found. Blank lines are skipped.

    Returns a ``(counts, terminator)`` tuple. ``counts`` maps each of the
    letters "A", "C", "G", "T" to a list of floats, one per matrix row.
    ``terminator`` is the ``(line, key, value)`` triple of the first
    non-matrix line, which the caller still has to process, or None if the
    handle was exhausted first.

    Raises ValueError for rows that are out of sequence, lack a value for
    each nucleotide, or (in strict mode) violate the column layout.
    """
    counts = {letter: [] for letter in "ACGT"}
    length = 0
    for line in handle:
        line = line.strip()
        if not line:
            continue
        key, value = _split_line(line)
        if strict and value is not None:
            _check_separator(line)
        try:
            i = int(key)
        except ValueError:
            # Not a row number, so the matrix has ended.
            return counts, (line, key, value)
        if length == 0 and i == 0:
            # A matrix numbered from "00" is only tolerated when not strict;
            # "length" is then left unchanged for this first row.
            if strict:
                raise ValueError(
                    'A TRANSFAC matrix should start with "01" as first row'
                    f' of the matrix, but this matrix uses "00": "{line}"'
                )
        else:
            length += 1
        if i != length:
            raise ValueError(
                "The TRANSFAC matrix row number does not match the position"
                f' in the matrix: "{line}"'
            )
        if strict and len(key) == 1:
            raise ValueError(
                "A TRANSFAC matrix line should have a 2 digit"
                f' key at the start of the line ("{i:02d}"),'
                f' but this matrix uses "{i:d}": "{line:s}".'
            )
        if value is None:
            raise ValueError(
                "A TRANSFAC matrix line should have a key and a"
                f' value: "{line}"'
            )
        # Anything after the four counts (e.g. the consensus letter) is ignored.
        values = value.split()[:4]
        if len(values) != 4:
            raise ValueError(
                "A TRANSFAC matrix line should have a value for each"
                f' nucleotide (A, C, G and T): "{line}"'
            )
        for letter, count in zip("ACGT", values):
            counts[letter].append(float(count))
    return counts, None


def read(handle, strict=True):
    """Parse a transfac format handle into a Record object.

    Arguments:
     - handle - iterable yielding the lines of a TRANSFAC file;
     - strict - if True (default), enforce the column layout: two-character
       keys, two spaces between key and value, and two-digit matrix row
       numbers starting at "01".

    Returns a Record containing one Motif for each "//"-terminated entry
    that has a matrix. Entries without a matrix are skipped.

    Raises ValueError if a line violates the format.
    """
    annotations = {}
    references = []
    reference = None  # the reference dictionary opened by the latest RN line
    counts = None
    record = Record()
    for line in handle:
        line = line.strip()
        if not line:
            continue
        key, value = _split_line(line)
        if strict:
            if len(key) != 2:
                raise ValueError(
                    "The key value of a TRANSFAC motif line should have 2 characters:"
                    f'"{line}"'
                )
            if value is not None:
                _check_separator(line)
        if value is None:
            # Key-only line; never reuse the value of an earlier line.
            value = ""
        if key == "VV":
            record.version = value
        elif key in ("P0", "PO"):  # Old TRANSFAC files use PO instead of P0
            if value.split()[:4] != ["A", "C", "G", "T"]:
                raise ValueError(
                    f'A TRANSFAC matrix "{key}" line should be '
                    f'followed by "A C G T": {line}'
                )
            counts, terminator = _read_matrix(handle, strict)
            if terminator is None:
                # The handle ended inside the matrix; nothing left to parse.
                break
            # The line that ended the matrix is handled by the chain below.
            line, key, value = terminator
            if value is None:
                value = ""
        if line == "XX":
            pass
        elif key == "RN":
            index, _, _ = value.partition(";")
            if not index.startswith("["):
                raise ValueError(
                    f'The index "{index}" in a TRANSFAC RN line should start'
                    f' with a "[": "{line}"'
                )
            if not index.endswith("]"):
                raise ValueError(
                    f'The index "{index}" in a TRANSFAC RN line should end'
                    f' with a "]": "{line}"'
                )
            index = int(index[1:-1])
            if len(references) != index - 1:
                raise ValueError(
                    f'The index "{index:d}" of the TRANSFAC RN line does not '
                    "match the current number of seen references "
                    f'"{len(references) + 1:d}": "{line:s}"'
                )
            reference = {key: value}
            references.append(reference)
        elif key == "//":
            if counts is not None:
                motif = Motif(alphabet="ACGT", counts=counts)
                motif.update(annotations)
                motif.references = references
                record.append(motif)
            # Start the next entry from a clean slate, so that neither the
            # matrix nor the open reference leaks into it.
            annotations = {}
            references = []
            reference = None
            counts = None
        elif key in Motif.reference_keys:
            if reference is None:
                raise ValueError(
                    "A TRANSFAC reference line should be preceded by an RN"
                    f' line: "{line}"'
                )
            reference[key] = value
        elif key in Motif.multiple_value_keys:
            annotations.setdefault(key, []).append(value)
        else:
            annotations[key] = value
    return record


def write(motifs):
    """Write the representation of a motif in TRANSFAC format.

    Arguments:
     - motifs - iterable of motifs, typically a Record. If it has a
       ``version`` attribute that is not None, a leading "VV" block is
       written.

    Returns the TRANSFAC text as a single string. Key and value are
    separated by two spaces, which is the layout read() requires in
    strict mode.
    """
    blocks = []
    version = getattr(motifs, "version", None)
    if version is not None:
        blocks.append(f"VV{_KEY_VALUE_SEPARATOR}{version}\nXX\n//\n")
    multiple_value_keys = Motif.multiple_value_keys
    for motif in motifs:
        lines = []
        for section in _SECTIONS:
            blank = False  # becomes True once this section wrote a line
            for key in section:
                if key == "P0":
                    # Frequency matrix
                    length = motif.length
                    if length == 0:
                        continue
                    sequence = motif.degenerate_consensus
                    letters = sorted(motif.alphabet)
                    lines.append(_COLUMN_GAP.join(["P0"] + letters))
                    # Row number, one count per letter, consensus letter.
                    row_format = (
                        " ".join(["%02d"] + ["%6.20g"] * len(letters))
                        + _COLUMN_GAP
                        + "%s"
                    )
                    for i in range(length):
                        row = (
                            [i + 1]
                            + [motif.counts[letter][i] for letter in letters]
                            + [sequence[i]]
                        )
                        lines.append(row_format % tuple(row))
                    blank = True
                else:
                    try:
                        value = motif.get(key)
                    except AttributeError:
                        # A motif that is not a dictionary has no annotations.
                        value = None
                    if value is not None:
                        if key in multiple_value_keys:
                            lines.extend(
                                f"{key}{_KEY_VALUE_SEPARATOR}{v}" for v in value
                            )
                        else:
                            lines.append(f"{key}{_KEY_VALUE_SEPARATOR}{value}")
                        blank = True
                if key == "PV":
                    # References go at the end of the versions section.
                    for reference in getattr(motif, "references", None) or ():
                        for reference_key in _REFERENCE_KEY_ORDER:
                            value = reference.get(reference_key)
                            if value is None:
                                continue
                            lines.append(
                                f"{reference_key}{_KEY_VALUE_SEPARATOR}{value}"
                            )
                            blank = True
            if blank:
                lines.append("XX")
        # Finished this motif; glue the lines together
        lines.append("//")
        blocks.append("\n".join(lines) + "\n")
    # Finished all motifs; glue the blocks together
    return "".join(blocks)