Spaces:
No application file
No application file
# Copyright 2003 by Bartek Wilczynski. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Parsing TRANSFAC files."""
from Bio import motifs | |
class Motif(motifs.Motif, dict):
    """Hold everything known about a single TRANSFAC motif.

    The class derives from both Bio.motifs.Motif and dict. Whatever the
    parser can express through the Bio.motifs.Motif base class is stored
    there as attributes (see that class for their description). The
    remaining fields are kept as dictionary items keyed by the two-letter
    field code used in the TRANSFAC file. The one exception is the
    literature references, which live in the .references attribute.

    Field codes commonly encountered in TRANSFAC files::

        AC:    Accession number
        AS:    Accession numbers, secondary
        BA:    Statistical basis
        BF:    Binding factors
        BS:    Factor binding sites underlying the matrix
               [sequence; SITE accession number; start position for matrix
               sequence; length of sequence used; number of gaps inserted;
               strand orientation.]
        CC:    Comments
        CO:    Copyright notice
        DE:    Short factor description
        DR:    External databases
               [database name: database accession number]
        DT:    Date created/updated
        HC:    Subfamilies
        HP:    Superfamilies
        ID:    Identifier
        NA:    Name of the binding factor
        OC:    Taxonomic classification
        OS:    Species/Taxon
        OV:    Older version
        PV:    Preferred version
        TY:    Type
        XX:    Empty line; these are not stored in the Record.

    The .references attribute is a list of dictionaries, one per
    reference, using these keys::

        RN:    Reference number
        RA:    Reference authors
        RL:    Reference data
        RT:    Reference title
        RX:    PubMed ID

    Consult the TRANSFAC documentation for further details.
    """

    # Field codes that may appear on several lines of one motif; their
    # values are collected in a list.
    multiple_value_keys = {"BF", "OV", "HP", "BS", "HC", "DT", "DR"}

    # Field codes that belong to a literature reference rather than to the
    # motif annotations.
    reference_keys = {"RX", "RA", "RT", "RL"}
class Record(list):
    """Store the information in a TRANSFAC matrix table.

    The record inherits from a list containing the individual motifs.

    Attributes:
     - version - The version number, corresponding to the 'VV' field
       in the TRANSFAC file; None until a version has been set.

    """

    def __init__(self):
        """Initialize an empty record without a version."""
        # Run the list initializer explicitly so the base class is set up
        # through the normal cooperative path.
        super().__init__()
        self.version = None

    def __str__(self):
        """Turn the TRANSFAC matrix into a string."""
        return write(self)
def _field_value(line, fields, strict):
    """Return the value column of a stripped line, or "" if it has none.

    ``fields`` is ``line.split(None, 1)``. In strict mode a line that has a
    value must contain two consecutive spaces, as stated in the error
    message; otherwise ValueError is raised.
    """
    if len(fields) != 2:
        return ""
    if strict and "  " not in line:
        raise ValueError(
            "A TRANSFAC motif line should have 2 "
            "spaces between key and value columns: "
            f'"{line}"'
        )
    return fields[1].strip()


def _read_matrix(handle, strict):
    """Consume the numbered matrix rows that follow a P0/PO line.

    Returns ``(counts, terminator)``. ``counts`` maps each of "A", "C", "G"
    and "T" to the list of parsed values. ``terminator`` is the
    ``(line, key, value)`` triple of the first line whose key is not an
    integer (it still has to be processed by the caller), or None if the
    handle was exhausted while reading matrix rows.
    """
    counts = {letter: [] for letter in "ACGT"}
    length = 0
    for line in handle:
        line = line.strip()
        if not line:
            # Blank lines carry no information; skip them like read() does.
            continue
        fields = line.split(None, 1)
        key = fields[0]
        value = _field_value(line, fields, strict)
        try:
            i = int(key)
        except ValueError:
            # Not a row number: the matrix has ended on this line.
            return counts, (line, key, value)
        if length == 0 and i == 0:
            if strict:
                raise ValueError(
                    'A TRANSFAC matrix should start with "01" as first row'
                    f' of the matrix, but this matrix uses "00": "{line}'
                )
        else:
            length += 1
        if i != length:
            raise ValueError(
                "The TRANSFAC matrix row number does not match the position"
                f' in the matrix: "{line}"'
            )
        if strict:
            if len(key) == 1:
                raise ValueError(
                    "A TRANSFAC matrix line should have a 2 digit"
                    f' key at the start of the line ("{i:02d}"),'
                    f' but this matrix uses "{i:d}": "{line:s}".'
                )
            if len(fields) != 2:
                raise ValueError(
                    "A TRANSFAC matrix line should have a key and a"
                    f' value: "{line}"'
                )
        values = value.split()[:4]
        if len(values) != 4:
            raise ValueError(
                "A TRANSFAC matrix line should have a value for each"
                f' nucleotide (A, C, G and T): "{line}"'
            )
        for letter, number in zip("ACGT", values):
            counts[letter].append(float(number))
    return counts, None


def read(handle, strict=True):
    """Parse a transfac format handle into a Record object.

    Arguments:
     - handle - iterable yielding the lines of the TRANSFAC text.
     - strict - if True (default), additionally require two-character
       keys, two spaces between the key and value columns, two-digit
       matrix row numbers starting at "01", and a value on every matrix
       row.

    Returns a Record. A Motif is appended for every entry terminated by a
    "//" line that contained a P0 (or PO) matrix; the "VV" value is stored
    in the .version attribute of the record.

    Raises ValueError if the input violates the format; this includes a
    reference field (RX, RA, RT, RL) that is not preceded by an RN line
    within the same entry.
    """
    record = Record()
    annotations = {}
    references = []
    reference = None  # dictionary opened by the most recent RN line
    counts = None
    for line in handle:
        line = line.strip()
        if not line:
            continue
        fields = line.split(None, 1)
        key = fields[0]
        if strict and len(key) != 2:
            raise ValueError(
                "The key value of a TRANSFAC motif line should have 2 characters:"
                f'"{line}"'
            )
        # A line without a value yields "", never the value of an earlier line.
        value = _field_value(line, fields, strict)
        if key == "VV":
            record.version = value
        elif key in ("P0", "PO"):  # Old TRANSFAC files use PO instead of P0
            if value.split()[:4] != ["A", "C", "G", "T"]:
                raise ValueError(
                    f'A TRANSFAC matrix "{key}" line should be '
                    f'followed by "A C G T": {line}'
                )
            counts, terminator = _read_matrix(handle, strict)
            if terminator is None:
                # Input ended inside the matrix: there is no further line
                # to dispatch, and no "//" to complete the motif.
                break
            # The line that ended the matrix is handled below like any
            # other line.
            line, key, value = terminator
        if line == "XX":
            continue
        if key == "RN":
            label = value.partition(";")[0]
            if not label.startswith("["):
                raise ValueError(
                    f'The index "{label}" in a TRANSFAC RN line should start'
                    f' with a "[": "{line}"'
                )
            if not label.endswith("]"):
                raise ValueError(
                    f'The index "{label}" in a TRANSFAC RN line should end'
                    f' with a "]": "{line}"'
                )
            index = int(label[1:-1])
            if len(references) != index - 1:
                raise ValueError(
                    f'The index "{index:d}" of the TRANSFAC RN line does not '
                    "match the current number of seen references "
                    f'"{len(references) + 1:d}": "{line:s}"'
                )
            reference = {key: value}
            references.append(reference)
        elif key == "//":
            if counts is not None:
                motif = Motif(alphabet="ACGT", counts=counts)
                motif.update(annotations)
                motif.references = references
                record.append(motif)
            # Start the next entry from a clean slate, so that it cannot
            # inherit the matrix or the open reference of this one.
            annotations = {}
            references = []
            reference = None
            counts = None
        elif key in Motif.reference_keys:
            if reference is None:
                raise ValueError(
                    f'A TRANSFAC "{key}" line should be preceded by an "RN"'
                    f' line: "{line}"'
                )
            reference[key] = value
        elif key in Motif.multiple_value_keys:
            annotations.setdefault(key, []).append(value)
        else:
            annotations[key] = value
    return record
def _matrix_lines(motif):
    """Return the "P0" header and the numbered count rows of a motif.

    Returns an empty list if the motif has length 0.
    """
    length = motif.length
    if length == 0:
        return []
    consensus = motif.degenerate_consensus
    letters = sorted(motif.alphabet)
    # Each letter is right-aligned over its numeric column.
    lines = ["P0" + "".join(f"{letter:>7}" for letter in letters)]
    # Two spaces always follow the row number, also when the first value is
    # wider than its column; narrower values keep the 7-character pitch.
    columns = ["%6.20g"] * len(letters)
    if columns:
        columns[0] = "%5.20g"
    row_format = "%02d  " + " ".join(columns) + "      %s"
    for i in range(length):
        row = (i + 1, *(motif.counts[letter][i] for letter in letters), consensus[i])
        lines.append(row_format % row)
    return lines


def _annotation_lines(motif, key):
    """Return the lines for one annotation key, or None if it is not set.

    Motifs without a dictionary interface (no .get method) have no
    annotations.
    """
    try:
        value = motif.get(key)
    except AttributeError:
        return None
    if value is None:
        return None
    if key in Motif.multiple_value_keys:
        return [f"{key}  {item}" for item in value]
    return [f"{key}  {value}"]


def _reference_lines(motif):
    """Return the RN/RX/RA/RT/RL lines of all references of a motif."""
    try:
        references = motif.references
    except AttributeError:
        return []
    lines = []
    for reference in references:
        for field in ("RN", "RX", "RA", "RT", "RL"):
            value = reference.get(field)
            if value is not None:
                lines.append(f"{field}  {value}")
    return lines


def write(motifs):
    """Write the representation of a motif in TRANSFAC format.

    Arguments:
     - motifs - iterable of motifs; if it has a .version attribute that is
       not None, a leading "VV" block is written. (The parameter name
       hides the Bio.motifs module inside this function.)

    Returns the text as a single string. The key and value columns are
    separated by two spaces.
    """
    blocks = []
    version = getattr(motifs, "version", None)
    if version is not None:
        blocks.append("VV  %s\nXX\n//\n" % version)
    sections = (
        ("AC", "AS"),  # Accession
        ("ID",),  # ID
        ("DT", "CO"),  # Date, copyright
        ("NA",),  # Name
        ("DE",),  # Short factor description
        ("TY",),  # Type
        ("OS", "OC"),  # Organism
        ("HP", "HC"),  # Superfamilies, subfamilies
        ("BF",),  # Binding factors
        ("P0",),  # Frequency matrix
        ("BA",),  # Statistical basis
        ("BS",),  # Factor binding sites
        ("CC",),  # Comments
        ("DR",),  # External databases
        ("OV", "PV"),  # Versions
    )
    for motif in motifs:
        lines = []
        for section in sections:
            blank = False  # becomes True once the section produced output
            for key in section:
                if key == "P0":
                    matrix = _matrix_lines(motif)
                    if matrix:
                        lines.extend(matrix)
                        blank = True
                    continue
                annotation = _annotation_lines(motif, key)
                if annotation is not None:
                    lines.extend(annotation)
                    blank = True
                if key == "PV":
                    # The references close the versions section.
                    cited = _reference_lines(motif)
                    if cited:
                        lines.extend(cited)
                        blank = True
            if blank:
                lines.append("XX")
        # Finished this motif; glue the lines together
        lines.append("//")
        blocks.append("\n".join(lines) + "\n")
    # Finished all motifs; glue the blocks together
    return "".join(blocks)