aakash0017's picture
Upload folder using huggingface_hub
b7731cd
# Copyright 2003 by Bartek Wilczynski. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Parsing TRANSFAC files."""
from Bio import motifs
class Motif(motifs.Motif, dict):
    """A single TRANSFAC motif together with its annotations.

    The class derives from both the Bio.motifs.Motif base class and the
    built-in dict. Whenever possible, the information found by the parser
    is stored in attributes of the base class; see Bio.motifs.Motif for a
    description of those attributes. Everything else is kept as (key, value)
    pairs in the dictionary part, keyed by the two-letter field code used in
    the TRANSFAC file. References are the exception: they are kept in the
    .references attribute.

    Fields commonly found in TRANSFAC files::

        AC:    Accession number
        AS:    Accession numbers, secondary
        BA:    Statistical basis
        BF:    Binding factors
        BS:    Factor binding sites underlying the matrix
               [sequence; SITE accession number; start position for matrix
               sequence; length of sequence used; number of gaps inserted;
               strand orientation.]
        CC:    Comments
        CO:    Copyright notice
        DE:    Short factor description
        DR:    External databases
               [database name: database accession number]
        DT:    Date created/updated
        HC:    Subfamilies
        HP:    Superfamilies
        ID:    Identifier
        NA:    Name of the binding factor
        OC:    Taxonomic classification
        OS:    Species/Taxon
        OV:    Older version
        PV:    Preferred version
        TY:    Type
        XX:    Empty line; these are not stored in the Record.

    The .references attribute is a list of dictionaries, one per reference,
    with the following keys::

        RN:    Reference number
        RA:    Reference authors
        RL:    Reference data
        RT:    Reference title
        RX:    PubMed ID

    For more information, see the TRANSFAC documentation.
    """

    # Field codes that may occur on several lines of one motif.
    multiple_value_keys = {"BF", "BS", "DR", "DT", "HC", "HP", "OV"}

    # Field codes that belong to a literature reference.
    reference_keys = {"RA", "RL", "RT", "RX"}
class Record(list):
    """A TRANSFAC matrix table, held as a list of motifs.

    The record is itself a list whose items are the individual motifs.

    Attributes:
     - version - The version number, corresponding to the 'VV' field
       in the TRANSFAC file; None until a version has been assigned.

    """

    def __init__(self):
        """Create an empty record with no version assigned."""
        super().__init__()
        self.version = None

    def __str__(self):
        """Return the record formatted as TRANSFAC text."""
        return write(self)
def _split_line(line, strict, two_char_key=False):
    """Split a stripped, non-empty TRANSFAC line into a (key, value) tuple.

    The value is the empty string for key-only lines such as "XX" or "//",
    so that callers never see a value left over from an earlier line.

    In strict mode a ValueError is raised if two_char_key is set and the key
    is not exactly two characters long, or if the line has a value but does
    not contain two consecutive spaces.
    """
    fields = line.split(None, 1)
    key = fields[0]
    value = fields[1].strip() if len(fields) == 2 else ""
    if strict:
        if two_char_key and len(key) != 2:
            raise ValueError(
                "The key value of a TRANSFAC motif line should have 2 characters: "
                f'"{line}"'
            )
        # The check must look for TWO spaces, as the message states; looking
        # for a single space would accept every line that has a value.
        if value and "  " not in line:
            raise ValueError(
                "A TRANSFAC motif line should have 2 spaces between key and "
                f'value columns: "{line}"'
            )
    return key, value


def read(handle, strict=True):
    """Parse a transfac format handle into a Record object.

    Arguments:
     - handle - iterable yielding the lines of a TRANSFAC file, for example
       an open text file.
     - strict - if True (default), additionally require two-character keys,
       two spaces between the key and value columns, two-digit matrix row
       numbers, and a matrix that starts at row "01".

    Returns a Record. One Motif is appended for every "//" terminated entry
    that contains a "P0" (or "PO") count matrix; the 'VV' field is stored in
    the .version attribute of the record.

    Raises ValueError if the input violates the format, e.g. a matrix header
    that is not "A C G T", matrix rows that are misnumbered or incomplete, a
    malformed or out-of-sequence RN line, or a reference field (RX, RA, RT,
    RL) that is not preceded by an RN line.
    """
    annotations = {}
    references = []
    reference = None  # reference opened by the most recent RN line, if any
    counts = None
    record = Record()
    for line in handle:
        line = line.strip()
        if not line:
            continue
        key, value = _split_line(line, strict, two_char_key=True)
        if key == "VV":
            record.version = value
        elif key in ("P0", "PO"):  # Old TRANSFAC files use PO instead of P0
            if value.split()[:4] != ["A", "C", "G", "T"]:
                raise ValueError(
                    f'A TRANSFAC matrix "{key}" line should be '
                    f'followed by "A C G T": {line}'
                )
            counts = {letter: [] for letter in "ACGT"}
            length = 0
            # Consume the matrix rows. The first line whose key is not an
            # integer ends the matrix and is handled by the code below.
            for line in handle:
                line = line.strip()
                if not line:
                    # Skip blank lines, as the outer loop does.
                    continue
                key, value = _split_line(line, strict)
                try:
                    i = int(key)
                except ValueError:
                    break
                if length == 0 and i == 0:
                    if strict:
                        raise ValueError(
                            'A TRANSFAC matrix should start with "01" as first row'
                            f' of the matrix, but this matrix uses "00": "{line}"'
                        )
                else:
                    length += 1
                if i != length:
                    raise ValueError(
                        "The TRANSFAC matrix row number does not match the position"
                        f' in the matrix: "{line}"'
                    )
                if strict and len(key) == 1:
                    raise ValueError(
                        "A TRANSFAC matrix line should have a 2 digit"
                        f' key at the start of the line ("{i:02d}"),'
                        f' but this matrix uses "{i:d}": "{line:s}".'
                    )
                if not value:
                    raise ValueError(
                        "A TRANSFAC matrix line should have a key and a"
                        f' value: "{line}"'
                    )
                values = value.split()[:4]
                if len(values) != 4:
                    raise ValueError(
                        "A TRANSFAC matrix line should have a value for each"
                        f' nucleotide (A, C, G and T): "{line}"'
                    )
                for letter, number in zip("ACGT", values):
                    counts[letter].append(float(number))
            else:
                # The handle was exhausted inside the matrix; there is no
                # further line to interpret.
                break
        # Deliberately a separate chain: the line that terminated the matrix
        # above (and a VV line) is also processed here.
        if line == "XX":
            pass
        elif key == "RN":
            index = value.partition(";")[0]
            # startswith/endswith also cope with an empty RN value.
            if not index.startswith("["):
                raise ValueError(
                    f'The index "{index}" in a TRANSFAC RN line should start'
                    f' with a "[": "{line}"'
                )
            if not index.endswith("]"):
                raise ValueError(
                    f'The index "{index}" in a TRANSFAC RN line should end'
                    f' with a "]": "{line}"'
                )
            index = int(index[1:-1])
            if len(references) != index - 1:
                raise ValueError(
                    f'The index "{index:d}" of the TRANSFAC RN line does not '
                    "match the current number of seen references "
                    f'"{len(references) + 1:d}": "{line:s}"'
                )
            reference = {key: value}
            references.append(reference)
        elif key == "//":
            if counts is not None:
                motif = Motif(alphabet="ACGT", counts=counts)
                motif.update(annotations)
                motif.references = references
                record.append(motif)
            # Start the next entry from a clean slate, so that it cannot
            # inherit the matrix or the open reference of this entry.
            annotations = {}
            references = []
            reference = None
            counts = None
        elif key in Motif.reference_keys:
            if reference is None:
                raise ValueError(
                    f'The TRANSFAC "{key}" line should be preceded by an RN'
                    f' line: "{line}"'
                )
            reference[key] = value
        elif key in Motif.multiple_value_keys:
            annotations.setdefault(key, []).append(value)
        else:
            annotations[key] = value
    return record
def write(motifs):
    """Write the representation of a motif in TRANSFAC format.

    Arguments:
     - motifs - an iterable of motifs, typically a Record. If it has a
       .version attribute that is not None, a "VV" block is written first.

    Returns a string. For each motif, the known annotation fields, the count
    matrix and the references are written section by section; every
    non-empty section is followed by an "XX" line and every motif is
    terminated by a "//" line.

    Key and value columns are separated by two spaces, which is what the
    parser in this module requires in strict mode.
    """
    blocks = []
    version = getattr(motifs, "version", None)
    if version is not None:
        blocks.append(f"VV  {version}\nXX\n//\n")
    multiple_value_keys = Motif.multiple_value_keys
    sections = (
        ("AC", "AS"),  # Accession
        ("ID",),  # ID
        ("DT", "CO"),  # Date, copyright
        ("NA",),  # Name
        ("DE",),  # Short factor description
        ("TY",),  # Type
        ("OS", "OC"),  # Organism
        ("HP", "HC"),  # Superfamilies, subfamilies
        ("BF",),  # Binding factors
        ("P0",),  # Frequency matrix
        ("BA",),  # Statistical basis
        ("BS",),  # Factor binding sites
        ("CC",),  # Comments
        ("DR",),  # External databases
        ("OV", "PV"),  # Versions
    )
    reference_keys = ("RN", "RX", "RA", "RT", "RL")
    for motif in motifs:
        lines = []
        for section in sections:
            blank = False  # becomes True once this section wrote a line
            for key in section:
                if key == "P0":
                    # Frequency matrix
                    length = motif.length
                    if length == 0:
                        continue
                    sequence = motif.degenerate_consensus
                    letters = sorted(motif.alphabet)
                    # Six spaces line the header up with the 6-wide numeric
                    # columns of the rows below.
                    lines.append("      ".join(["P0"] + letters))
                    row_format = (
                        " ".join(["%02d"] + ["%6.20g"] * len(letters)) + "      %s"
                    )
                    for i in range(length):
                        row = [i + 1]
                        row.extend(motif.counts[letter][i] for letter in letters)
                        row.append(sequence[i])
                        lines.append(row_format % tuple(row))
                    blank = True
                    continue
                try:
                    value = motif.get(key)
                except AttributeError:
                    # The motif does not provide a dictionary interface.
                    value = None
                if value is not None:
                    if key in multiple_value_keys:
                        if isinstance(value, str):
                            # A lone string is one value, not a sequence of
                            # single-character values.
                            value = [value]
                        lines.extend(f"{key}  {item}" for item in value)
                    else:
                        lines.append(f"{key}  {value}")
                    blank = True
                if key == "PV":
                    # References go into the same section as the versions.
                    for reference in getattr(motif, "references", None) or ():
                        for reference_key in reference_keys:
                            value = reference.get(reference_key)
                            if value is None:
                                continue
                            lines.append(f"{reference_key}  {value}")
                            blank = True
            if blank:
                lines.append("XX")
        # Finished this motif; glue the lines together
        lines.append("//")
        blocks.append("\n".join(lines) + "\n")
    # Finished all motifs; glue the blocks together
    return "".join(blocks)