Spaces:
No application file
No application file
# Copyright 2008-2015 by Peter Cock.  All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Bio.SeqIO support for the "ig" (IntelliGenetics or MASE) file format.

This module is for reading and writing IntelliGenetics format files as
SeqRecord objects.  This file format appears to be the same as the MASE
multiple sequence alignment format.

You are expected to use this module via the Bio.SeqIO functions.
"""
from Bio.Seq import Seq | |
from Bio.SeqRecord import SeqRecord | |
from .Interfaces import SequenceIterator | |
class IgIterator(SequenceIterator):
    """Parser for IntelliGenetics files."""

    def __init__(self, source):
        """Iterate over IntelliGenetics records (as SeqRecord objects).

        source - file-like object opened in text mode, or a path to a file

        The optional free format file header lines (which start with two
        semi-colons) are ignored.

        The free format commentary lines at the start of each record (which
        start with a semi-colon) are recorded as a single string with embedded
        new line characters in the SeqRecord's annotations dictionary under the
        key 'comment'.

        Examples
        --------
        >>> with open("IntelliGenetics/TAT_mase_nuc.txt") as handle:
        ...     for record in IgIterator(handle):
        ...         print("%s length %i" % (record.id, len(record)))
        ...
        A_U455 length 303
        B_HXB2R length 306
        C_UG268A length 267
        D_ELI length 309
        F_BZ163A length 309
        O_ANT70 length 342
        O_MVP5180 length 348
        CPZGAB length 309
        CPZANT length 309
        A_ROD length 390
        B_EHOA length 420
        D_MM251 length 390
        STM_STM length 387
        VER_AGM3 length 354
        GRI_AGM677 length 264
        SAB_SAB1C length 219
        SYK_SYK length 330

        """
        super().__init__(source, mode="t", fmt="IntelliGenetics")

    def parse(self, handle):
        """Start parsing the file, and return a SeqRecord generator."""
        return self.iterate(handle)

    def iterate(self, handle):
        """Iterate over the records in the IntelliGenetics file.

        handle - iterator over the lines of the file (text mode)

        Yields one SeqRecord per record. The title line (with trailing
        whitespace removed) is used as both the record's id and name, and the
        record's comment lines are stored, joined with new lines, under the
        annotations key 'comment'.

        Raises ValueError if the first line after the file header does not
        start with a semi-colon, if the file ends in the middle of a record's
        comment lines (so there is no title line), or if the digit one (the
        optional sequence terminator) is found inside a sequence.
        """
        # Skip any file header text before the first record (;; lines)
        for line in handle:
            if not line.startswith(";;"):
                break
        else:
            # Empty file, or header only
            return

        if not line.startswith(";"):
            raise ValueError(f"Records should start with ';' and not:\n{line!r}")

        # Each pass through this loop parses one record. On entry, line holds
        # the first comment line of that record; it is set to None once the
        # handle has been exhausted, which ends the loop.
        while line:
            # Try and agree with SeqRecord convention from the GenBank parser,
            # (and followed in the SwissProt parser) which stores the comments
            # as a long string with newlines under annotations key 'comment'.

            # Note some examples use "; ..." and others ";..."
            comment_lines = []
            while line.startswith(";"):
                # TODO - Extract identifier from lines like "LOCUS\tB_SF2"?
                comment_lines.append(line[1:].strip())
                try:
                    line = next(handle)
                except StopIteration:
                    # A StopIteration escaping from a generator body is turned
                    # into an unhelpful RuntimeError (PEP 479), so report the
                    # truncated record explicitly instead.
                    raise ValueError(
                        "Unexpected end of file: record comment lines are not "
                        "followed by a title line."
                    ) from None
            title = line.rstrip()

            seq_lines = []
            for line in handle:
                if line.startswith(";"):
                    # Start of the next record's comment lines
                    break
                # Remove trailing whitespace, and any internal spaces
                seq_lines.append(line.rstrip().replace(" ", ""))
            else:
                # Reached the end of the file; this was the last record
                line = None

            seq_str = "".join(seq_lines)
            if seq_str.endswith("1"):
                # Remove the optional terminator (digit one)
                seq_str = seq_str[:-1]
            if "1" in seq_str:
                raise ValueError(
                    "Potential terminator digit one found within sequence."
                )

            # Return the record and then continue...
            yield SeqRecord(
                Seq(seq_str),
                id=title,
                name=title,
                annotations={"comment": "\n".join(comment_lines)},
            )
if __name__ == "__main__":
    # When executed directly as a script, run this module's doctests
    # using Biopython's helper, with verbose output switched off.
    from Bio._utils import run_doctest

    run_doctest(verbose=0)