Spaces:
No application file
No application file
File size: 4,433 Bytes
b7731cd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 |
# Copyright 2008-2015 by Peter Cock. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Bio.SeqIO support for the "ig" (IntelliGenetics or MASE) file format.
This module is for reading and writing IntelliGenetics format files as
SeqRecord objects. This file format appears to be the same as the MASE
multiple sequence alignment format.
You are expected to use this module via the Bio.SeqIO functions.
"""
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from .Interfaces import SequenceIterator
class IgIterator(SequenceIterator):
    """Parser for IntelliGenetics files."""

    def __init__(self, source):
        """Iterate over IntelliGenetics records (as SeqRecord objects).

        source - file-like object opened in text mode, or a path to a file

        The optional free format file header lines (which start with two
        semi-colons) are ignored.

        The free format commentary lines at the start of each record (which
        start with a semi-colon) are recorded as a single string with embedded
        new line characters in the SeqRecord's annotations dictionary under the
        key 'comment'.

        Examples
        --------
        >>> with open("IntelliGenetics/TAT_mase_nuc.txt") as handle:
        ...     for record in IgIterator(handle):
        ...         print("%s length %i" % (record.id, len(record)))
        ...
        A_U455 length 303
        B_HXB2R length 306
        C_UG268A length 267
        D_ELI length 309
        F_BZ163A length 309
        O_ANT70 length 342
        O_MVP5180 length 348
        CPZGAB length 309
        CPZANT length 309
        A_ROD length 390
        B_EHOA length 420
        D_MM251 length 390
        STM_STM length 387
        VER_AGM3 length 354
        GRI_AGM677 length 264
        SAB_SAB1C length 219
        SYK_SYK length 330

        """
        super().__init__(source, mode="t", fmt="IntelliGenetics")

    def parse(self, handle):
        """Start parsing the file, and return a SeqRecord generator."""
        return self.iterate(handle)

    def iterate(self, handle):
        """Iterate over the records in the IntelliGenetics file.

        This is a generator yielding one SeqRecord per record. ``handle``
        must be an iterator over text lines, since it is advanced both by
        ``for`` loops and by ``next``.

        Each record's leading ';' lines are joined with newlines and stored
        under the annotations key 'comment'; the first following line is
        used (right-stripped) as both the record id and name; all lines up
        to the next ';' line (or the end of input) form the sequence.

        Raises ValueError if the first line after the ';;' header does not
        start with ';', if the input ends inside a record's comment block
        (so there is no title line), or if a digit one appears in the
        sequence anywhere other than as its final character.
        """
        # Skip any file header text before the first record (;; lines)
        for line in handle:
            if not line.startswith(";;"):
                break
        else:
            # Empty file, or header only
            return

        if not line.startswith(";"):
            raise ValueError(f"Records should start with ';' and not:\n{line!r}")

        while line:
            # Now iterate over the records

            # Try and agree with SeqRecord convention from the GenBank parser,
            # (and followed in the SwissProt parser) which stores the comments
            # as a long string with newlines under annotations key 'comment'.

            # Note some examples use "; ..." and others ";..."
            comment_lines = []
            while line.startswith(";"):
                # TODO - Extract identifier from lines like "LOCUS\tB_SF2"?
                comment_lines.append(line[1:].strip())
                # A bare next(handle) here would let StopIteration escape the
                # generator body, which Python reports as an opaque
                # RuntimeError; report the truncated input explicitly instead.
                line = next(handle, None)
                if line is None:
                    raise ValueError(
                        "Premature end of file: record comment lines were "
                        "not followed by a title line."
                    )
            title = line.rstrip()

            seq_lines = []
            for line in handle:
                if line.startswith(";"):
                    # Start of the next record's comment block
                    break
                # Remove trailing whitespace, and any internal spaces
                seq_lines.append(line.rstrip().replace(" ", ""))
            else:
                # Input exhausted: clear line so the outer while loop stops
                # after this final record has been yielded.
                line = None

            seq_str = "".join(seq_lines)
            if seq_str.endswith("1"):
                # Remove the optional terminator (digit one)
                seq_str = seq_str[:-1]
            if "1" in seq_str:
                raise ValueError(
                    "Potential terminator digit one found within sequence."
                )

            # Return the record and then continue...
            yield SeqRecord(
                Seq(seq_str),
                id=title,
                name=title,
                annotations={"comment": "\n".join(comment_lines)},
            )
        # The while loop only exits once the input has been exhausted.
if __name__ == "__main__":
    # Executed directly as a script: run this module's doctests (verbose=0).
    from Bio._utils import run_doctest

    run_doctest(verbose=0)
|