Spaces:
No application file
No application file
# Copyright 2008-2017,2020 by Peter Cock. All rights reserved. | |
# | |
# This file is part of the Biopython distribution and governed by your | |
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
# Please see the LICENSE file that should have been included as part of this | |
# package. | |
"""Bio.SeqIO support for the "tab" (simple tab separated) file format. | |
You are expected to use this module via the Bio.SeqIO functions. | |
The "tab" format is an ad-hoc plain text file format where each sequence is | |
on one (long) line. Each line contains the identifier/description, followed | |
by a tab, followed by the sequence. For example, consider the following | |
short FASTA format file:: | |
>ID123456 possible binding site? | |
CATCNAGATGACACTACGACTACGACTCAGACTAC | |
>ID123457 random sequence | |
ACACTACGACTACGACTCAGACTACAAN | |
Apart from the descriptions, this can be represented in the simple two column | |
tab separated format as follows:: | |
ID123456(tab)CATCNAGATGACACTACGACTACGACTCAGACTAC | |
ID123457(tab)ACACTACGACTACGACTCAGACTACAAN | |
When reading this file, "ID123456" or "ID123457" will be taken as the record's | |
.id and .name property. There is no other information to record. | |
Similarly, when writing to this format, Biopython will ONLY record the record's | |
.id and .seq (and not the description or any other information) as in the | |
example above. | |
""" | |
from Bio.Seq import Seq | |
from Bio.SeqRecord import SeqRecord | |
from .Interfaces import _clean | |
from .Interfaces import _get_seq_string | |
from .Interfaces import SequenceIterator | |
from .Interfaces import SequenceWriter | |
class TabIterator(SequenceIterator): | |
"""Parser for tab-delimited files.""" | |
def __init__(self, source): | |
"""Iterate over tab separated lines as SeqRecord objects. | |
Each line of the file should contain one tab only, dividing the line | |
into an identifier and the full sequence. | |
Arguments: | |
- source - file-like object opened in text mode, or a path to a file | |
The first field is taken as the record's .id and .name (regardless of | |
any spaces within the text) and the second field is the sequence. | |
Any blank lines are ignored. | |
Examples | |
-------- | |
>>> with open("GenBank/NC_005816.tsv") as handle: | |
... for record in TabIterator(handle): | |
... print("%s length %i" % (record.id, len(record))) | |
gi|45478712|ref|NP_995567.1| length 340 | |
gi|45478713|ref|NP_995568.1| length 260 | |
gi|45478714|ref|NP_995569.1| length 64 | |
gi|45478715|ref|NP_995570.1| length 123 | |
gi|45478716|ref|NP_995571.1| length 145 | |
gi|45478717|ref|NP_995572.1| length 357 | |
gi|45478718|ref|NP_995573.1| length 138 | |
gi|45478719|ref|NP_995574.1| length 312 | |
gi|45478720|ref|NP_995575.1| length 99 | |
gi|45478721|ref|NP_995576.1| length 90 | |
""" | |
super().__init__(source, mode="t", fmt="Tab-separated plain-text") | |
def parse(self, handle): | |
"""Start parsing the file, and return a SeqRecord generator.""" | |
records = self.iterate(handle) | |
return records | |
def iterate(self, handle): | |
"""Parse the file and generate SeqRecord objects.""" | |
for line in handle: | |
try: | |
title, seq = line.split("\t") # will fail if more than one tab! | |
except ValueError: | |
if line.strip() == "": | |
# It's a blank line, ignore it | |
continue | |
raise ValueError( | |
"Each line should have one tab separating the" | |
+ " title and sequence, this line has %i tabs: %r" | |
% (line.count("\t"), line) | |
) from None | |
title = title.strip() | |
seq = seq.strip() # removes the trailing new line | |
yield SeqRecord(Seq(seq), id=title, name=title, description="") | |
class TabWriter(SequenceWriter): | |
"""Class to write simple tab separated format files. | |
Each line consists of "id(tab)sequence" only. | |
Any description, name or other annotation is not recorded. | |
This class is not intended to be used directly. Instead, please use | |
the function ``as_tab``, or the top level ``Bio.SeqIO.write()`` function | |
with ``format="tab"``. | |
""" | |
def write_record(self, record): | |
"""Write a single tab line to the file.""" | |
assert self._header_written | |
assert not self._footer_written | |
self._record_written = True | |
self.handle.write(as_tab(record)) | |
def as_tab(record): | |
"""Return record as tab separated (id(tab)seq) string.""" | |
title = _clean(record.id) | |
seq = _get_seq_string(record) # Catches sequence being None | |
assert "\t" not in title | |
assert "\n" not in title | |
assert "\r" not in title | |
assert "\t" not in seq | |
assert "\n" not in seq | |
assert "\r" not in seq | |
return f"{title}\t{seq}\n" | |
if __name__ == "__main__": | |
from Bio._utils import run_doctest | |
run_doctest(verbose=0) | |