# Copyright 2007-2017 by Peter Cock. All rights reserved.
# Revisions copyright 2010 by Uri Laserson. All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
"""Internal code for parsing GenBank and EMBL files (PRIVATE).

This code is NOT intended for direct use. It provides a basic scanner
(for use with an event consumer such as Bio.GenBank._FeatureConsumer)
to parse a GenBank or EMBL file (with their shared INSDC feature table).

It is used by Bio.GenBank to parse GenBank files.
It is also used by Bio.SeqIO to parse GenBank and EMBL files.

Feature Table Documentation:
 - http://www.insdc.org/files/feature_table.html
 - http://www.ncbi.nlm.nih.gov/projects/collab/FT/index.html
 - ftp://ftp.ncbi.nih.gov/genbank/docs/
"""
# 17-MAR-2009: added wgs, wgs_scafld for GenBank whole genome shotgun master records.
# These are GenBank files that summarize the content of a project, and provide lists of
# scaffold and contig files in the project. These will be in annotations['wgs'] and
# annotations['wgs_scafld']. These GenBank files do not have sequences. See
# http://groups.google.com/group/bionet.molbio.genbank/browse_thread/thread/51fb88bf39e7dc36
# http://is.gd/nNgk
# for more details of this format, and an example.
# Added by Ying Huang & Iddo Friedberg
import re
import sys
import warnings
from collections import defaultdict

from Bio import BiopythonParserWarning
from Bio.File import as_handle
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
class InsdcScanner:
    """Basic functions for breaking up a GenBank/EMBL file into sub sections.

    The International Nucleotide Sequence Database Collaboration (INSDC)
    between the DDBJ, EMBL, and GenBank. These organisations all use the
    same "Feature Table" layout in their plain text flat file formats.

    However, the header and sequence sections of an EMBL file are very
    different in layout to those produced by GenBank/DDBJ.
    """

    # These constants get redefined with sensible values in the sub classes:
    RECORD_START = "XXX"  # "LOCUS " or "ID "
    HEADER_WIDTH = 3  # 12 or 5
    FEATURE_START_MARKERS = ["XXX***FEATURES***XXX"]
    FEATURE_END_MARKERS = ["XXX***END FEATURES***XXX"]
    FEATURE_QUALIFIER_INDENT = 0
    FEATURE_QUALIFIER_SPACER = ""
    SEQUENCE_HEADERS = ["XXX"]  # with right hand side spaces removed
def __init__(self, debug=0): | |
"""Initialize the class.""" | |
assert len(self.RECORD_START) == self.HEADER_WIDTH | |
for marker in self.SEQUENCE_HEADERS: | |
assert marker == marker.rstrip() | |
assert len(self.FEATURE_QUALIFIER_SPACER) == self.FEATURE_QUALIFIER_INDENT | |
self.debug = debug | |
self.handle = None | |
self.line = None | |
def set_handle(self, handle): | |
"""Set the handle attribute.""" | |
self.handle = handle | |
self.line = "" | |
def find_start(self):
    """Read in lines until find the ID/LOCUS line, which is returned.

    Any preamble (such as the header used by the NCBI on ``*.seq.gz`` archives)
    will be ignored.
    """
    while True:
        # Use any line pushed back by a previous call, else read a new one:
        if self.line:
            line = self.line
            self.line = ""
        else:
            line = self.handle.readline()
        if not line:
            if self.debug:
                print("End of file")
            return None
        if isinstance(line[0], int):
            # Same exception as for FASTQ files
            raise ValueError("Is this handle in binary mode not text mode?")
        if line[: self.HEADER_WIDTH] == self.RECORD_START:
            if self.debug > 1:
                print("Found the start of a record:\n" + line)
            break
        line = line.rstrip()
        if line == "//":
            if self.debug > 1:
                print("Skipping // marking end of last record")
        elif line == "":
            if self.debug > 1:
                print("Skipping blank line before record")
        else:
            # Ignore any header before the first ID/LOCUS line.
            if self.debug > 1:
                print("Skipping header line before record:\n" + line)
    # Remember the ID/LOCUS line (still with its newline) for parse_header():
    self.line = line
    return line
def parse_header(self):
    """Return list of strings making up the header.

    New line characters are removed.

    Assumes you have just read in the ID/LOCUS line.
    """
    if self.line[: self.HEADER_WIDTH] != self.RECORD_START:
        raise ValueError("Not at start of record")
    header_lines = []
    while True:
        line = self.handle.readline()
        if not line:
            raise ValueError("Premature end of line during sequence data")
        line = line.rstrip()
        if line in self.FEATURE_START_MARKERS:
            if self.debug:
                print("Found feature table")
            break
        # if line[:self.HEADER_WIDTH]==self.FEATURE_START_MARKER[:self.HEADER_WIDTH]:
        #     if self.debug : print("Found header table (?)")
        #     break
        if line[: self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS:
            if self.debug:
                print("Found start of sequence")
            break
        if line == "//":
            raise ValueError("Premature end of sequence data marker '//' found")
        header_lines.append(line)
    # Push the marker line back for parse_features() / parse_footer():
    self.line = line
    return header_lines
def parse_features(self, skip=False):
    """Return list of tuples for the features (if present).

    Each feature is returned as a tuple (key, location, qualifiers)
    where key and location are strings (e.g. "CDS" and
    "complement(join(490883..490885,1..879))") while qualifiers
    is a list of two string tuples (feature qualifier keys and values).

    Assumes you have already read to the start of the features table.
    """
    if self.line.rstrip() not in self.FEATURE_START_MARKERS:
        if self.debug:
            print("Didn't find any feature table")
        return []

    # Skip over the feature table header marker line(s):
    while self.line.rstrip() in self.FEATURE_START_MARKERS:
        self.line = self.handle.readline()

    features = []
    line = self.line
    while True:
        if not line:
            raise ValueError("Premature end of line during features table")
        if line[: self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS:
            if self.debug:
                print("Found start of sequence")
            break
        line = line.rstrip()
        if line == "//":
            raise ValueError("Premature end of features table, marker '//' found")
        if line in self.FEATURE_END_MARKERS:
            if self.debug:
                print("Found end of features")
            line = self.handle.readline()
            break
        if line[2 : self.FEATURE_QUALIFIER_INDENT].strip() == "":
            # This is an empty feature line between qualifiers. Empty
            # feature lines within qualifiers are handled below (ignored).
            line = self.handle.readline()
            continue
        if len(line) < self.FEATURE_QUALIFIER_INDENT:
            warnings.warn(
                f"line too short to contain a feature: {line!r}",
                BiopythonParserWarning,
            )
            line = self.handle.readline()
            continue

        if skip:
            # Consume this feature's continuation lines without recording them:
            line = self.handle.readline()
            while (
                line[: self.FEATURE_QUALIFIER_INDENT]
                == self.FEATURE_QUALIFIER_SPACER
            ):
                line = self.handle.readline()
        else:
            # Build up a list of the lines making up this feature:
            if (
                line[self.FEATURE_QUALIFIER_INDENT] != " "
                and " " in line[self.FEATURE_QUALIFIER_INDENT :]
            ):
                # The feature table design enforces a length limit on the feature keys.
                # Some third party files (e.g. IGMT's EMBL like files) solve this by
                # over indenting the location and qualifiers.
                feature_key, line = line[2:].strip().split(None, 1)
                feature_lines = [line]
                warnings.warn(
                    f"Over indented {feature_key} feature?",
                    BiopythonParserWarning,
                )
            else:
                feature_key = line[2 : self.FEATURE_QUALIFIER_INDENT].strip()
                feature_lines = [line[self.FEATURE_QUALIFIER_INDENT :]]
            line = self.handle.readline()
            while line[
                : self.FEATURE_QUALIFIER_INDENT
            ] == self.FEATURE_QUALIFIER_SPACER or (
                line != "" and line.rstrip() == ""
            ):  # cope with blank lines in the midst of a feature
                # Use strip to remove any harmless trailing white space AND and leading
                # white space (e.g. out of spec files with too much indentation)
                feature_lines.append(line[self.FEATURE_QUALIFIER_INDENT :].strip())
                line = self.handle.readline()
            features.append(self.parse_feature(feature_key, feature_lines))
    self.line = line
    return features
def parse_feature(self, feature_key, lines):
    r"""Parse a feature given as a list of strings into a tuple.

    Expects a feature as a list of strings, returns a tuple (key, location,
    qualifiers)

    For example given this GenBank feature::

         CDS             complement(join(490883..490885,1..879))
                         /locus_tag="NEQ001"
                         /note="conserved hypothetical [Methanococcus jannaschii];
                         COG1583:Uncharacterized ACR; IPR001472:Bipartite nuclear
                         localization signal; IPR002743: Protein of unknown
                         function DUF57"
                         /codon_start=1
                         /transl_table=11
                         /product="hypothetical protein"
                         /protein_id="NP_963295.1"
                         /db_xref="GI:41614797"
                         /db_xref="GeneID:2732620"
                         /translation="MRLLLELKALNSIDKKQLSNYLIQGFIYNILKNTEYSWLHNWKK
                         EKYFNFTLIPKKDIIENKRYYLIISSPDKRFIEVLHNKIKDLDIITIGLAQFQLRKTK
                         KFDPKLRFPWVTITPIVLREGKIVILKGDKYYKVFVKRLEELKKYNLIKKKEPILEEP
                         IEISLNQIKDGWKIIDVKDRYYDFRNKSFSAFSNWLRDLKEQSLRKYNNFCGKNFYFE
                         EAIFEGFTFYKTVSIRIRINRGEAVYIGTLWKELNVYRKLDKEEREFYKFLYDCGLGS
                         LNSMGFGFVNTKKNSAR"

    Then should give input key="CDS" and the rest of the data as a list of strings
    lines=["complement(join(490883..490885,1..879))", ..., "LNSMGFGFVNTKKNSAR"]
    where the leading spaces and trailing newlines have been removed.

    Returns tuple containing: (key as string, location string, qualifiers as list)
    as follows for this example:

    key = "CDS", string
    location = "complement(join(490883..490885,1..879))", string
    qualifiers = list of string tuples:

    [('locus_tag', '"NEQ001"'),
     ('note', '"conserved hypothetical [Methanococcus jannaschii];\nCOG1583:..."'),
     ('codon_start', '1'),
     ('transl_table', '11'),
     ('product', '"hypothetical protein"'),
     ('protein_id', '"NP_963295.1"'),
     ('db_xref', '"GI:41614797"'),
     ('db_xref', '"GeneID:2732620"'),
     ('translation', '"MRLLLELKALNSIDKKQLSNYLIQGFIYNILKNTEYSWLHNWKK\nEKYFNFT..."')]

    In the above example, the "note" and "translation" were edited for compactness,
    and they would contain multiple new line characters (displayed above as \n)

    If a qualifier is quoted (in this case, everything except codon_start and
    transl_table) then the quotes are NOT removed.

    Note that no whitespace is removed.
    """
    # Skip any blank lines
    iterator = (x for x in lines if x)
    try:
        line = next(iterator)
        feature_location = line.strip()
        while feature_location[-1:] == ",":
            # Multiline location, still more to come!
            line = next(iterator)
            feature_location += line.strip()
        if feature_location.count("(") > feature_location.count(")"):
            # Including the prev line in warning would be more explicit,
            # but this way get one-and-only-one warning shown by default:
            warnings.warn(
                "Non-standard feature line wrapping (didn't break on comma)?",
                BiopythonParserWarning,
            )
            while feature_location[-1:] == "," or feature_location.count(
                "("
            ) > feature_location.count(")"):
                line = next(iterator)
                feature_location += line.strip()

        qualifiers = []
        for line_number, line in enumerate(iterator):
            # check for extra wrapping of the location closing parentheses
            if line_number == 0 and line.startswith(")"):
                feature_location += line.strip()
            elif line[0] == "/":
                # New qualifier
                i = line.find("=")
                key = line[1:i]  # does not work if i==-1
                value = line[i + 1 :]  # we ignore 'value' if i==-1
                if i and value.startswith(" ") and value.lstrip().startswith('"'):
                    warnings.warn(
                        "White space after equals in qualifier",
                        BiopythonParserWarning,
                    )
                    value = value.lstrip()
                if i == -1:
                    # Qualifier with no key, e.g. /pseudo
                    key = line[1:]
                    qualifiers.append((key, None))
                elif not value:
                    # ApE can output /note=
                    qualifiers.append((key, ""))
                elif value == '"':
                    # One single quote
                    if self.debug:
                        print(f"Single quote {key}:{value}")
                    # DO NOT remove the quote...
                    qualifiers.append((key, value))
                elif value[0] == '"':
                    # Quoted...
                    value_list = [value]
                    while value_list[-1][-1] != '"':
                        value_list.append(next(iterator))
                    value = "\n".join(value_list)
                    # DO NOT remove the quotes...
                    qualifiers.append((key, value))
                else:
                    # Unquoted
                    # if debug : print("Unquoted line %s:%s" % (key,value))
                    qualifiers.append((key, value))
            else:
                # Unquoted continuation
                assert len(qualifiers) > 0
                assert key == qualifiers[-1][0]
                # if debug : print("Unquoted Cont %s:%s" % (key, line))
                if qualifiers[-1][1] is None:
                    # Continuation of a valueless qualifier makes no sense;
                    # raising StopIteration routes into the except below.
                    raise StopIteration
                qualifiers[-1] = (key, qualifiers[-1][1] + "\n" + line)
        return feature_key, feature_location, qualifiers
    except StopIteration:
        # Bummer
        raise ValueError(
            "Problem with '%s' feature:\n%s" % (feature_key, "\n".join(lines))
        ) from None
def parse_footer(self):
    """Return a tuple containing a list of any misc strings, and the sequence."""
    # This is a basic bit of code to scan and discard the sequence,
    # which was useful when developing the sub classes.
    if self.line in self.FEATURE_END_MARKERS:
        while self.line[: self.HEADER_WIDTH].rstrip() not in self.SEQUENCE_HEADERS:
            self.line = self.handle.readline()
            if not self.line:
                raise ValueError("Premature end of file")
            self.line = self.line.rstrip()
    if self.line[: self.HEADER_WIDTH].rstrip() not in self.SEQUENCE_HEADERS:
        raise ValueError("Not at start of sequence")
    while True:
        line = self.handle.readline()
        if not line:
            raise ValueError("Premature end of line during sequence data")
        line = line.rstrip()
        if line == "//":
            break
    self.line = line
    return [], ""  # Dummy values!
def _feed_first_line(self, consumer, line): | |
"""Handle the LOCUS/ID line, passing data to the consumer (PRIVATE). | |
This should be implemented by the EMBL / GenBank specific subclass | |
Used by the parse_records() and parse() methods. | |
""" | |
pass | |
def _feed_header_lines(self, consumer, lines): | |
"""Handle the header lines (list of strings), passing data to the consumer (PRIVATE). | |
This should be implemented by the EMBL / GenBank specific subclass | |
Used by the parse_records() and parse() methods. | |
""" | |
pass | |
def _feed_feature_table(consumer, feature_tuples): | |
"""Handle the feature table (list of tuples), passing data to the consumer (PRIVATE). | |
Used by the parse_records() and parse() methods. | |
""" | |
consumer.start_feature_table() | |
for feature_key, location_string, qualifiers in feature_tuples: | |
consumer.feature_key(feature_key) | |
consumer.location(location_string) | |
for q_key, q_value in qualifiers: | |
if q_value is None: | |
consumer.feature_qualifier(q_key, q_value) | |
else: | |
consumer.feature_qualifier(q_key, q_value.replace("\n", " ")) | |
def _feed_misc_lines(self, consumer, lines): | |
"""Handle any lines between features and sequence (list of strings), passing data to the consumer (PRIVATE). | |
This should be implemented by the EMBL / GenBank specific subclass | |
Used by the parse_records() and parse() methods. | |
""" | |
pass | |
def feed(self, handle, consumer, do_features=True):
    """Feed a set of data into the consumer.

    This method is intended for use with the "old" code in Bio.GenBank

    Arguments:
     - handle - A handle with the information to parse.
     - consumer - The consumer that should be informed of events.
     - do_features - Boolean, should the features be parsed?
       Skipping the features can be much faster.

    Return values:
     - true  - Passed a record
     - false - Did not find a record
    """
    # Should work with both EMBL and GenBank files provided the
    # equivalent Bio.GenBank._FeatureConsumer methods are called...
    self.set_handle(handle)
    if not self.find_start():
        # Could not find (another) record
        consumer.data = None
        return False

    # We use the above class methods to parse the file into a simplified format.
    # The first line, header lines and any misc lines after the features will be
    # dealt with by GenBank / EMBL specific derived classes.

    # First line and header:
    self._feed_first_line(consumer, self.line)
    self._feed_header_lines(consumer, self.parse_header())

    # Features (common to both EMBL and GenBank):
    if do_features:
        self._feed_feature_table(consumer, self.parse_features(skip=False))
    else:
        self.parse_features(skip=True)  # ignore the data

    # Footer and sequence:
    misc_lines, sequence_string = self.parse_footer()
    self._feed_misc_lines(consumer, misc_lines)
    consumer.sequence(sequence_string)
    # Calls to consumer.base_number() do nothing anyway
    consumer.record_end("//")
    assert self.line == "//"
    # And we are done
    return True
def parse(self, handle, do_features=True):
    """Return a SeqRecord (with SeqFeatures if do_features=True).

    See also the method parse_records() for use on multi-record files.
    """
    # Imported here (not at module level) to avoid a circular import:
    from Bio.GenBank import _FeatureConsumer
    from Bio.GenBank.utils import FeatureValueCleaner

    consumer = _FeatureConsumer(
        use_fuzziness=1, feature_cleaner=FeatureValueCleaner()
    )
    if not self.feed(handle, consumer, do_features):
        return None
    return consumer.data
def parse_records(self, handle, do_features=True):
    """Parse records, return a SeqRecord object iterator.

    Each record (from the ID/LOCUS line to the // line) becomes a SeqRecord

    The SeqRecord objects include SeqFeatures if do_features=True

    This method is intended for use in Bio.SeqIO
    """
    # This is a generator function
    with as_handle(handle) as handle:
        while True:
            record = self.parse(handle, do_features)
            if record is None:
                # End of file (or no more records)
                break
            if record.id is None:
                raise ValueError(
                    "Failed to parse the record's ID. Invalid ID line?"
                )
            if record.name == "<unknown name>":
                raise ValueError(
                    "Failed to parse the record's name. Invalid ID line?"
                )
            if record.description == "<unknown description>":
                raise ValueError("Failed to parse the record's description")
            yield record
def parse_cds_features(
    self, handle, alphabet=None, tags2id=("protein_id", "locus_tag", "product")
):
    """Parse CDS features, return SeqRecord object iterator.

    Each CDS feature becomes a SeqRecord.

    Arguments:
     - alphabet - Obsolete, should be left as None.
     - tags2id - Tuple of three strings, the feature keys to use
       for the record id, name and description,

    This method is intended for use in Bio.SeqIO
    """
    if alphabet is not None:
        raise ValueError("The alphabet argument is no longer supported")
    with as_handle(handle) as handle:
        self.set_handle(handle)
        while self.find_start():
            # Got an EMBL or GenBank record...
            self.parse_header()  # ignore header lines!
            feature_tuples = self.parse_features()
            # self.parse_footer() # ignore footer lines!
            while True:
                line = self.handle.readline()
                if not line:
                    break
                if line[:2] == "//":
                    break
            self.line = line.rstrip()

            # Now go though those features...
            for key, location_string, qualifiers in feature_tuples:
                if key == "CDS":
                    # Create SeqRecord
                    # ================
                    # SeqRecord objects cannot be created with annotations, they
                    # must be added afterwards. So create an empty record and
                    # then populate it:
                    record = SeqRecord(seq=None)
                    annotations = record.annotations
                    annotations["molecule_type"] = "protein"

                    # Should we add a location object to the annotations?
                    # I *think* that only makes sense for SeqFeatures with their
                    # sub features...
                    annotations["raw_location"] = location_string.replace(" ", "")

                    for qualifier_name, qualifier_data in qualifiers:
                        if (
                            qualifier_data is not None
                            and qualifier_data[0] == '"'
                            and qualifier_data[-1] == '"'
                        ):
                            # Remove quotes
                            qualifier_data = qualifier_data[1:-1]
                        # Append the data to the annotation qualifier...
                        if qualifier_name == "translation":
                            assert record.seq is None, "Multiple translations!"
                            record.seq = Seq(qualifier_data.replace("\n", ""))
                        elif qualifier_name == "db_xref":
                            # its a list, possibly empty. Its safe to extend
                            record.dbxrefs.append(qualifier_data)
                        else:
                            if qualifier_data is not None:
                                # Bug fix: the second replace was a no-op
                                # (single space to single space); it should
                                # collapse double spaces left by the newline
                                # substitution:
                                qualifier_data = qualifier_data.replace(
                                    "\n", " "
                                ).replace("  ", " ")
                            try:
                                annotations[qualifier_name] += " " + qualifier_data
                            except KeyError:
                                # Not an addition to existing data, its the first bit
                                annotations[qualifier_name] = qualifier_data

                    # Fill in the ID, Name, Description
                    # =================================
                    try:
                        record.id = annotations[tags2id[0]]
                    except KeyError:
                        pass
                    try:
                        record.name = annotations[tags2id[1]]
                    except KeyError:
                        pass
                    try:
                        record.description = annotations[tags2id[2]]
                    except KeyError:
                        pass
                    yield record
class EmblScanner(InsdcScanner):
    """For extracting chunks of information in EMBL files."""

    # Bug fix: the string constants below had their internal runs of spaces
    # collapsed, which broke the fixed-width EMBL layout. In particular
    # RECORD_START must be exactly HEADER_WIDTH (5) characters long to pass
    # the assertion in InsdcScanner.__init__.
    RECORD_START = "ID   "
    HEADER_WIDTH = 5
    FEATURE_START_MARKERS = ["FH   Key             Location/Qualifiers", "FH"]
    FEATURE_END_MARKERS = ["XX"]  # XX can also mark the end of many things!
    FEATURE_QUALIFIER_INDENT = 21
    FEATURE_QUALIFIER_SPACER = "FT" + " " * (FEATURE_QUALIFIER_INDENT - 2)
    SEQUENCE_HEADERS = ["SQ", "CO"]  # Remove trailing spaces
    EMBL_INDENT = HEADER_WIDTH
    EMBL_SPACER = " " * EMBL_INDENT
def parse_footer(self):
    """Return a tuple containing a list of any misc strings, and the sequence."""
    if self.line[: self.HEADER_WIDTH].rstrip() not in self.SEQUENCE_HEADERS:
        raise ValueError(f"Footer format unexpected: '{self.line}'")
    # Note that the SQ line can be split into several lines...
    misc_lines = []
    while self.line[: self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS:
        misc_lines.append(self.line)
        self.line = self.handle.readline()
        if not self.line:
            raise ValueError("Premature end of file")
        self.line = self.line.rstrip()
    if not (
        self.line[: self.HEADER_WIDTH] == " " * self.HEADER_WIDTH
        or self.line.strip() == "//"
    ):
        raise ValueError(f"Unexpected content after SQ or CO line: {self.line!r}")
    seq_lines = []
    line = self.line
    while True:
        if not line:
            raise ValueError("Premature end of file in sequence data")
        line = line.strip()
        if not line:
            raise ValueError("Blank line in sequence data")
        if line == "//":
            break
        # NOTE(review): this tests self.line (which is only updated after the
        # loop exits), not the freshly read line -- confirm this is intended
        # and not meant to validate each sequence line's indentation.
        if self.line[: self.HEADER_WIDTH] != (" " * self.HEADER_WIDTH):
            raise ValueError(
                "Problem with characters in header line, "
                " or incorrect header width: " + self.line
            )
        # Remove tailing number now, remove spaces later
        linersplit = line.rsplit(None, 1)
        if len(linersplit) == 2 and linersplit[1].isdigit():
            seq_lines.append(linersplit[0])
        elif line.isdigit():
            # Special case of final blank line with no bases
            # just the sequence coordinate
            pass
        else:
            warnings.warn(
                "EMBL sequence line missing coordinates", BiopythonParserWarning
            )
            seq_lines.append(line)
        line = self.handle.readline()
    self.line = line
    return misc_lines, "".join(seq_lines).replace(" ", "")
def _feed_first_line(self, consumer, line):
    """Dispatch the ID line to the style-specific parser (PRIVATE).

    The number of semi-colons distinguishes the different EMBL ID line
    layouts that have been used over the years.
    """
    assert line[: self.HEADER_WIDTH].rstrip() == "ID"
    if line[self.HEADER_WIDTH :].count(";") == 6:
        # Looks like the semi colon separated style introduced in 2006
        self._feed_first_line_new(consumer, line)
    elif line[self.HEADER_WIDTH :].count(";") == 3:
        if line.rstrip().endswith(" SQ"):
            # EMBL-bank patent data
            self._feed_first_line_patents(consumer, line)
        else:
            # Looks like the pre 2006 style
            self._feed_first_line_old(consumer, line)
    elif line[self.HEADER_WIDTH :].count(";") == 2:
        # Looks like KIKO patent data
        self._feed_first_line_patents_kipo(consumer, line)
    else:
        raise ValueError("Did not recognise the ID line layout:\n" + line)
def _feed_first_line_patents(self, consumer, line):
    """Parse an old style EMBL patent ID line ending in SQ (PRIVATE)."""
    # Old style EMBL patent records where ID line ended SQ
    # Not 100% sure that PRT here is really molecule type and
    # not the data file division...
    #
    # Either Non-Redundant Level 1 database records,
    # ID <accession>; <molecule type>; <non-redundant level 1>; <cluster size L1>
    # e.g. ID NRP_AX000635; PRT; NR1; 15 SQ
    #
    # Or, Non-Redundant Level 2 database records:
    # ID <L2-accession>; <molecule type>; <non-redundant level 2>; <cluster size L2>
    # e.g. ID NRP0000016E; PRT; NR2; 5 SQ
    # e.g. ID NRP_AX000635; PRT; NR1; 15 SQ
    fields = [
        data.strip() for data in line[self.HEADER_WIDTH :].strip()[:-3].split(";")
    ]
    assert len(fields) == 4
    consumer.locus(fields[0])
    consumer.residue_type(fields[1])  # semi-redundant
    consumer.data_file_division(fields[2])
    # TODO - Record cluster size?
def _feed_first_line_patents_kipo(self, consumer, line):
    """Parse a KIPO patent style ID line (PRIVATE)."""
    # EMBL format patent sequence from KIPO, e.g.
    # ftp://ftp.ebi.ac.uk/pub/databases/patentdata/kipo_prt.dat.gz
    #
    # e.g. ID DI500001 STANDARD; PRT; 111 AA.
    #
    # This follows the style of _feed_first_line_old
    assert line[: self.HEADER_WIDTH].rstrip() == "ID"
    fields = [line[self.HEADER_WIDTH :].split(None, 1)[0]]
    fields.extend(line[self.HEADER_WIDTH :].split(None, 1)[1].split(";"))
    fields = [entry.strip() for entry in fields]
    """
    The tokens represent:

       0. Primary accession number
          (space sep)
       1. ??? (e.g. standard)
          (semi-colon)
       2. Molecule type (protein)? Division? Always 'PRT'
       3. Sequence length (e.g. '111 AA.')
    """
    consumer.locus(fields[0])  # Should we also call the accession consumer?
    # consumer.molecule_type(fields[2])
    self._feed_seq_length(consumer, fields[3])
def _feed_first_line_old(self, consumer, line):
    """Parse a pre-2006 style EMBL ID line (PRIVATE)."""
    # Expects an ID line in the style before 2006, e.g.
    # ID SC10H5 standard; DNA; PRO; 4870 BP.
    # ID BSUB9999 standard; circular DNA; PRO; 4214630 BP.
    assert line[: self.HEADER_WIDTH].rstrip() == "ID"
    fields = [line[self.HEADER_WIDTH :].split(None, 1)[0]]
    fields.extend(line[self.HEADER_WIDTH :].split(None, 1)[1].split(";"))
    fields = [entry.strip() for entry in fields]
    """
    The tokens represent:

       0. Primary accession number
          (space sep)
       1. ??? (e.g. standard)
          (semi-colon)
       2. Topology and/or Molecule type (e.g. 'circular DNA' or 'DNA')
       3. Taxonomic division (e.g. 'PRO')
       4. Sequence length (e.g. '4639675 BP.')
    """
    consumer.locus(fields[0])  # Should we also call the accession consumer?
    consumer.residue_type(fields[2])
    if "circular" in fields[2]:
        consumer.topology("circular")
        consumer.molecule_type(fields[2].replace("circular", "").strip())
    elif "linear" in fields[2]:
        consumer.topology("linear")
        consumer.molecule_type(fields[2].replace("linear", "").strip())
    else:
        consumer.molecule_type(fields[2].strip())
    consumer.data_file_division(fields[3])
    self._feed_seq_length(consumer, fields[4])
def _feed_first_line_new(self, consumer, line):
    """Parse the semi-colon separated ID line introduced in 2006 (PRIVATE)."""
    # Expects an ID line in the style introduced in 2006, e.g.
    # ID X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP.
    # ID CD789012; SV 4; linear; genomic DNA; HTG; MAM; 500 BP.
    assert line[: self.HEADER_WIDTH].rstrip() == "ID"
    fields = [data.strip() for data in line[self.HEADER_WIDTH :].strip().split(";")]
    assert len(fields) == 7
    """
    The tokens represent:

       0. Primary accession number
       1. Sequence version number
       2. Topology: 'circular' or 'linear'
       3. Molecule type (e.g. 'genomic DNA')
       4. Data class (e.g. 'STD')
       5. Taxonomic division (e.g. 'PRO')
       6. Sequence length (e.g. '4639675 BP.')
    """
    consumer.locus(fields[0])

    # Call the accession consumer now, to make sure we record
    # something as the record.id, in case there is no AC line
    consumer.accession(fields[0])

    # TODO - How to deal with the version field? At the moment the consumer
    # will try and use this for the ID which isn't ideal for EMBL files.
    version_parts = fields[1].split()
    if (
        len(version_parts) == 2
        and version_parts[0] == "SV"
        and version_parts[1].isdigit()
    ):
        consumer.version_suffix(version_parts[1])

    # Based on how the old GenBank parser worked, merge these two:
    consumer.residue_type(" ".join(fields[2:4]))  # Semi-obsolete

    consumer.topology(fields[2])
    consumer.molecule_type(fields[3])

    # consumer.xxx(fields[4]) # TODO - What should we do with the data class?

    consumer.data_file_division(fields[5])
    self._feed_seq_length(consumer, fields[6])
def _feed_seq_length(self, consumer, text):
    """Parse a sequence length field like '4639675 BP.' (PRIVATE)."""
    # Bug fix: ``self`` was missing from the signature, so every
    # ``self._feed_seq_length(consumer, ...)`` call site raised TypeError.
    length_parts = text.split()
    assert len(length_parts) == 2, f"Invalid sequence length string {text!r}"
    assert length_parts[1].upper() in ["BP", "BP.", "AA", "AA."]
    consumer.size(length_parts[0])
def _feed_header_lines(self, consumer, lines):
    """Map EMBL two-letter line types onto the GenBank-style consumer (PRIVATE)."""
    # Simple line types are mapped directly onto a consumer method:
    consumer_dict = {
        "AC": "accession",
        "SV": "version",  # SV line removed in June 2006, now part of ID line
        "DE": "definition",
        # 'RN' : 'reference_num',
        # 'RC' : reference comment... TODO
        # 'RP' : 'reference_bases',
        # 'RX' : reference cross reference... DOI or Pubmed
        "RG": "consrtm",  # optional consortium
        # 'RA' : 'authors',
        # 'RT' : 'title',
        "RL": "journal",
        "OS": "organism",
        "OC": "taxonomy",
        # 'DR' : data reference
        "CC": "comment",
        # 'XX' : splitter
    }
    # We have to handle the following specially:
    # RX (depending on reference type...)
    for line in lines:
        line_type = line[: self.EMBL_INDENT].strip()
        data = line[self.EMBL_INDENT :].strip()
        if line_type == "XX":
            pass
        elif line_type == "RN":
            # Reformat reference numbers for the GenBank based consumer
            # e.g. '[1]' becomes '1'
            if data[0] == "[" and data[-1] == "]":
                data = data[1:-1]
            consumer.reference_num(data)
        elif line_type == "RP":
            if data.strip() == "[-]":
                # Patent EMBL files from KIPO just use: RN [-]
                pass
            else:
                # Reformat reference numbers for the GenBank based consumer
                # e.g. '1-4639675' becomes '(bases 1 to 4639675)'
                # and '160-550, 904-1055' becomes '(bases 160 to 550; 904 to 1055)'
                # Note could be multi-line, and end with a comma
                parts = [
                    bases.replace("-", " to ").strip()
                    for bases in data.split(",")
                    if bases.strip()
                ]
                consumer.reference_bases(f"(bases {'; '.join(parts)})")
        elif line_type == "RT":
            # Remove the enclosing quotes and trailing semi colon.
            # Note the title can be split over multiple lines.
            if data.startswith('"'):
                data = data[1:]
            if data.endswith('";'):
                data = data[:-2]
            consumer.title(data)
        elif line_type == "RX":
            # EMBL support three reference types at the moment:
            # - PUBMED PUBMED bibliographic database (NLM)
            # - DOI Digital Object Identifier (International DOI Foundation)
            # - AGRICOLA US National Agriculture Library (NAL) of the US Department
            # of Agriculture (USDA)
            #
            # Format:
            # RX resource_identifier; identifier.
            #
            # e.g.
            # RX DOI; 10.1016/0024-3205(83)90010-3.
            # RX PUBMED; 264242.
            #
            # Currently our reference object only supports PUBMED and MEDLINE
            # (as these were in GenBank files?).
            key, value = data.split(";", 1)
            if value.endswith("."):
                value = value[:-1]
            value = value.strip()
            if key == "PUBMED":
                consumer.pubmed_id(value)
            # TODO - Handle other reference types (here and in BioSQL bindings)
        elif line_type == "CC":
            # Have to pass a list of strings for this one (not just a string)
            consumer.comment([data])
        elif line_type == "DR":
            # Database Cross-reference, format:
            # DR database_identifier; primary_identifier; secondary_identifier.
            #
            # e.g.
            # DR MGI; 98599; Tcrb-V4.
            #
            # TODO - How should we store any secondary identifier?
            parts = data.rstrip(".").split(";")
            # Turn it into "database_identifier:primary_identifier" to
            # mimic the GenBank parser. e.g. "MGI:98599"
            if len(parts) == 1:
                warnings.warn(
                    "Malformed DR line in EMBL file.", BiopythonParserWarning
                )
            else:
                consumer.dblink(f"{parts[0].strip()}:{parts[1].strip()}")
        elif line_type == "RA":
            # Remove trailing ; at end of authors list
            consumer.authors(data.rstrip(";"))
        elif line_type == "PR":
            # In the EMBL patent files, this is a PR (PRiority) line which
            # provides the earliest active priority within the family.
            # The priority number comes first, followed by the priority date.
            #
            # e.g.
            # PR JP19990377484 16-DEC-1999
            #
            # However, in most EMBL files this is a PR (PRoject) line which
            # gives the BioProject reference number.
            #
            # e.g.
            # PR Project:PRJNA60715;
            #
            # In GenBank files this corresponds to the old PROJECT line
            # which was later replaced with the DBLINK line.
            if data.startswith("Project:"):
                # Remove trailing ; at end of the project reference
                consumer.project(data.rstrip(";"))
        elif line_type == "KW":
            consumer.keywords(data.rstrip(";"))
        elif line_type in consumer_dict:
            # Its a semi-automatic entry!
            getattr(consumer, consumer_dict[line_type])(data)
        else:
            if self.debug:
                print(f"Ignoring EMBL header line:\n{line}")
def _feed_misc_lines(self, consumer, lines): | |
# TODO - Should we do something with the information on the SQ line(s)? | |
lines.append("") | |
line_iter = iter(lines) | |
try: | |
for line in line_iter: | |
if line.startswith("CO "): | |
line = line[5:].strip() | |
contig_location = line | |
while True: | |
line = next(line_iter) | |
if not line: | |
break | |
elif line.startswith("CO "): | |
# Don't need to preserve the whitespace here. | |
contig_location += line[5:].strip() | |
else: | |
raise ValueError( | |
"Expected CO (contig) continuation line, got:\n" + line | |
) | |
consumer.contig_location(contig_location) | |
if line.startswith("SQ Sequence "): | |
# e.g. | |
# SQ Sequence 219 BP; 82 A; 48 C; 33 G; 45 T; 11 other; | |
# | |
# Or, EMBL-bank patent, e.g. | |
# SQ Sequence 465 AA; 3963407aa91d3a0d622fec679a4524e0; MD5; | |
self._feed_seq_length( | |
consumer, line[14:].rstrip().rstrip(";").split(";", 1)[0] | |
) | |
# TODO - Record the checksum etc? | |
return | |
except StopIteration: | |
raise ValueError("Problem in misc lines before sequence") from None | |
class _ImgtScanner(EmblScanner):
    """For extracting chunks of information in IMGT (EMBL like) files (PRIVATE).

    IMGT files are like EMBL files but in order to allow longer feature types
    the features should be indented by 25 characters not 21 characters. In
    practice the IMGT flat files tend to use either 21 or 25 characters, so we
    must cope with both.

    This is private to encourage use of Bio.SeqIO rather than Bio.GenBank.
    """

    FEATURE_START_MARKERS = [
        "FH Key Location/Qualifiers",
        "FH Key Location/Qualifiers (from EMBL)",
        "FH Key Location/Qualifiers",
        "FH",
    ]

    def _feed_first_line(self, consumer, line):
        """Parse the ID line, coping with both old and new IMGT styles (PRIVATE)."""
        assert line[: self.HEADER_WIDTH].rstrip() == "ID"
        if line[self.HEADER_WIDTH :].count(";") != 5:
            # Assume its an older EMBL-like line,
            return EmblScanner._feed_first_line(self, consumer, line)
        # Otherwise assume its the new (circa 2016) IMGT style
        # as used in the IPD-IMGT/HLA Database
        #
        # https://github.com/ANHIG/IMGTHLA/
        #
        # The key changes post 3.16 are the addition of an SV value
        # to the ID line, these additions should make the format more
        # similar to the ENA style.
        #
        # ID HLA00001 standard; DNA; HUM; 3503 BP.
        #
        # becomes
        #
        # ID HLA00001; SV 1; standard; DNA; HUM; 3503 BP.
        fields = [data.strip() for data in line[self.HEADER_WIDTH :].strip().split(";")]
        assert len(fields) == 6
        # The six tokens represent:
        #  0. Primary accession number (eg 'HLA00001')
        #  1. Sequence version number (eg 'SV 1')
        #  2. ??? eg 'standard'
        #  3. Molecule type (e.g. 'DNA')
        #  4. Taxonomic division (e.g. 'HUM')
        #  5. Sequence length (e.g. '3503 BP.')
        consumer.locus(fields[0])
        # See TODO on the EMBL _feed_first_line_new about version field
        version_parts = fields[1].split()
        if (
            len(version_parts) == 2
            and version_parts[0] == "SV"
            and version_parts[1].isdigit()
        ):
            consumer.version_suffix(version_parts[1])
        consumer.residue_type(fields[3])
        if "circular" in fields[3]:
            consumer.topology("circular")
            consumer.molecule_type(fields[3].replace("circular", "").strip())
        elif "linear" in fields[3]:
            consumer.topology("linear")
            consumer.molecule_type(fields[3].replace("linear", "").strip())
        else:
            consumer.molecule_type(fields[3].strip())
        consumer.data_file_division(fields[4])
        self._feed_seq_length(consumer, fields[5])

    def parse_features(self, skip=False):
        """Return list of tuples for the features (if present).

        Each feature is returned as a tuple (key, location, qualifiers)
        where key and location are strings (e.g. "CDS" and
        "complement(join(490883..490885,1..879))") while qualifiers
        is a list of two string tuples (feature qualifier keys and values).

        Assumes you have already read to the start of the features table.
        """
        if self.line.rstrip() not in self.FEATURE_START_MARKERS:
            if self.debug:
                print("Didn't find any feature table")
            return []

        while self.line.rstrip() in self.FEATURE_START_MARKERS:
            self.line = self.handle.readline()

        bad_position_re = re.compile(r"([0-9]+)>")

        features = []
        line = self.line
        while True:
            if not line:
                raise ValueError("Premature end of line during features table")
            if line[: self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS:
                if self.debug:
                    print("Found start of sequence")
                break
            line = line.rstrip()
            if line == "//":
                raise ValueError("Premature end of features table, marker '//' found")
            if line in self.FEATURE_END_MARKERS:
                if self.debug:
                    print("Found end of features")
                line = self.handle.readline()
                break
            if line[2 : self.FEATURE_QUALIFIER_INDENT].strip() == "":
                # This is an empty feature line between qualifiers. Empty
                # feature lines within qualifiers are handled below (ignored).
                line = self.handle.readline()
                continue
            if skip:
                line = self.handle.readline()
                while (
                    line[: self.FEATURE_QUALIFIER_INDENT]
                    == self.FEATURE_QUALIFIER_SPACER
                ):
                    line = self.handle.readline()
            else:
                assert line[:2] == "FT"
                try:
                    feature_key, location_start = line[2:].strip().split()
                except ValueError:
                    # e.g. "FT TRANSMEMBRANE-REGION2163..2240\n"
                    # Assume indent of 25 as per IMGT spec, with the location
                    # start in column 26 (one-based).
                    feature_key = line[2:25].strip()
                    location_start = line[25:].strip()
                feature_lines = [location_start]
                line = self.handle.readline()
                while (
                    line[: self.FEATURE_QUALIFIER_INDENT]
                    == self.FEATURE_QUALIFIER_SPACER
                    or line.rstrip() == ""
                ):  # cope with blank lines in the midst of a feature
                    if not line.rstrip():
                        # Genuinely blank line - previously this fell through
                        # to the FT assertion below and raised AssertionError
                        # despite the comment above promising blank lines are
                        # tolerated. Skip it; break on EOF (empty string) so
                        # we cannot loop forever.
                        if not line:
                            break
                        line = self.handle.readline()
                        continue
                    # Use strip to remove any harmless trailing white space AND
                    # leading white space (copes with 21 or 26 indents and
                    # other variants)
                    assert line[:2] == "FT"
                    feature_lines.append(line[self.FEATURE_QUALIFIER_INDENT :].strip())
                    line = self.handle.readline()
                feature_key, location, qualifiers = self.parse_feature(
                    feature_key, feature_lines
                )
                # Try to handle known problems with IMGT locations here:
                if ">" in location:
                    # Nasty hack for common IMGT bug, should be >123 not 123>
                    # in a location string. At least here the meaning is clear,
                    # and since it is so common I don't want to issue a warning
                    # warnings.warn("Feature location %s is invalid, "
                    #               "moving greater than sign before position"
                    #               % location, BiopythonParserWarning)
                    location = bad_position_re.sub(r">\1", location)
                features.append((feature_key, location, qualifiers))
        self.line = line
        return features
class GenBankScanner(InsdcScanner):
    """For extracting chunks of information in GenBank files."""

    # Prefix marking the first line of every GenBank record.
    RECORD_START = "LOCUS "
    # Width of the keyword column on header lines (e.g. "DEFINITION").
    HEADER_WIDTH = 12
    FEATURE_START_MARKERS = ["FEATURES Location/Qualifiers", "FEATURES"]
    # GenBank has no explicit end-of-features marker (unlike EMBL's "XX").
    FEATURE_END_MARKERS = []
    # Column (zero-based) where feature locations/qualifiers begin.
    FEATURE_QUALIFIER_INDENT = 21
    FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT
    # Keywords whose presence marks the end of the feature table / start of
    # the sequence-related footer section.
    SEQUENCE_HEADERS = [
        "CONTIG",
        "ORIGIN",
        "BASE COUNT",
        "WGS",
        "TSA",
        "TLS",
    ]  # trailing spaces removed
    GENBANK_INDENT = HEADER_WIDTH
    GENBANK_SPACER = " " * GENBANK_INDENT
    # Markers delimiting "structured comments" inside COMMENT blocks, see
    # http://www.ncbi.nlm.nih.gov/genbank/structuredcomment
    STRUCTURED_COMMENT_START = "-START##"
    STRUCTURED_COMMENT_END = "-END##"
    STRUCTURED_COMMENT_DELIM = " :: "
def parse_footer(self): | |
"""Return a tuple containing a list of any misc strings, and the sequence.""" | |
if self.line[: self.HEADER_WIDTH].rstrip() not in self.SEQUENCE_HEADERS: | |
raise ValueError(f"Footer format unexpected: '{self.line}'") | |
misc_lines = [] | |
while ( | |
self.line[: self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS | |
or self.line[: self.HEADER_WIDTH] == " " * self.HEADER_WIDTH | |
or "WGS" == self.line[:3] | |
): | |
misc_lines.append(self.line.rstrip()) | |
self.line = self.handle.readline() | |
if not self.line: | |
raise ValueError("Premature end of file") | |
if self.line[: self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS: | |
raise ValueError(f"Eh? '{self.line}'") | |
# Now just consume the sequence lines until reach the // marker | |
# or a CONTIG line | |
seq_lines = [] | |
line = self.line | |
while True: | |
if not line: | |
warnings.warn( | |
"Premature end of file in sequence data", BiopythonParserWarning | |
) | |
line = "//" | |
break | |
line = line.rstrip() | |
if not line: | |
warnings.warn("Blank line in sequence data", BiopythonParserWarning) | |
line = self.handle.readline() | |
continue | |
if line == "//": | |
break | |
if line.startswith("CONTIG"): | |
break | |
if len(line) > 9 and line[9:10] != " ": | |
# Some broken programs indent the sequence by one space too many | |
# so try to get rid of that and test again. | |
warnings.warn( | |
"Invalid indentation for sequence line", BiopythonParserWarning | |
) | |
line = line[1:] | |
if len(line) > 9 and line[9:10] != " ": | |
raise ValueError(f"Sequence line mal-formed, '{line}'") | |
seq_lines.append(line[10:]) # remove spaces later | |
line = self.handle.readline() | |
self.line = line | |
return misc_lines, "".join(seq_lines).replace(" ", "") | |
def _feed_first_line(self, consumer, line): | |
"""Scan over and parse GenBank LOCUS line (PRIVATE). | |
This must cope with several variants, primarily the old and new column | |
based standards from GenBank. Additionally EnsEMBL produces GenBank | |
files where the LOCUS line is space separated rather that following | |
the column based layout. | |
We also try to cope with GenBank like files with partial LOCUS lines. | |
As of release 229.0, the columns are no longer strictly in a given | |
position. See GenBank format release notes: | |
"Historically, the LOCUS line has had a fixed length and its | |
elements have been presented at specific column positions... | |
But with the anticipated increases in the lengths of accession | |
numbers, and the advent of sequences that are gigabases long, | |
maintaining the column positions will not always be possible and | |
the overall length of the LOCUS line could exceed 79 characters." | |
""" | |
##################################### | |
# LOCUS line # | |
##################################### | |
if line[0 : self.GENBANK_INDENT] != "LOCUS ": | |
raise ValueError("LOCUS line does not start correctly:\n" + line) | |
# Have to break up the locus line, and handle the different bits of it. | |
# There are at least two different versions of the locus line... | |
if line[29:33] in [" bp ", " aa ", " rc "] and line[55:62] == " ": | |
# Old... note we insist on the 55:62 being empty to avoid trying | |
# to parse space separated LOCUS lines from Ensembl etc, see below. | |
# | |
# Positions Contents | |
# --------- -------- | |
# 00:06 LOCUS | |
# 06:12 spaces | |
# 12:?? Locus name | |
# ??:?? space | |
# ??:29 Length of sequence, right-justified | |
# 29:33 space, bp, space | |
# 33:41 strand type / molecule type, e.g. DNA | |
# 41:42 space | |
# 42:51 Blank (implies linear), linear or circular | |
# 51:52 space | |
# 52:55 The division code (e.g. BCT, VRL, INV) | |
# 55:62 space | |
# 62:73 Date, in the form dd-MMM-yyyy (e.g., 15-MAR-1991) | |
# | |
# assert line[29:33] in [' bp ', ' aa ',' rc '] , \ | |
# 'LOCUS line does not contain size units at expected position:\n' + line | |
if line[41:42] != " ": | |
raise ValueError( | |
"LOCUS line does not contain space at position 42:\n" + line | |
) | |
if line[42:51].strip() not in ["", "linear", "circular"]: | |
raise ValueError( | |
"LOCUS line does not contain valid entry " | |
"(linear, circular, ...):\n" + line | |
) | |
if line[51:52] != " ": | |
raise ValueError( | |
"LOCUS line does not contain space at position 52:\n" + line | |
) | |
# if line[55:62] != ' ': | |
# raise ValueError('LOCUS line does not contain spaces from position 56 to 62:\n' + line) | |
if line[62:73].strip(): | |
if line[64:65] != "-": | |
raise ValueError( | |
"LOCUS line does not contain - at " | |
"position 65 in date:\n" + line | |
) | |
if line[68:69] != "-": | |
raise ValueError( | |
"LOCUS line does not contain - at " | |
"position 69 in date:\n" + line | |
) | |
name_and_length_str = line[self.GENBANK_INDENT : 29] | |
while " " in name_and_length_str: | |
name_and_length_str = name_and_length_str.replace(" ", " ") | |
name_and_length = name_and_length_str.split(" ") | |
if len(name_and_length) > 2: | |
raise ValueError( | |
"Cannot parse the name and length in the LOCUS line:\n" + line | |
) | |
if len(name_and_length) == 1: | |
raise ValueError("Name and length collide in the LOCUS line:\n" + line) | |
# Should be possible to split them based on position, if | |
# a clear definition of the standard exists THAT AGREES with | |
# existing files. | |
name, length = name_and_length | |
if len(name) > 16: | |
# As long as the sequence is short, can steal its leading spaces | |
# to extend the name over the current 16 character limit. | |
# However, that deserves a warning as it is out of spec. | |
warnings.warn( | |
"GenBank LOCUS line identifier over 16 characters", | |
BiopythonParserWarning, | |
) | |
consumer.locus(name) | |
consumer.size(length) | |
# consumer.residue_type(line[33:41].strip()) | |
if line[33:51].strip() == "" and line[29:33] == " aa ": | |
# Amino acids -> protein (even if there is no residue type given) | |
consumer.residue_type("PROTEIN") | |
else: | |
consumer.residue_type(line[33:51].strip()) | |
consumer.molecule_type(line[33:41].strip()) | |
consumer.topology(line[42:51].strip()) | |
consumer.data_file_division(line[52:55]) | |
if line[62:73].strip(): | |
consumer.date(line[62:73]) | |
elif line[40:44] in [" bp ", " aa ", " rc "] and line[54:64].strip() in [ | |
"", | |
"linear", | |
"circular", | |
]: | |
# New... linear/circular/big blank test should avoid EnsEMBL style | |
# LOCUS line being treated like a proper column based LOCUS line. | |
# | |
# Positions Contents | |
# --------- -------- | |
# 00:06 LOCUS | |
# 06:12 spaces | |
# 12:?? Locus name | |
# ??:?? space | |
# ??:40 Length of sequence, right-justified | |
# 40:44 space, bp, space | |
# 44:47 Blank, ss-, ds-, ms- | |
# 47:54 Blank, DNA, RNA, tRNA, mRNA, uRNA, snRNA, cDNA | |
# 54:55 space | |
# 55:63 Blank (implies linear), linear or circular | |
# 63:64 space | |
# 64:67 The division code (e.g. BCT, VRL, INV) | |
# 67:68 space | |
# 68:79 Date, in the form dd-MMM-yyyy (e.g., 15-MAR-1991) | |
# | |
if len(line) < 79: | |
# JBEI genbank files seem to miss a division code and date | |
# See issue #1656 e.g. | |
# LOCUS pEH010 5743 bp DNA circular | |
warnings.warn( | |
f"Truncated LOCUS line found - is this correct?\n:{line!r}", | |
BiopythonParserWarning, | |
) | |
padding_len = 79 - len(line) | |
padding = " " * padding_len | |
line += padding | |
if line[40:44] not in [" bp ", " aa ", " rc "]: | |
raise ValueError( | |
"LOCUS line does not contain size units at " | |
"expected position:\n" + line | |
) | |
if line[44:47] not in [" ", "ss-", "ds-", "ms-"]: | |
raise ValueError( | |
"LOCUS line does not have valid strand " | |
"type (Single stranded, ...):\n" + line | |
) | |
if not ( | |
line[47:54].strip() == "" | |
or "DNA" in line[47:54].strip().upper() | |
or "RNA" in line[47:54].strip().upper() | |
): | |
raise ValueError( | |
"LOCUS line does not contain valid " | |
"sequence type (DNA, RNA, ...):\n" + line | |
) | |
if line[54:55] != " ": | |
raise ValueError( | |
"LOCUS line does not contain space at position 55:\n" + line | |
) | |
if line[55:63].strip() not in ["", "linear", "circular"]: | |
raise ValueError( | |
"LOCUS line does not contain valid " | |
"entry (linear, circular, ...):\n" + line | |
) | |
if line[63:64] != " ": | |
raise ValueError( | |
"LOCUS line does not contain space at position 64:\n" + line | |
) | |
if line[67:68] != " ": | |
raise ValueError( | |
"LOCUS line does not contain space at position 68:\n" + line | |
) | |
if line[68:79].strip(): | |
if line[70:71] != "-": | |
raise ValueError( | |
"LOCUS line does not contain - at " | |
"position 71 in date:\n" + line | |
) | |
if line[74:75] != "-": | |
raise ValueError( | |
"LOCUS line does not contain - at " | |
"position 75 in date:\n" + line | |
) | |
name_and_length_str = line[self.GENBANK_INDENT : 40] | |
while " " in name_and_length_str: | |
name_and_length_str = name_and_length_str.replace(" ", " ") | |
name_and_length = name_and_length_str.split(" ") | |
if len(name_and_length) > 2: | |
raise ValueError( | |
"Cannot parse the name and length in the LOCUS line:\n" + line | |
) | |
if len(name_and_length) == 1: | |
raise ValueError("Name and length collide in the LOCUS line:\n" + line) | |
# Should be possible to split them based on position, if | |
# a clear definition of the stand exists THAT AGREES with | |
# existing files. | |
consumer.locus(name_and_length[0]) | |
consumer.size(name_and_length[1]) | |
if line[44:54].strip() == "" and line[40:44] == " aa ": | |
# Amino acids -> protein (even if there is no residue type given) | |
consumer.residue_type(("PROTEIN " + line[54:63]).strip()) | |
else: | |
consumer.residue_type(line[44:63].strip()) | |
consumer.molecule_type(line[44:54].strip()) | |
consumer.topology(line[55:63].strip()) | |
if line[64:76].strip(): | |
consumer.data_file_division(line[64:67]) | |
if line[68:79].strip(): | |
consumer.date(line[68:79]) | |
elif line[self.GENBANK_INDENT :].strip().count(" ") == 0: | |
# Truncated LOCUS line, as produced by some EMBOSS tools - see bug 1762 | |
# | |
# e.g. | |
# | |
# "LOCUS U00096" | |
# | |
# rather than: | |
# | |
# "LOCUS U00096 4639675 bp DNA circular BCT" | |
# | |
# Positions Contents | |
# --------- -------- | |
# 00:06 LOCUS | |
# 06:12 spaces | |
# 12:?? Locus name | |
if line[self.GENBANK_INDENT :].strip() != "": | |
consumer.locus(line[self.GENBANK_INDENT :].strip()) | |
else: | |
# Must just have just "LOCUS ", is this even legitimate? | |
# We should be able to continue parsing... we need real world testcases! | |
warnings.warn( | |
f"Minimal LOCUS line found - is this correct?\n:{line!r}", | |
BiopythonParserWarning, | |
) | |
elif ( | |
len(line.split()) == 8 | |
and line.split()[3] in ("aa", "bp") | |
and line.split()[5] in ("linear", "circular") | |
): | |
# Cope with invalidly spaced GenBank LOCUS lines like | |
# LOCUS AB070938 6497 bp DNA linear BCT 11-OCT-2001 | |
# This will also cope with extra long accession numbers and | |
# sequence lengths | |
splitline = line.split() | |
consumer.locus(splitline[1]) | |
# Provide descriptive error message if the sequence is too long | |
# for python to handle | |
if int(splitline[2]) > sys.maxsize: | |
raise ValueError( | |
"Tried to load a sequence with a length %s, " | |
"your installation of python can only load " | |
"sesquences of length %s" % (splitline[2], sys.maxsize) | |
) | |
else: | |
consumer.size(splitline[2]) | |
consumer.residue_type(splitline[4]) | |
consumer.topology(splitline[5]) | |
consumer.data_file_division(splitline[6]) | |
consumer.date(splitline[7]) | |
if len(line) < 80: | |
warnings.warn( | |
"Attempting to parse malformed locus line:\n%r\n" | |
"Found locus %r size %r residue_type %r\n" | |
"Some fields may be wrong." | |
% (line, splitline[1], splitline[2], splitline[4]), | |
BiopythonParserWarning, | |
) | |
elif len(line.split()) == 7 and line.split()[3] in ["aa", "bp"]: | |
# Cope with EnsEMBL genbank files which use space separation rather | |
# than the expected column based layout. e.g. | |
# LOCUS HG531_PATCH 1000000 bp DNA HTG 18-JUN-2011 | |
# LOCUS HG531_PATCH 759984 bp DNA HTG 18-JUN-2011 | |
# LOCUS HG506_HG1000_1_PATCH 814959 bp DNA HTG 18-JUN-2011 | |
# LOCUS HG506_HG1000_1_PATCH 1219964 bp DNA HTG 18-JUN-2011 | |
# Notice that the 'bp' can occur in the position expected by either | |
# the old or the new fixed column standards (parsed above). | |
splitline = line.split() | |
consumer.locus(splitline[1]) | |
consumer.size(splitline[2]) | |
consumer.residue_type(splitline[4]) | |
consumer.data_file_division(splitline[5]) | |
consumer.date(splitline[6]) | |
elif len(line.split()) >= 4 and line.split()[3] in ["aa", "bp"]: | |
# Cope with EMBOSS seqret output where it seems the locus id can cause | |
# the other fields to overflow. We just IGNORE the other fields! | |
warnings.warn( | |
f"Malformed LOCUS line found - is this correct?\n:{line!r}", | |
BiopythonParserWarning, | |
) | |
consumer.locus(line.split()[1]) | |
consumer.size(line.split()[2]) | |
elif len(line.split()) >= 4 and line.split()[-1] in ["aa", "bp"]: | |
# Cope with pseudo-GenBank files like this: | |
# "LOCUS RNA5 complete 1718 bp" | |
# Treat everything between LOCUS and the size as the identifier. | |
warnings.warn( | |
f"Malformed LOCUS line found - is this correct?\n:{line!r}", | |
BiopythonParserWarning, | |
) | |
consumer.locus(line[5:].rsplit(None, 2)[0].strip()) | |
consumer.size(line.split()[-2]) | |
else: | |
raise ValueError("Did not recognise the LOCUS line layout:\n" + line) | |
def _feed_header_lines(self, consumer, lines): | |
# Following dictionary maps GenBank lines to the associated | |
# consumer methods - the special cases like LOCUS where one | |
# genbank line triggers several consumer calls have to be | |
# handled individually. | |
consumer_dict = { | |
"DEFINITION": "definition", | |
"ACCESSION": "accession", | |
"NID": "nid", | |
"PID": "pid", | |
"DBSOURCE": "db_source", | |
"KEYWORDS": "keywords", | |
"SEGMENT": "segment", | |
"SOURCE": "source", | |
"AUTHORS": "authors", | |
"CONSRTM": "consrtm", | |
"PROJECT": "project", | |
"TITLE": "title", | |
"JOURNAL": "journal", | |
"MEDLINE": "medline_id", | |
"PUBMED": "pubmed_id", | |
"REMARK": "remark", | |
} | |
# We have to handle the following specially: | |
# ORIGIN (locus, size, residue_type, data_file_division and date) | |
# COMMENT (comment) | |
# VERSION (version and gi) | |
# DBLINK (database links like projects, newlines important) | |
# REFERENCE (eference_num and reference_bases) | |
# ORGANISM (organism and taxonomy) | |
lines = [_f for _f in lines if _f] | |
lines.append("") # helps avoid getting StopIteration all the time | |
line_iter = iter(lines) | |
try: | |
line = next(line_iter) | |
while True: | |
if not line: | |
break | |
line_type = line[: self.GENBANK_INDENT].strip() | |
data = line[self.GENBANK_INDENT :].strip() | |
if line_type == "VERSION": | |
# Need to call consumer.version(), and maybe also consumer.gi() as well. | |
# e.g. | |
# VERSION AC007323.5 GI:6587720 | |
while " " in data: | |
data = data.replace(" ", " ") | |
if " GI:" not in data: | |
consumer.version(data) | |
else: | |
if self.debug: | |
print( | |
"Version [" | |
+ data.split(" GI:")[0] | |
+ "], gi [" | |
+ data.split(" GI:")[1] | |
+ "]" | |
) | |
consumer.version(data.split(" GI:")[0]) | |
consumer.gi(data.split(" GI:")[1]) | |
# Read in the next line! | |
line = next(line_iter) | |
elif line_type == "DBLINK": | |
# Need to call consumer.dblink() for each line, e.g. | |
# DBLINK Project: 57779 | |
# BioProject: PRJNA57779 | |
consumer.dblink(data.strip()) | |
# Read in the next line, and see if its more of the DBLINK section: | |
while True: | |
line = next(line_iter) | |
if line[: self.GENBANK_INDENT] == self.GENBANK_SPACER: | |
# Add this continuation to the data string | |
consumer.dblink(line[self.GENBANK_INDENT :].strip()) | |
else: | |
# End of the DBLINK, leave this text in the variable "line" | |
break | |
elif line_type == "REFERENCE": | |
if self.debug > 1: | |
print("Found reference [" + data + "]") | |
# Need to call consumer.reference_num() and consumer.reference_bases() | |
# e.g. | |
# REFERENCE 1 (bases 1 to 86436) | |
# | |
# Note that this can be multiline, see Bug 1968, e.g. | |
# | |
# REFERENCE 42 (bases 1517 to 1696; 3932 to 4112; 17880 to 17975; 21142 to | |
# 28259) | |
# | |
# For such cases we will call the consumer once only. | |
data = data.strip() | |
# Read in the next line, and see if its more of the reference: | |
while True: | |
line = next(line_iter) | |
if line[: self.GENBANK_INDENT] == self.GENBANK_SPACER: | |
# Add this continuation to the data string | |
data += " " + line[self.GENBANK_INDENT :] | |
if self.debug > 1: | |
print("Extended reference text [" + data + "]") | |
else: | |
# End of the reference, leave this text in the variable "line" | |
break | |
# We now have all the reference line(s) stored in a string, data, | |
# which we pass to the consumer | |
while " " in data: | |
data = data.replace(" ", " ") | |
if " " not in data: | |
if self.debug > 2: | |
print('Reference number "' + data + '"') | |
consumer.reference_num(data) | |
else: | |
if self.debug > 2: | |
print( | |
'Reference number "' | |
+ data[: data.find(" ")] | |
+ '", "' | |
+ data[data.find(" ") + 1 :] | |
+ '"' | |
) | |
consumer.reference_num(data[: data.find(" ")]) | |
consumer.reference_bases(data[data.find(" ") + 1 :]) | |
elif line_type == "ORGANISM": | |
# Typically the first line is the organism, and subsequent lines | |
# are the taxonomy lineage. However, given longer and longer | |
# species names (as more and more strains and sub strains get | |
# sequenced) the oragnism name can now get wrapped onto multiple | |
# lines. The NCBI say we have to recognise the lineage line by | |
# the presence of semi-colon delimited entries. In the long term, | |
# they are considering adding a new keyword (e.g. LINEAGE). | |
# See Bug 2591 for details. | |
organism_data = data | |
lineage_data = "" | |
while True: | |
line = next(line_iter) | |
if line[0 : self.GENBANK_INDENT] == self.GENBANK_SPACER: | |
if lineage_data or ";" in line: | |
lineage_data += " " + line[self.GENBANK_INDENT :] | |
elif line[self.GENBANK_INDENT :].strip() == ".": | |
# No lineage data, just . place holder | |
pass | |
else: | |
organism_data += ( | |
" " + line[self.GENBANK_INDENT :].strip() | |
) | |
else: | |
# End of organism and taxonomy | |
break | |
consumer.organism(organism_data) | |
if lineage_data.strip() == "" and self.debug > 1: | |
print("Taxonomy line(s) missing or blank") | |
consumer.taxonomy(lineage_data.strip()) | |
del organism_data, lineage_data | |
elif line_type == "COMMENT": | |
# A COMMENT can either be plain text or tabular (Structured Comment), | |
# or contain both. Multi-line comments are common. The code calls | |
# consumer.comment() once with a list where each entry | |
# is a line. If there's a structured comment consumer.structured_comment() | |
# is called with a dict of dicts where the secondary key/value pairs are | |
# the same as those in the structured comment table. The primary key is | |
# the title or header of the table (e.g. Assembly-Data, FluData). See | |
# http://www.ncbi.nlm.nih.gov/genbank/structuredcomment | |
# for more information on Structured Comments. | |
data = line[self.GENBANK_INDENT :] | |
if self.debug > 1: | |
print("Found comment") | |
comment_list = [] | |
structured_comment_dict = defaultdict(dict) | |
regex = rf"([^#]+){self.STRUCTURED_COMMENT_START}$" | |
structured_comment_key = re.search(regex, data) | |
if structured_comment_key is not None: | |
structured_comment_key = structured_comment_key.group(1) | |
if self.debug > 1: | |
print("Found Structured Comment") | |
else: | |
comment_list.append(data) | |
while True: | |
line = next(line_iter) | |
data = line[self.GENBANK_INDENT :] | |
if line[0 : self.GENBANK_INDENT] == self.GENBANK_SPACER: | |
if self.STRUCTURED_COMMENT_START in data: | |
regex = r"([^#]+){}$".format( | |
self.STRUCTURED_COMMENT_START | |
) | |
structured_comment_key = re.search(regex, data) | |
if structured_comment_key is not None: | |
structured_comment_key = ( | |
structured_comment_key.group(1) | |
) | |
else: | |
comment_list.append(data) | |
elif ( | |
structured_comment_key is not None | |
and self.STRUCTURED_COMMENT_DELIM in data | |
): | |
match = re.search( | |
r"(.+?)\s*{}\s*(.+)".format( | |
self.STRUCTURED_COMMENT_DELIM | |
), | |
data, | |
) | |
structured_comment_dict[structured_comment_key][ | |
match.group(1) | |
] = match.group(2) | |
if self.debug > 2: | |
print( | |
"Structured Comment continuation [" + data + "]" | |
) | |
elif ( | |
structured_comment_key is not None | |
and self.STRUCTURED_COMMENT_END not in data | |
): | |
# Don't die on a malformed comment, just warn and carry on | |
if ( | |
structured_comment_key | |
not in structured_comment_dict | |
): | |
warnings.warn( | |
"Structured comment not parsed for %s. Is it malformed?" | |
% consumer.data.name, | |
BiopythonParserWarning, | |
) | |
continue | |
# The current structured comment has a multiline value | |
previous_value_line = structured_comment_dict[ | |
structured_comment_key | |
][match.group(1)] | |
structured_comment_dict[structured_comment_key][ | |
match.group(1) | |
] = (previous_value_line + " " + line.strip()) | |
elif self.STRUCTURED_COMMENT_END in data: | |
# End of structured comment | |
structured_comment_key = None | |
else: | |
comment_list.append(data) | |
if self.debug > 2: | |
print("Comment continuation [" + data + "]") | |
else: | |
# End of the comment | |
break | |
if comment_list: | |
consumer.comment(comment_list) | |
if structured_comment_dict: | |
consumer.structured_comment(structured_comment_dict) | |
del comment_list, structured_comment_key, structured_comment_dict | |
elif line_type in consumer_dict: | |
# It's a semi-automatic entry! | |
# Now, this may be a multi line entry... | |
while True: | |
line = next(line_iter) | |
if line[0 : self.GENBANK_INDENT] == self.GENBANK_SPACER: | |
data += " " + line[self.GENBANK_INDENT :] | |
else: | |
# We now have all the data for this entry: | |
# The DEFINITION field must ends with a period | |
# # see ftp://ftp.ncbi.nih.gov/genbank/gbrel.txt [3.4.5] | |
# and discussion https://github.com/biopython/biopython/pull/616 | |
# We consider this period belong to the syntax, not to the data | |
# So remove it if it exist | |
if line_type == "DEFINITION" and data.endswith("."): | |
data = data[:-1] | |
getattr(consumer, consumer_dict[line_type])(data) | |
# End of continuation - return to top of loop! | |
break | |
else: | |
if self.debug: | |
print("Ignoring GenBank header line:\n" % line) | |
# Read in next line | |
line = next(line_iter) | |
except StopIteration: | |
raise ValueError("Problem in header") from None | |
def _feed_misc_lines(self, consumer, lines): | |
# Deals with a few misc lines between the features and the sequence | |
lines.append("") | |
line_iter = iter(lines) | |
try: | |
for line in line_iter: | |
if line.startswith("BASE COUNT"): | |
line = line[10:].strip() | |
if line: | |
if self.debug: | |
print("base_count = " + line) | |
consumer.base_count(line) | |
if line.startswith("ORIGIN"): | |
line = line[6:].strip() | |
if line: | |
if self.debug: | |
print("origin_name = " + line) | |
consumer.origin_name(line) | |
if line.startswith("TLS "): | |
line = line[3:].strip() | |
consumer.tls(line) | |
if line.startswith("TSA "): | |
line = line[3:].strip() | |
consumer.tsa(line) | |
if line.startswith("WGS "): | |
line = line[3:].strip() | |
consumer.wgs(line) | |
if line.startswith("WGS_SCAFLD"): | |
line = line[10:].strip() | |
consumer.add_wgs_scafld(line) | |
if line.startswith("CONTIG"): | |
line = line[6:].strip() | |
contig_location = line | |
while True: | |
line = next(line_iter) | |
if not line: | |
break | |
elif line[: self.GENBANK_INDENT] == self.GENBANK_SPACER: | |
# Don't need to preserve the whitespace here. | |
contig_location += line[self.GENBANK_INDENT :].rstrip() | |
elif line.startswith("ORIGIN"): | |
# Strange, seen this in GenPept files via Entrez gbwithparts | |
line = line[6:].strip() | |
if line: | |
consumer.origin_name(line) | |
break | |
else: | |
raise ValueError( | |
"Expected CONTIG continuation line, got:\n" + line | |
) | |
consumer.contig_location(contig_location) | |
return | |
except StopIteration: | |
raise ValueError("Problem in misc lines before sequence") from None | |