Spaces:
No application file
No application file
# Copyright 2000 by Jeffrey Chang, Brad Chapman. All rights reserved. | |
# Copyright 2006-2017 by Peter Cock. All rights reserved. | |
# | |
# This code is part of the Biopython distribution and governed by its | |
# license. Please see the LICENSE file that should have been included | |
# as part of this package. | |
"""Code to work with GenBank formatted files. | |
Rather than using Bio.GenBank, you are now encouraged to use Bio.SeqIO with | |
the "genbank" or "embl" format names to parse GenBank or EMBL files into | |
SeqRecord and SeqFeature objects (see the Biopython tutorial for details). | |
Using Bio.GenBank directly to parse GenBank files is only useful if you want | |
to obtain GenBank-specific Record objects, which is a much closer | |
representation to the raw file contents than the SeqRecord alternative from | |
the FeatureParser (used in Bio.SeqIO). | |
To use the Bio.GenBank parser, there are two helper functions: | |
- read Parse a handle containing a single GenBank record | |
as Bio.GenBank specific Record objects. | |
- parse Iterate over a handle containing multiple GenBank | |
records as Bio.GenBank specific Record objects. | |
The following internal classes are not intended for direct use and may | |
be deprecated in a future release. | |
Classes: | |
- Iterator Iterate through a file of GenBank entries | |
- FeatureParser Parse GenBank data in SeqRecord and SeqFeature objects. | |
- RecordParser Parse GenBank data into a Record object. | |
Exceptions: | |
- ParserFailureError Exception indicating a failure in the parser (ie. | |
scanner or consumer) | |
""" | |
import re | |
import warnings | |
from Bio import BiopythonParserWarning | |
from Bio.Seq import Seq | |
from Bio.SeqFeature import Location | |
from Bio.SeqFeature import Reference | |
from Bio.SeqFeature import SeqFeature | |
from Bio.SeqFeature import SimpleLocation | |
from Bio.SeqFeature import LocationParserError | |
# other Bio.GenBank stuff | |
from .utils import FeatureValueCleaner | |
from .Scanner import GenBankScanner | |
# Constants used to parse GenBank header lines | |
GENBANK_INDENT = 12 | |
GENBANK_SPACER = " " * GENBANK_INDENT | |
# Constants for parsing GenBank feature lines | |
FEATURE_KEY_INDENT = 5 | |
FEATURE_QUALIFIER_INDENT = 21 | |
FEATURE_KEY_SPACER = " " * FEATURE_KEY_INDENT | |
FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT | |
class Iterator: | |
"""Iterator interface to move over a file of GenBank entries one at a time (OBSOLETE). | |
This class is likely to be deprecated in a future release of Biopython. | |
Please use Bio.SeqIO.parse(..., format="gb") or Bio.GenBank.parse(...) | |
for SeqRecord and GenBank specific Record objects respectively instead. | |
""" | |
def __init__(self, handle, parser=None): | |
"""Initialize the iterator. | |
Arguments: | |
- handle - A handle with GenBank entries to iterate through. | |
- parser - An optional parser to pass the entries through before | |
returning them. If None, then the raw entry will be returned. | |
""" | |
self.handle = handle | |
self._parser = parser | |
def __next__(self): | |
"""Return the next GenBank record from the handle. | |
Will return None if we ran out of records. | |
""" | |
if self._parser is None: | |
lines = [] | |
while True: | |
line = self.handle.readline() | |
if not line: | |
return None # Premature end of file? | |
lines.append(line) | |
if line.rstrip() == "//": | |
break | |
return "".join(lines) | |
try: | |
return self._parser.parse(self.handle) | |
except StopIteration: | |
return None | |
def __iter__(self): | |
"""Iterate over the records.""" | |
return iter(self.__next__, None) | |
class ParserFailureError(ValueError): | |
"""Failure caused by some kind of problem in the parser.""" | |
pass | |
_cleaner = FeatureValueCleaner() | |
class FeatureParser: | |
"""Parse GenBank files into Seq + Feature objects (OBSOLETE). | |
Direct use of this class is discouraged, and may be deprecated in | |
a future release of Biopython. | |
Please use Bio.SeqIO.parse(...) or Bio.SeqIO.read(...) instead. | |
""" | |
def __init__(self, debug_level=0, use_fuzziness=1, feature_cleaner=None): | |
"""Initialize a GenBank parser and Feature consumer. | |
Arguments: | |
- debug_level - An optional argument that species the amount of | |
debugging information the parser should spit out. By default we have | |
no debugging info (the fastest way to do things), but if you want | |
you can set this as high as two and see exactly where a parse fails. | |
- use_fuzziness - Specify whether or not to use fuzzy representations. | |
The default is 1 (use fuzziness). | |
- feature_cleaner - A class which will be used to clean out the | |
values of features. This class must implement the function | |
clean_value. GenBank.utils has a "standard" cleaner class, which | |
is used by default. | |
""" | |
self._scanner = GenBankScanner(debug_level) | |
self.use_fuzziness = use_fuzziness | |
if feature_cleaner: | |
self._cleaner = feature_cleaner | |
else: | |
self._cleaner = _cleaner # default | |
def parse(self, handle): | |
"""Parse the specified handle.""" | |
_consumer = _FeatureConsumer(self.use_fuzziness, self._cleaner) | |
self._scanner.feed(handle, _consumer) | |
return _consumer.data | |
class RecordParser: | |
"""Parse GenBank files into Record objects (OBSOLETE). | |
Direct use of this class is discouraged, and may be deprecated in | |
a future release of Biopython. | |
Please use the Bio.GenBank.parse(...) or Bio.GenBank.read(...) functions | |
instead. | |
""" | |
def __init__(self, debug_level=0): | |
"""Initialize the parser. | |
Arguments: | |
- debug_level - An optional argument that species the amount of | |
debugging information the parser should spit out. By default we have | |
no debugging info (the fastest way to do things), but if you want | |
you can set this as high as two and see exactly where a parse fails. | |
""" | |
self._scanner = GenBankScanner(debug_level) | |
def parse(self, handle): | |
"""Parse the specified handle into a GenBank record.""" | |
_consumer = _RecordConsumer() | |
self._scanner.feed(handle, _consumer) | |
return _consumer.data | |
class _BaseGenBankConsumer: | |
"""Abstract GenBank consumer providing useful general functions (PRIVATE). | |
This just helps to eliminate some duplication in things that most | |
GenBank consumers want to do. | |
""" | |
# Special keys in GenBank records that we should remove spaces from | |
# For instance, \translation keys have values which are proteins and | |
# should have spaces and newlines removed from them. This class | |
# attribute gives us more control over specific formatting problems. | |
remove_space_keys = ["translation"] | |
def __init__(self): | |
pass | |
def _split_keywords(keyword_string): | |
"""Split a string of keywords into a nice clean list (PRIVATE).""" | |
# process the keywords into a python list | |
if keyword_string == "" or keyword_string == ".": | |
keywords = "" | |
elif keyword_string[-1] == ".": | |
keywords = keyword_string[:-1] | |
else: | |
keywords = keyword_string | |
keyword_list = keywords.split(";") | |
return [x.strip() for x in keyword_list] | |
def _split_accessions(accession_string): | |
"""Split a string of accession numbers into a list (PRIVATE).""" | |
# first replace all line feeds with spaces | |
# Also, EMBL style accessions are split with ';' | |
accession = accession_string.replace("\n", " ").replace(";", " ") | |
return [x.strip() for x in accession.split() if x.strip()] | |
def _split_taxonomy(taxonomy_string): | |
"""Split a string with taxonomy info into a list (PRIVATE).""" | |
if not taxonomy_string or taxonomy_string == ".": | |
# Missing data, no taxonomy | |
return [] | |
if taxonomy_string[-1] == ".": | |
tax_info = taxonomy_string[:-1] | |
else: | |
tax_info = taxonomy_string | |
tax_list = tax_info.split(";") | |
new_tax_list = [] | |
for tax_item in tax_list: | |
new_items = tax_item.split("\n") | |
new_tax_list.extend(new_items) | |
while "" in new_tax_list: | |
new_tax_list.remove("") | |
return [x.strip() for x in new_tax_list] | |
def _clean_location(location_string): | |
"""Clean whitespace out of a location string (PRIVATE). | |
The location parser isn't a fan of whitespace, so we clean it out | |
before feeding it into the parser. | |
""" | |
# Originally this imported string.whitespace and did a replace | |
# via a loop. It's simpler to just split on whitespace and rejoin | |
# the string - and this avoids importing string too. See Bug 2684. | |
return "".join(location_string.split()) | |
def _remove_newlines(text): | |
"""Remove any newlines in the passed text, returning the new string (PRIVATE).""" | |
# get rid of newlines in the qualifier value | |
newlines = ["\n", "\r"] | |
for ws in newlines: | |
text = text.replace(ws, "") | |
return text | |
def _normalize_spaces(text): | |
"""Replace multiple spaces in the passed text with single spaces (PRIVATE).""" | |
# get rid of excessive spaces | |
return " ".join(x for x in text.split(" ") if x) | |
def _remove_spaces(text): | |
"""Remove all spaces from the passed text (PRIVATE).""" | |
return text.replace(" ", "") | |
def _convert_to_python_numbers(start, end): | |
"""Convert a start and end range to python notation (PRIVATE). | |
In GenBank, starts and ends are defined in "biological" coordinates, | |
where 1 is the first base and [i, j] means to include both i and j. | |
In python, 0 is the first base and [i, j] means to include i, but | |
not j. | |
So, to convert "biological" to python coordinates, we need to | |
subtract 1 from the start, and leave the end and things should | |
be converted happily. | |
""" | |
new_start = start - 1 | |
new_end = end | |
return new_start, new_end | |
class _FeatureConsumer(_BaseGenBankConsumer): | |
"""Create a SeqRecord object with Features to return (PRIVATE). | |
Attributes: | |
- use_fuzziness - specify whether or not to parse with fuzziness in | |
feature locations. | |
- feature_cleaner - a class that will be used to provide specialized | |
cleaning-up of feature values. | |
""" | |
def __init__(self, use_fuzziness, feature_cleaner=None): | |
from Bio.SeqRecord import SeqRecord | |
_BaseGenBankConsumer.__init__(self) | |
self.data = SeqRecord(None, id=None) | |
self.data.id = None | |
self.data.description = "" | |
self._use_fuzziness = use_fuzziness | |
self._feature_cleaner = feature_cleaner | |
self._seq_type = "" | |
self._seq_data = [] | |
self._cur_reference = None | |
self._cur_feature = None | |
self._expected_size = None | |
def locus(self, locus_name): | |
"""Set the locus name is set as the name of the Sequence.""" | |
self.data.name = locus_name | |
def size(self, content): | |
"""Record the sequence length.""" | |
self._expected_size = int(content) | |
def residue_type(self, type): | |
"""Record the sequence type (SEMI-OBSOLETE). | |
This reflects the fact that the topology (linear/circular) and | |
molecule type (e.g. DNA vs RNA) were a single field in early | |
files. Current GenBank/EMBL files have two fields. | |
""" | |
self._seq_type = type.strip() | |
def topology(self, topology): | |
"""Validate and record sequence topology. | |
The topology argument should be "linear" or "circular" (string). | |
""" | |
if topology: | |
if topology not in ["linear", "circular"]: | |
raise ParserFailureError( | |
f"Unexpected topology {topology!r} should be linear or circular" | |
) | |
self.data.annotations["topology"] = topology | |
def molecule_type(self, mol_type): | |
"""Validate and record the molecule type (for round-trip etc).""" | |
if mol_type: | |
if "circular" in mol_type or "linear" in mol_type: | |
raise ParserFailureError( | |
f"Molecule type {mol_type!r} should not include topology" | |
) | |
# Writing out records will fail if we have a lower case DNA | |
# or RNA string in here, so upper case it. | |
# This is a bit ugly, but we don't want to upper case e.g. | |
# the m in mRNA, but thanks to the strip we lost the spaces | |
# so we need to index from the back | |
if mol_type[-3:].upper() in ("DNA", "RNA") and not mol_type[-3:].isupper(): | |
warnings.warn( | |
f"Non-upper case molecule type in LOCUS line: {mol_type}", | |
BiopythonParserWarning, | |
) | |
self.data.annotations["molecule_type"] = mol_type | |
def data_file_division(self, division): | |
self.data.annotations["data_file_division"] = division | |
def date(self, submit_date): | |
self.data.annotations["date"] = submit_date | |
def definition(self, definition): | |
"""Set the definition as the description of the sequence.""" | |
if self.data.description: | |
# Append to any existing description | |
# e.g. EMBL files with two DE lines. | |
self.data.description += " " + definition | |
else: | |
self.data.description = definition | |
def accession(self, acc_num): | |
"""Set the accession number as the id of the sequence. | |
If we have multiple accession numbers, the first one passed is | |
used. | |
""" | |
new_acc_nums = self._split_accessions(acc_num) | |
# Also record them ALL in the annotations | |
try: | |
# On the off chance there was more than one accession line: | |
for acc in new_acc_nums: | |
# Prevent repeat entries | |
if acc not in self.data.annotations["accessions"]: | |
self.data.annotations["accessions"].append(acc) | |
except KeyError: | |
self.data.annotations["accessions"] = new_acc_nums | |
# if we haven't set the id information yet, add the first acc num | |
if not self.data.id: | |
if len(new_acc_nums) > 0: | |
# self.data.id = new_acc_nums[0] | |
# Use the FIRST accession as the ID, not the first on this line! | |
self.data.id = self.data.annotations["accessions"][0] | |
def tls(self, content): | |
self.data.annotations["tls"] = content.split("-") | |
def tsa(self, content): | |
self.data.annotations["tsa"] = content.split("-") | |
def wgs(self, content): | |
self.data.annotations["wgs"] = content.split("-") | |
def add_wgs_scafld(self, content): | |
self.data.annotations.setdefault("wgs_scafld", []).append(content.split("-")) | |
def nid(self, content): | |
self.data.annotations["nid"] = content | |
def pid(self, content): | |
self.data.annotations["pid"] = content | |
def version(self, version_id): | |
# Want to use the versioned accession as the record.id | |
# This comes from the VERSION line in GenBank files, or the | |
# obsolete SV line in EMBL. For the new EMBL files we need | |
# both the version suffix from the ID line and the accession | |
# from the AC line. | |
if version_id.count(".") == 1 and version_id.split(".")[1].isdigit(): | |
self.accession(version_id.split(".")[0]) | |
self.version_suffix(version_id.split(".")[1]) | |
elif version_id: | |
# For backwards compatibility... | |
self.data.id = version_id | |
def project(self, content): | |
"""Handle the information from the PROJECT line as a list of projects. | |
e.g.:: | |
PROJECT GenomeProject:28471 | |
or:: | |
PROJECT GenomeProject:13543 GenomeProject:99999 | |
This is stored as dbxrefs in the SeqRecord to be consistent with the | |
projected switch of this line to DBLINK in future GenBank versions. | |
Note the NCBI plan to replace "GenomeProject:28471" with the shorter | |
"Project:28471" as part of this transition. | |
""" | |
content = content.replace("GenomeProject:", "Project:") | |
self.data.dbxrefs.extend(p for p in content.split() if p) | |
def dblink(self, content): | |
"""Store DBLINK cross references as dbxrefs in our record object. | |
This line type is expected to replace the PROJECT line in 2009. e.g. | |
During transition:: | |
PROJECT GenomeProject:28471 | |
DBLINK Project:28471 | |
Trace Assembly Archive:123456 | |
Once the project line is dropped:: | |
DBLINK Project:28471 | |
Trace Assembly Archive:123456 | |
Note GenomeProject -> Project. | |
We'll have to see some real examples to be sure, but based on the | |
above example we can expect one reference per line. | |
Note that at some point the NCBI have included an extra space, e.g.:: | |
DBLINK Project: 28471 | |
""" | |
# During the transition period with both PROJECT and DBLINK lines, | |
# we don't want to add the same cross reference twice. | |
while ": " in content: | |
content = content.replace(": ", ":") | |
if content.strip() not in self.data.dbxrefs: | |
self.data.dbxrefs.append(content.strip()) | |
def version_suffix(self, version): | |
"""Set the version to overwrite the id. | |
Since the version provides the same information as the accession | |
number, plus some extra info, we set this as the id if we have | |
a version. | |
""" | |
# e.g. GenBank line: | |
# VERSION U49845.1 GI:1293613 | |
# or the obsolete EMBL line: | |
# SV U49845.1 | |
# Scanner calls consumer.version("U49845.1") | |
# which then calls consumer.version_suffix(1) | |
# | |
# e.g. EMBL new line: | |
# ID X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP. | |
# Scanner calls consumer.version_suffix(1) | |
assert version.isdigit() | |
self.data.annotations["sequence_version"] = int(version) | |
def db_source(self, content): | |
self.data.annotations["db_source"] = content.rstrip() | |
def gi(self, content): | |
self.data.annotations["gi"] = content | |
def keywords(self, content): | |
if "keywords" in self.data.annotations: | |
# Multi-line keywords, append to list | |
# Note EMBL states "A keyword is never split between lines." | |
self.data.annotations["keywords"].extend(self._split_keywords(content)) | |
else: | |
self.data.annotations["keywords"] = self._split_keywords(content) | |
def segment(self, content): | |
self.data.annotations["segment"] = content | |
def source(self, content): | |
# Note that some software (e.g. VectorNTI) may produce an empty | |
# source (rather than using a dot/period as might be expected). | |
if content == "": | |
source_info = "" | |
elif content[-1] == ".": | |
source_info = content[:-1] | |
else: | |
source_info = content | |
self.data.annotations["source"] = source_info | |
def organism(self, content): | |
self.data.annotations["organism"] = content | |
def taxonomy(self, content): | |
"""Record (another line of) the taxonomy lineage.""" | |
lineage = self._split_taxonomy(content) | |
try: | |
self.data.annotations["taxonomy"].extend(lineage) | |
except KeyError: | |
self.data.annotations["taxonomy"] = lineage | |
def reference_num(self, content): | |
"""Signal the beginning of a new reference object.""" | |
# if we have a current reference that hasn't been added to | |
# the list of references, add it. | |
if self._cur_reference is not None: | |
self.data.annotations["references"].append(self._cur_reference) | |
else: | |
self.data.annotations["references"] = [] | |
self._cur_reference = Reference() | |
def reference_bases(self, content): | |
"""Attempt to determine the sequence region the reference entails. | |
Possible types of information we may have to deal with: | |
(bases 1 to 86436) | |
(sites) | |
(bases 1 to 105654; 110423 to 111122) | |
1 (residues 1 to 182) | |
""" | |
# first remove the parentheses | |
assert content.endswith(")"), content | |
ref_base_info = content[1:-1] | |
all_locations = [] | |
# parse if we've got 'bases' and 'to' | |
if "bases" in ref_base_info and "to" in ref_base_info: | |
# get rid of the beginning 'bases' | |
ref_base_info = ref_base_info[5:] | |
locations = self._split_reference_locations(ref_base_info) | |
all_locations.extend(locations) | |
elif "residues" in ref_base_info and "to" in ref_base_info: | |
residues_start = ref_base_info.find("residues") | |
# get only the information after "residues" | |
ref_base_info = ref_base_info[(residues_start + len("residues ")) :] | |
locations = self._split_reference_locations(ref_base_info) | |
all_locations.extend(locations) | |
# make sure if we are not finding information then we have | |
# the string 'sites' or the string 'bases' | |
elif ref_base_info == "sites" or ref_base_info.strip() == "bases": | |
pass | |
# otherwise raise an error | |
else: | |
raise ValueError( | |
f"Could not parse base info {ref_base_info} in record {self.data.id}" | |
) | |
self._cur_reference.location = all_locations | |
def _split_reference_locations(self, location_string): | |
"""Get reference locations out of a string of reference information (PRIVATE). | |
The passed string should be of the form:: | |
1 to 20; 20 to 100 | |
This splits the information out and returns a list of location objects | |
based on the reference locations. | |
""" | |
# split possibly multiple locations using the ';' | |
all_base_info = location_string.split(";") | |
new_locations = [] | |
for base_info in all_base_info: | |
start, end = base_info.split("to") | |
new_start, new_end = self._convert_to_python_numbers( | |
int(start.strip()), int(end.strip()) | |
) | |
this_location = SimpleLocation(new_start, new_end) | |
new_locations.append(this_location) | |
return new_locations | |
def authors(self, content): | |
if self._cur_reference.authors: | |
self._cur_reference.authors += " " + content | |
else: | |
self._cur_reference.authors = content | |
def consrtm(self, content): | |
if self._cur_reference.consrtm: | |
self._cur_reference.consrtm += " " + content | |
else: | |
self._cur_reference.consrtm = content | |
def title(self, content): | |
if self._cur_reference is None: | |
warnings.warn( | |
"GenBank TITLE line without REFERENCE line.", BiopythonParserWarning | |
) | |
elif self._cur_reference.title: | |
self._cur_reference.title += " " + content | |
else: | |
self._cur_reference.title = content | |
def journal(self, content): | |
if self._cur_reference.journal: | |
self._cur_reference.journal += " " + content | |
else: | |
self._cur_reference.journal = content | |
def medline_id(self, content): | |
self._cur_reference.medline_id = content | |
def pubmed_id(self, content): | |
self._cur_reference.pubmed_id = content | |
def remark(self, content): | |
"""Deal with a reference comment.""" | |
if self._cur_reference.comment: | |
self._cur_reference.comment += " " + content | |
else: | |
self._cur_reference.comment = content | |
def comment(self, content): | |
try: | |
self.data.annotations["comment"] += "\n" + "\n".join(content) | |
except KeyError: | |
self.data.annotations["comment"] = "\n".join(content) | |
def structured_comment(self, content): | |
self.data.annotations["structured_comment"] = content | |
def features_line(self, content): | |
"""Get ready for the feature table when we reach the FEATURE line.""" | |
self.start_feature_table() | |
def start_feature_table(self): | |
"""Indicate we've got to the start of the feature table.""" | |
# make sure we've added on our last reference object | |
if self._cur_reference is not None: | |
self.data.annotations["references"].append(self._cur_reference) | |
self._cur_reference = None | |
def feature_key(self, content): | |
# start a new feature | |
self._cur_feature = SeqFeature() | |
self._cur_feature.type = content | |
self.data.features.append(self._cur_feature) | |
def location(self, content): | |
"""Parse out location information from the location string. | |
This uses simple Python code with some regular expressions to do the | |
parsing, and then translates the results into appropriate objects. | |
""" | |
# clean up newlines and other whitespace inside the location before | |
# parsing - locations should have no whitespace whatsoever | |
location_line = self._clean_location(content) | |
# Older records have junk like replace(266,"c") in the | |
# location line. Newer records just replace this with | |
# the number 266 and have the information in a more reasonable | |
# place. So we'll just grab out the number and feed this to the | |
# parser. We shouldn't really be losing any info this way. | |
if "replace" in location_line: | |
comma_pos = location_line.find(",") | |
location_line = location_line[8:comma_pos] | |
length = self._expected_size | |
# Check if the sequence is circular for features that span the origin | |
is_circular = "circular" in self.data.annotations.get("topology", "").lower() | |
stranded = "PROTEIN" not in self._seq_type.upper() | |
try: | |
location = Location.fromstring(location_line, length, is_circular, stranded) | |
except LocationParserError as e: | |
warnings.warn( | |
f"{e}; setting feature location to None.", BiopythonParserWarning | |
) | |
location = None | |
self._cur_feature.location = location | |
def feature_qualifier(self, key, value): | |
"""When we get a qualifier key and its value. | |
Can receive None, since you can have valueless keys such as /pseudo | |
""" | |
# Hack to try to preserve historical behaviour of /pseudo etc | |
if value is None: | |
# if the key doesn't exist yet, add an empty string | |
if key not in self._cur_feature.qualifiers: | |
self._cur_feature.qualifiers[key] = [""] | |
return | |
# otherwise just skip this key | |
return | |
# Remove enclosing quotation marks | |
if len(value) > 1 and value[0] == '"' and value[-1] == '"': | |
value = value[1:-1] | |
# Handle NCBI escaping | |
# Warn if escaping is not according to standard | |
if re.search(r'[^"]"[^"]|^"[^"]|[^"]"$', value): | |
warnings.warn( | |
'The NCBI states double-quote characters like " should be escaped as "" ' | |
"(two double - quotes), but here it was not: %r" % value, | |
BiopythonParserWarning, | |
) | |
# Undo escaping, repeated double quotes -> one double quote | |
value = value.replace('""', '"') | |
if self._feature_cleaner is not None: | |
value = self._feature_cleaner.clean_value(key, value) | |
# if the qualifier name exists, append the value | |
if key in self._cur_feature.qualifiers: | |
self._cur_feature.qualifiers[key].append(value) | |
# otherwise start a new list of the key with its values | |
else: | |
self._cur_feature.qualifiers[key] = [value] | |
def feature_qualifier_name(self, content_list): | |
"""Use feature_qualifier instead (OBSOLETE).""" | |
raise NotImplementedError("Use the feature_qualifier method instead.") | |
def feature_qualifier_description(self, content): | |
"""Use feature_qualifier instead (OBSOLETE).""" | |
raise NotImplementedError("Use the feature_qualifier method instead.") | |
def contig_location(self, content): | |
"""Deal with CONTIG information.""" | |
# Historically this was stored as a SeqFeature object, but it was | |
# stored under record.annotations["contig"] and not under | |
# record.features with the other SeqFeature objects. | |
# | |
# The CONTIG location line can include additional tokens like | |
# Gap(), Gap(100) or Gap(unk100) which are not used in the feature | |
# location lines, so storing it using SeqFeature based location | |
# objects is difficult. | |
# | |
# We now store this a string, which means for BioSQL we are now in | |
# much better agreement with how BioPerl records the CONTIG line | |
# in the database. | |
# | |
# NOTE - This code assumes the scanner will return all the CONTIG | |
# lines already combined into one long string! | |
self.data.annotations["contig"] = content | |
def origin_name(self, content): | |
pass | |
def base_count(self, content): | |
pass | |
def base_number(self, content): | |
pass | |
def sequence(self, content): | |
"""Add up sequence information as we get it. | |
To try and make things speedier, this puts all of the strings | |
into a list of strings, and then uses string.join later to put | |
them together. Supposedly, this is a big time savings | |
""" | |
assert " " not in content | |
self._seq_data.append(content.upper()) | |
def record_end(self, content): | |
"""Clean up when we've finished the record.""" | |
# Try and append the version number to the accession for the full id | |
if not self.data.id: | |
if "accessions" in self.data.annotations: | |
raise ValueError( | |
"Problem adding version number to accession: " | |
+ str(self.data.annotations["accessions"]) | |
) | |
self.data.id = self.data.name # Good fall back? | |
elif self.data.id.count(".") == 0: | |
try: | |
self.data.id += ".%i" % self.data.annotations["sequence_version"] | |
except KeyError: | |
pass | |
# add the sequence information | |
sequence = "".join(self._seq_data) | |
if ( | |
self._expected_size is not None | |
and len(sequence) != 0 | |
and self._expected_size != len(sequence) | |
): | |
warnings.warn( | |
"Expected sequence length %i, found %i (%s)." | |
% (self._expected_size, len(sequence), self.data.id), | |
BiopythonParserWarning, | |
) | |
molecule_type = None | |
if self._seq_type: | |
# mRNA is really also DNA, since it is actually cDNA | |
if "DNA" in self._seq_type.upper() or "MRNA" in self._seq_type.upper(): | |
molecule_type = "DNA" | |
# are there ever really RNA sequences in GenBank? | |
elif "RNA" in self._seq_type.upper(): | |
# Even for data which was from RNA, the sequence string | |
# is usually given as DNA (T not U). Bug 3010 | |
molecule_type = "RNA" | |
elif ( | |
"PROTEIN" in self._seq_type.upper() or self._seq_type == "PRT" | |
): # PRT is used in EMBL-bank for patents | |
molecule_type = "protein" | |
# work around ugly GenBank records which have circular or | |
# linear but no indication of sequence type | |
elif self._seq_type in ["circular", "linear", "unspecified"]: | |
pass | |
# we have a bug if we get here | |
else: | |
raise ValueError( | |
f"Could not determine molecule_type for seq_type {self._seq_type}" | |
) | |
# Don't overwrite molecule_type | |
if molecule_type is not None: | |
self.data.annotations["molecule_type"] = self.data.annotations.get( | |
"molecule_type", molecule_type | |
) | |
if not sequence and self._expected_size: | |
self.data.seq = Seq(None, length=self._expected_size) | |
else: | |
self.data.seq = Seq(sequence) | |
class _RecordConsumer(_BaseGenBankConsumer): | |
"""Create a GenBank Record object from scanner generated information (PRIVATE).""" | |
def __init__(self): | |
_BaseGenBankConsumer.__init__(self) | |
from . import Record | |
self.data = Record.Record() | |
self._seq_data = [] | |
self._cur_reference = None | |
self._cur_feature = None | |
self._cur_qualifier = None | |
def tls(self, content): | |
self.data.tls = content.split("-") | |
def tsa(self, content): | |
self.data.tsa = content.split("-") | |
def wgs(self, content): | |
self.data.wgs = content.split("-") | |
def add_wgs_scafld(self, content): | |
self.data.wgs_scafld.append(content.split("-")) | |
def locus(self, content): | |
self.data.locus = content | |
def size(self, content): | |
self.data.size = content | |
def residue_type(self, content): | |
# Be lenient about parsing, but technically lowercase residue types are malformed. | |
if "dna" in content or "rna" in content: | |
warnings.warn( | |
f"Invalid seq_type ({content}): DNA/RNA should be uppercase.", | |
BiopythonParserWarning, | |
) | |
self.data.residue_type = content | |
def data_file_division(self, content): | |
self.data.data_file_division = content | |
def date(self, content): | |
self.data.date = content | |
def definition(self, content): | |
self.data.definition = content | |
def accession(self, content): | |
for acc in self._split_accessions(content): | |
if acc not in self.data.accession: | |
self.data.accession.append(acc) | |
def molecule_type(self, mol_type): | |
"""Validate and record the molecule type (for round-trip etc).""" | |
if mol_type: | |
if "circular" in mol_type or "linear" in mol_type: | |
raise ParserFailureError( | |
f"Molecule type {mol_type!r} should not include topology" | |
) | |
# Writing out records will fail if we have a lower case DNA | |
# or RNA string in here, so upper case it. | |
# This is a bit ugly, but we don't want to upper case e.g. | |
# the m in mRNA, but thanks to the strip we lost the spaces | |
# so we need to index from the back | |
if mol_type[-3:].upper() in ("DNA", "RNA") and not mol_type[-3:].isupper(): | |
warnings.warn( | |
f"Non-upper case molecule type in LOCUS line: {mol_type}", | |
BiopythonParserWarning, | |
) | |
self.data.molecule_type = mol_type | |
def topology(self, topology): | |
"""Validate and record sequence topology. | |
The topology argument should be "linear" or "circular" (string). | |
""" | |
if topology: | |
if topology not in ["linear", "circular"]: | |
raise ParserFailureError( | |
f"Unexpected topology {topology!r} should be linear or circular" | |
) | |
self.data.topology = topology | |
def nid(self, content): | |
self.data.nid = content | |
def pid(self, content): | |
self.data.pid = content | |
def version(self, content): | |
self.data.version = content | |
def db_source(self, content): | |
self.data.db_source = content.rstrip() | |
def gi(self, content): | |
self.data.gi = content | |
def keywords(self, content): | |
self.data.keywords = self._split_keywords(content) | |
def project(self, content): | |
self.data.projects.extend(p for p in content.split() if p) | |
def dblink(self, content): | |
self.data.dblinks.append(content) | |
def segment(self, content): | |
self.data.segment = content | |
def source(self, content): | |
self.data.source = content | |
def organism(self, content): | |
self.data.organism = content | |
def taxonomy(self, content): | |
self.data.taxonomy = self._split_taxonomy(content) | |
def reference_num(self, content): | |
"""Grab the reference number and signal the start of a new reference.""" | |
# check if we have a reference to add | |
if self._cur_reference is not None: | |
self.data.references.append(self._cur_reference) | |
from . import Record | |
self._cur_reference = Record.Reference() | |
self._cur_reference.number = content | |
def reference_bases(self, content): | |
self._cur_reference.bases = content | |
def authors(self, content): | |
self._cur_reference.authors = content | |
def consrtm(self, content): | |
self._cur_reference.consrtm = content | |
def title(self, content): | |
if self._cur_reference is None: | |
warnings.warn( | |
"GenBank TITLE line without REFERENCE line.", BiopythonParserWarning | |
) | |
return | |
self._cur_reference.title = content | |
def journal(self, content): | |
self._cur_reference.journal = content | |
def medline_id(self, content): | |
self._cur_reference.medline_id = content | |
def pubmed_id(self, content): | |
self._cur_reference.pubmed_id = content | |
def remark(self, content): | |
self._cur_reference.remark = content | |
def comment(self, content): | |
self.data.comment += "\n".join(content) | |
def structured_comment(self, content): | |
self.data.structured_comment = content | |
def primary_ref_line(self, content): | |
"""Save reference data for the PRIMARY line.""" | |
self.data.primary.append(content) | |
def primary(self, content): | |
pass | |
def features_line(self, content): | |
"""Get ready for the feature table when we reach the FEATURE line.""" | |
self.start_feature_table() | |
def start_feature_table(self): | |
"""Signal the start of the feature table.""" | |
# we need to add on the last reference | |
if self._cur_reference is not None: | |
self.data.references.append(self._cur_reference) | |
def feature_key(self, content): | |
"""Grab the key of the feature and signal the start of a new feature.""" | |
# first add on feature information if we've got any | |
self._add_feature() | |
from . import Record | |
self._cur_feature = Record.Feature() | |
self._cur_feature.key = content | |
def _add_feature(self): | |
"""Add a feature to the record, with relevant checks (PRIVATE). | |
This does all of the appropriate checking to make sure we haven't | |
left any info behind, and that we are only adding info if it | |
exists. | |
""" | |
if self._cur_feature is not None: | |
# if we have a left over qualifier, add it to the qualifiers | |
# on the current feature | |
if self._cur_qualifier is not None: | |
self._cur_feature.qualifiers.append(self._cur_qualifier) | |
self._cur_qualifier = None | |
self.data.features.append(self._cur_feature) | |
def location(self, content): | |
self._cur_feature.location = self._clean_location(content) | |
def feature_qualifier(self, key, value): | |
self.feature_qualifier_name([key]) | |
if value is not None: | |
self.feature_qualifier_description(value) | |
def feature_qualifier_name(self, content_list): | |
"""Deal with qualifier names. | |
We receive a list of keys, since you can have valueless keys such as | |
/pseudo which would be passed in with the next key (since no other | |
tags separate them in the file) | |
""" | |
from . import Record | |
for content in content_list: | |
# the record parser keeps the /s -- add them if we don't have 'em | |
if not content.startswith("/"): | |
content = f"/{content}" | |
# add on a qualifier if we've got one | |
if self._cur_qualifier is not None: | |
self._cur_feature.qualifiers.append(self._cur_qualifier) | |
self._cur_qualifier = Record.Qualifier() | |
self._cur_qualifier.key = content | |
def feature_qualifier_description(self, content): | |
# if we have info then the qualifier key should have a ='s | |
if "=" not in self._cur_qualifier.key: | |
self._cur_qualifier.key = f"{self._cur_qualifier.key}=" | |
cur_content = self._remove_newlines(content) | |
# remove all spaces from the value if it is a type where spaces | |
# are not important | |
for remove_space_key in self.__class__.remove_space_keys: | |
if remove_space_key in self._cur_qualifier.key: | |
cur_content = self._remove_spaces(cur_content) | |
self._cur_qualifier.value = self._normalize_spaces(cur_content) | |
def base_count(self, content): | |
self.data.base_counts = content | |
def origin_name(self, content): | |
self.data.origin = content | |
def contig_location(self, content): | |
"""Signal that we have contig information to add to the record.""" | |
self.data.contig = self._clean_location(content) | |
def sequence(self, content): | |
"""Add sequence information to a list of sequence strings. | |
This removes spaces in the data and uppercases the sequence, and | |
then adds it to a list of sequences. Later on we'll join this | |
list together to make the final sequence. This is faster than | |
adding on the new string every time. | |
""" | |
assert " " not in content | |
self._seq_data.append(content.upper()) | |
def record_end(self, content): | |
"""Signal the end of the record and do any necessary clean-up.""" | |
# add together all of the sequence parts to create the | |
# final sequence string | |
self.data.sequence = "".join(self._seq_data) | |
# add on the last feature | |
self._add_feature() | |
def parse(handle): | |
"""Iterate over GenBank formatted entries as Record objects. | |
>>> from Bio import GenBank | |
>>> with open("GenBank/NC_000932.gb") as handle: | |
... for record in GenBank.parse(handle): | |
... print(record.accession) | |
['NC_000932'] | |
To get SeqRecord objects use Bio.SeqIO.parse(..., format="gb") | |
instead. | |
""" | |
return iter(Iterator(handle, RecordParser())) | |
def read(handle): | |
"""Read a handle containing a single GenBank entry as a Record object. | |
>>> from Bio import GenBank | |
>>> with open("GenBank/NC_000932.gb") as handle: | |
... record = GenBank.read(handle) | |
... print(record.accession) | |
['NC_000932'] | |
To get a SeqRecord object use Bio.SeqIO.read(..., format="gb") | |
instead. | |
""" | |
iterator = parse(handle) | |
try: | |
record = next(iterator) | |
except StopIteration: | |
raise ValueError("No records found in handle") from None | |
try: | |
next(iterator) | |
raise ValueError("More than one record found in handle") | |
except StopIteration: | |
pass | |
return record | |
if __name__ == "__main__": | |
from Bio._utils import run_doctest | |
run_doctest() | |