Spaces:
No application file
No application file
# Copyright 2006-2016 by Peter Cock. All rights reserved. | |
# Revisions copyright 2015 by Ben Woodcroft. All rights reserved. | |
# | |
# This file is part of the Biopython distribution and governed by your | |
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
# Please see the LICENSE file that should have been included as part of this | |
# package. | |
"""Bio.AlignIO support for "stockholm" format (used in the PFAM database). | |
You are expected to use this module via the Bio.AlignIO functions (or the | |
Bio.SeqIO functions if you want to work directly with the gapped sequences). | |
For example, consider a Stockholm alignment file containing the following:: | |
# STOCKHOLM 1.0 | |
#=GC SS_cons .................<<<<<<<<...<<<<<<<........>>>>>>>.. | |
AP001509.1 UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGU | |
#=GR AP001509.1 SS -----------------<<<<<<<<---..<<-<<-------->>->>..-- | |
AE007476.1 AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGU | |
#=GR AE007476.1 SS -----------------<<<<<<<<-----<<.<<-------->>.>>---- | |
#=GC SS_cons ......<<<<<<<.......>>>>>>>..>>>>>>>>............... | |
AP001509.1 CUCUAC-AGGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU | |
#=GR AP001509.1 SS -------<<<<<--------->>>>>--->>>>>>>>--------------- | |
AE007476.1 UUCUACAAGGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU | |
#=GR AE007476.1 SS ------.<<<<<--------->>>>>.-->>>>>>>>--------------- | |
// | |
This is a single multiple sequence alignment, so you would probably load this | |
using the Bio.AlignIO.read() function: | |
>>> from Bio import AlignIO | |
>>> align = AlignIO.read("Stockholm/simple.sth", "stockholm") | |
>>> print(align) | |
Alignment with 2 rows and 104 columns | |
UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-G...UGU AP001509.1 | |
AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-C...GAU AE007476.1 | |
>>> for record in align: | |
... print("%s %i" % (record.id, len(record))) | |
AP001509.1 104 | |
AE007476.1 104 | |
In addition to the sequences themselves, this example alignment also includes | |
some GR lines for the secondary structure of the sequences. These are | |
strings, with one character for each letter in the associated sequence: | |
>>> for record in align: | |
... print(record.id) | |
... print(record.seq) | |
... print(record.letter_annotations['secondary_structure']) | |
AP001509.1 | |
UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGUCUCUAC-AGGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU | |
-----------------<<<<<<<<---..<<-<<-------->>->>..---------<<<<<--------->>>>>--->>>>>>>>--------------- | |
AE007476.1 | |
AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGUUUCUACAAGGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU | |
-----------------<<<<<<<<-----<<.<<-------->>.>>----------.<<<<<--------->>>>>.-->>>>>>>>--------------- | |
Any general annotation for each row is recorded in the SeqRecord's annotations | |
dictionary. Any per-column annotation for the entire alignment is in the
alignment's column annotations dictionary, such as the secondary structure | |
consensus in this example: | |
>>> sorted(align.column_annotations.keys()) | |
['secondary_structure'] | |
>>> align.column_annotations["secondary_structure"] | |
'.................<<<<<<<<...<<<<<<<........>>>>>>>........<<<<<<<.......>>>>>>>..>>>>>>>>...............' | |
You can output this alignment in many different file formats | |
using Bio.AlignIO.write(), or the MultipleSeqAlignment object's format method: | |
>>> print(format(align, "fasta")) | |
>AP001509.1 | |
UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGUCUCUAC-A | |
GGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU | |
>AE007476.1 | |
AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGUUUCUACAA | |
GGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU | |
<BLANKLINE> | |
Most output formats won't be able to hold the annotation possible in a | |
Stockholm file: | |
>>> print(format(align, "stockholm")) | |
# STOCKHOLM 1.0 | |
#=GF SQ 2 | |
AP001509.1 UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGUCUCUAC-AGGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU | |
#=GS AP001509.1 AC AP001509.1 | |
#=GS AP001509.1 DE AP001509.1 | |
#=GR AP001509.1 SS -----------------<<<<<<<<---..<<-<<-------->>->>..---------<<<<<--------->>>>>--->>>>>>>>--------------- | |
AE007476.1 AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGUUUCUACAAGGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU | |
#=GS AE007476.1 AC AE007476.1 | |
#=GS AE007476.1 DE AE007476.1 | |
#=GR AE007476.1 SS -----------------<<<<<<<<-----<<.<<-------->>.>>----------.<<<<<--------->>>>>.-->>>>>>>>--------------- | |
#=GC SS_cons .................<<<<<<<<...<<<<<<<........>>>>>>>........<<<<<<<.......>>>>>>>..>>>>>>>>............... | |
// | |
<BLANKLINE> | |
Note that when writing Stockholm files, AlignIO does not break long sequences | |
up and interleave them (as in the input file shown above). The standard | |
allows this simpler layout, and it is more likely to be understood by other | |
tools. | |
Finally, as an aside, it can sometimes be useful to use Bio.SeqIO.parse() to | |
iterate over the alignment rows as SeqRecord objects - rather than working | |
with Alignment objects.
>>> from Bio import SeqIO | |
>>> for record in SeqIO.parse("Stockholm/simple.sth", "stockholm"): | |
... print(record.id) | |
... print(record.seq) | |
... print(record.letter_annotations['secondary_structure']) | |
AP001509.1 | |
UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGUCUCUAC-AGGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU | |
-----------------<<<<<<<<---..<<-<<-------->>->>..---------<<<<<--------->>>>>--->>>>>>>>--------------- | |
AE007476.1 | |
AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGUUUCUACAAGGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU | |
-----------------<<<<<<<<-----<<.<<-------->>.>>----------.<<<<<--------->>>>>.-->>>>>>>>--------------- | |
Remember that if you slice a SeqRecord, the per-letter-annotations like the | |
secondary structure string here, are also sliced: | |
>>> sub_record = record[10:20] | |
>>> print(sub_record.seq) | |
AUCGUUUUAC | |
>>> print(sub_record.letter_annotations['secondary_structure']) | |
-------<<< | |
Likewise with the alignment object, as long as you are not dropping any rows, | |
slicing specific columns of an alignment will slice any per-column-annotations: | |
>>> align.column_annotations["secondary_structure"] | |
'.................<<<<<<<<...<<<<<<<........>>>>>>>........<<<<<<<.......>>>>>>>..>>>>>>>>...............' | |
>>> part_align = align[:,10:20] | |
>>> part_align.column_annotations["secondary_structure"] | |
'.......<<<' | |
You can also see this in the Stockholm output of this partial-alignment: | |
>>> print(format(part_align, "stockholm")) | |
# STOCKHOLM 1.0 | |
#=GF SQ 2 | |
AP001509.1 UCAACACUCU | |
#=GS AP001509.1 AC AP001509.1 | |
#=GS AP001509.1 DE AP001509.1 | |
#=GR AP001509.1 SS -------<<< | |
AE007476.1 AUCGUUUUAC | |
#=GS AE007476.1 AC AE007476.1 | |
#=GS AE007476.1 DE AE007476.1 | |
#=GR AE007476.1 SS -------<<< | |
#=GC SS_cons .......<<< | |
// | |
<BLANKLINE> | |
""" | |
from Bio.Align import MultipleSeqAlignment | |
from Bio.Seq import Seq | |
from Bio.SeqRecord import SeqRecord | |
from .Interfaces import AlignmentIterator | |
from .Interfaces import SequentialAlignmentWriter | |
class StockholmWriter(SequentialAlignmentWriter):
    """Stockholm/PFAM alignment writer.

    Each sequence is written on a single line, immediately followed by its
    per-sequence (``#=GS``) and per-sequence-per-column (``#=GR``) annotation
    lines.  Per-column (``#=GC``) annotation is written after all the records,
    just before the closing ``//`` line.
    """

    # These dictionaries should be kept in sync with those
    # defined in the StockholmIterator class.
    pfam_gr_mapping = {
        "secondary_structure": "SS",
        "surface_accessibility": "SA",
        "transmembrane": "TM",
        "posterior_probability": "PP",
        "ligand_binding": "LI",
        "active_site": "AS",
        "intron": "IN",
    }
    # These GC mappings are in addition to *_cons in GR mapping:
    pfam_gc_mapping = {"reference_annotation": "RF", "model_mask": "MM"}
    # Following dictionary deliberately does not cover AC, DE or DR
    pfam_gs_mapping = {"organism": "OS", "organism_classification": "OC", "look": "LO"}

    def write_alignment(self, alignment):
        """Use this to write (another) single alignment to an open file.

        Note that sequences and their annotation are recorded
        together (rather than having a block of annotation followed
        by a block of aligned sequences).

        Raises ValueError if the alignment has no records, if the alignment
        length is zero, or (via ``_write_record``) if a record has a
        different length or a duplicate identifier.
        """
        count = len(alignment)

        self._length_of_sequences = alignment.get_alignment_length()
        # A set keeps the duplicate-identifier check constant time per
        # record, so large alignments do not become quadratic to write.
        self._ids_written = set()

        if count == 0:
            raise ValueError("Must have at least one sequence")
        if self._length_of_sequences == 0:
            raise ValueError("Non-empty sequences are required")

        self.handle.write("# STOCKHOLM 1.0\n")
        self.handle.write("#=GF SQ %i\n" % count)
        for record in alignment:
            self._write_record(record)

        # This shouldn't be None... but just in case,
        if alignment.column_annotations:
            for key, value in sorted(alignment.column_annotations.items()):
                if key in self.pfam_gc_mapping:
                    self.handle.write(f"#=GC {self.pfam_gc_mapping[key]} {value}\n")
                elif key in self.pfam_gr_mapping:
                    self.handle.write(
                        f"#=GC {self.pfam_gr_mapping[key]}_cons {value}\n"
                    )
                # Any other key does not follow the PFAM standards and is
                # currently not written out.
        self.handle.write("//\n")

    def _write_record(self, record):
        """Write a single SeqRecord to the file (PRIVATE).

        Writes the sequence line followed by its ``#=GS`` (AC, DE, DR and
        mapped annotations) and ``#=GR`` lines.

        Raises ValueError if the record length differs from the alignment
        length, or if the identifier has already been written for this
        alignment.
        """
        if self._length_of_sequences != len(record.seq):
            raise ValueError("Sequences must all be the same length")

        # For the case for stockholm to stockholm, try and use record.name
        seq_name = record.id
        if (
            record.name is not None
            and "accession" in record.annotations
            and record.id == record.annotations["accession"]
        ):
            seq_name = record.name

        # In the Stockholm file format, spaces are not allowed in the id
        seq_name = seq_name.replace(" ", "_")

        if "start" in record.annotations and "end" in record.annotations:
            suffix = f"/{record.annotations['start']}-{record.annotations['end']}"
            # Only append the /start-end suffix if it is not already there.
            if not seq_name.endswith(suffix):
                seq_name += suffix

        if seq_name in self._ids_written:
            raise ValueError(f"Duplicate record identifier: {seq_name}")
        self._ids_written.add(seq_name)
        self.handle.write(f"{seq_name} {record.seq}\n")

        # The recommended placement for GS lines (per sequence annotation)
        # is above the alignment (as a header block) or just below the
        # corresponding sequence.
        #
        # The recommended placement for GR lines (per sequence per column
        # annotation such as secondary structure) is just below the
        # corresponding sequence.
        #
        # We put both just below the corresponding sequence as this allows
        # us to write the file using a single pass through the records.

        # AC = Accession
        if "accession" in record.annotations:
            self.handle.write(
                f"#=GS {seq_name} AC {self.clean(record.annotations['accession'])}\n"
            )
        elif record.id:
            self.handle.write(f"#=GS {seq_name} AC {self.clean(record.id)}\n")

        # DE = description
        if record.description:
            self.handle.write(f"#=GS {seq_name} DE {self.clean(record.description)}\n")

        # DR = database links
        for xref in record.dbxrefs:
            self.handle.write(f"#=GS {seq_name} DR {self.clean(xref)}\n")

        # GS = other per sequence annotation
        for key, value in record.annotations.items():
            if key in self.pfam_gs_mapping:
                data = self.clean(str(value))
                if data:
                    self.handle.write(
                        "#=GS %s %s %s\n"
                        % (seq_name, self.clean(self.pfam_gs_mapping[key]), data)
                    )
            # Any other key does not follow the PFAM standards and is
            # currently not written out.

        # GR = per row per column sequence annotation
        for key, value in record.letter_annotations.items():
            if key in self.pfam_gr_mapping and len(str(value)) == len(record.seq):
                data = self.clean(str(value))
                if data:
                    self.handle.write(
                        "#=GR %s %s %s\n"
                        % (seq_name, self.clean(self.pfam_gr_mapping[key]), data)
                    )
            # Any other key (or a value whose string form is the wrong
            # length) is currently not written out.
class StockholmIterator(AlignmentIterator):
    """Loads a Stockholm file from PFAM into MultipleSeqAlignment objects.

    The file may contain multiple concatenated alignments, which are loaded
    and returned incrementally.

    This parser will detect if the Stockholm file follows the PFAM
    conventions for sequence specific meta-data (lines starting #=GS
    and #=GR) and populates the SeqRecord fields accordingly.

    Annotation which does not follow the PFAM conventions is stored under a
    prefixed key ("GS:", "GR:" or "GC:" followed by the feature name).
    Per-file annotation (#=GF lines) is parsed but not currently exposed.

    If an accession is provided for an entry in the meta data, IT WILL NOT
    be used as the record.id (it will be recorded in the record's
    annotations).  This is because some files have (sub) sequences from
    different parts of the same accession (differentiated by different
    start-end positions).

    Wrap-around alignments are not supported - each sequence must be on
    a single line.  However, interlaced sequences should work.

    For more information on the file format, please see:
    http://sonnhammer.sbc.su.se/Stockholm.html
    https://en.wikipedia.org/wiki/Stockholm_format
    http://bioperl.org/formats/alignment_formats/Stockholm_multiple_alignment_format.html

    For consistency with BioPerl and EMBOSS we call this the "stockholm"
    format.
    """

    # These dictionaries should be kept in sync with those
    # defined in the StockholmWriter class.
    pfam_gr_mapping = {
        "SS": "secondary_structure",
        "SA": "surface_accessibility",
        "TM": "transmembrane",
        "PP": "posterior_probability",
        "LI": "ligand_binding",
        "AS": "active_site",
        "IN": "intron",
    }
    # These GC mappings are in addition to *_cons in GR mapping:
    pfam_gc_mapping = {"RF": "reference_annotation", "MM": "model_mask"}
    # Following dictionary deliberately does not cover AC, DE or DR
    pfam_gs_mapping = {"OS": "organism", "OC": "organism_classification", "LO": "look"}

    _header = None  # for caching lines between __next__ calls

    def __next__(self):
        """Parse the next alignment from the handle.

        Returns a MultipleSeqAlignment built from the next Stockholm block.

        Raises StopIteration when the handle is exhausted or the block holds
        no sequences, and ValueError when the block is malformed (missing
        header, unsplittable lines, sequence data after ``//``, inconsistent
        lengths, or an unexpected number of records).
        """
        handle = self.handle

        if self._header is None:
            line = handle.readline()
        else:
            # Header we saved from when we were parsing
            # the previous alignment.
            line = self._header
            self._header = None

        if not line:
            # Empty file - just give up.
            raise StopIteration
        if line.strip() != "# STOCKHOLM 1.0":
            raise ValueError("Did not find STOCKHOLM header")

        # Note: If this file follows the PFAM conventions, there should be
        # a line containing the number of sequences, e.g. "#=GF SQ 67"
        # We do not check for this - perhaps we should, and verify that
        # if present it agrees with our parsing.
        seqs = {}
        ids = {}  # Really only need an OrderedSet, but python lacks this
        gs = {}
        gr = {}
        gf = {}
        gc = {}
        passed_end_alignment = False
        while True:
            line = handle.readline()
            if not line:
                break  # end of file
            line = line.strip()  # remove trailing \n
            if line == "# STOCKHOLM 1.0":
                # Start of the next alignment; keep it for the next call.
                self._header = line
                break
            elif line == "//":
                # The "//" line indicates the end of the alignment.
                # There may still be more meta-data
                passed_end_alignment = True
            elif line == "":
                # blank line, ignore
                pass
            elif line[0] != "#":
                # Sequence
                # Format: "<seqname> <sequence>"
                if passed_end_alignment:
                    # Raised explicitly (not asserted) so the check still
                    # runs when Python is started with -O.
                    raise ValueError(
                        "Sequence line found after the end-of-alignment "
                        "marker (//):\n" + line
                    )
                # Split on any whitespace so tab-separated files also parse.
                parts = line.split(None, 1)
                if len(parts) != 2:
                    # This might be someone attempting to store a zero length sequence?
                    raise ValueError(
                        "Could not split line into identifier and sequence:\n" + line
                    )
                seq_id, seq = parts
                if seq_id not in ids:
                    ids[seq_id] = True
                seqs.setdefault(seq_id, "")
                seqs[seq_id] += seq.replace(".", "-")
            elif len(line) >= 5:
                # Comment line or meta-data
                if line[:5] == "#=GF ":
                    # Generic per-File annotation, free text
                    # Format: #=GF <feature> <free text>
                    fields = line[5:].strip().split(None, 1)
                    feature = fields[0]
                    # The free text may be empty (as for #=GS lines below).
                    text = fields[1] if len(fields) > 1 else ""
                    # Each feature key could be used more than once,
                    # so store the entries as a list of strings.
                    gf.setdefault(feature, []).append(text)
                elif line[:5] == "#=GC ":
                    # Generic per-Column annotation, exactly 1 char per column
                    # Format: "#=GC <feature> <exactly 1 char per column>"
                    # Only two fields are expected, hence maxsplit of one.
                    feature, text = line[5:].strip().split(None, 1)
                    # Append to any previous entry.
                    # Might be interleaved blocks, so can't check length yet
                    gc[feature] = gc.get(feature, "") + text
                elif line[:5] == "#=GS ":
                    # Generic per-Sequence annotation, free text
                    # Format: "#=GS <seqname> <feature> <free text>"
                    fields = line[5:].strip().split(None, 2)
                    if len(fields) == 2:
                        # Free text can sometimes be empty.
                        # See https://github.com/biopython/biopython/issues/2982
                        fields.append("")
                    # Still raises ValueError if the feature is missing too.
                    seq_id, feature, text = fields
                    gs.setdefault(seq_id, {}).setdefault(feature, []).append(text)
                elif line[:5] == "#=GR ":
                    # Generic per-Sequence AND per-Column markup
                    # Format: "#=GR <seqname> <feature> <exactly 1 char per column>"
                    seq_id, feature, text = line[5:].strip().split(None, 2)
                    seq_markup = gr.setdefault(seq_id, {})
                    # Append to any previous entry.
                    # Might be interleaved blocks, so can't check length yet
                    seq_markup[feature] = seq_markup.get(feature, "") + text
            # Next line...

        # Internal invariant: every sequence identifier was recorded in ids.
        assert len(seqs) <= len(ids)

        self.ids = ids.keys()
        self.sequences = seqs
        self.seq_annotation = gs
        self.seq_col_annotation = gr

        if not (ids and seqs):
            raise StopIteration

        if (
            self.records_per_alignment is not None
            and self.records_per_alignment != len(ids)
        ):
            raise ValueError(
                "Found %i records in this alignment, told to expect %i"
                % (len(ids), self.records_per_alignment)
            )

        alignment_length = len(next(iter(seqs.values())))
        records = []  # Alignment obj will put them all in a list anyway
        for seq_id in ids:
            seq = seqs[seq_id]
            if alignment_length != len(seq):
                raise ValueError(
                    "Sequences have different lengths, or repeated identifier"
                )
            name, start, end = self._identifier_split(seq_id)
            # The default accession (the name) will be overridden by
            # _populate_meta_data if an explicit accession is provided.
            record = SeqRecord(
                Seq(seq),
                id=seq_id,
                name=name,
                description=seq_id,
                annotations={"accession": name},
            )
            if start is not None:
                record.annotations["start"] = start
            if end is not None:
                record.annotations["end"] = end

            self._populate_meta_data(seq_id, record)
            records.append(record)

        for k, v in gc.items():
            if len(v) != alignment_length:
                raise ValueError(
                    "%s length %i, expected %i" % (k, len(v), alignment_length)
                )

        alignment = MultipleSeqAlignment(records)

        for k, v in sorted(gc.items()):
            if k in self.pfam_gc_mapping:
                alignment.column_annotations[self.pfam_gc_mapping[k]] = v
            elif k.endswith("_cons") and k[:-5] in self.pfam_gr_mapping:
                alignment.column_annotations[self.pfam_gr_mapping[k[:-5]]] = v
            else:
                # Not a recognised PFAM feature: keep it under a prefixed key.
                alignment.column_annotations["GC:" + k] = v

        # TODO - Introduce an annotated alignment class?
        # For now, store the annotation a new private property:
        alignment._annotations = gr

        return alignment

    def _identifier_split(self, identifier):
        """Return (name, start, end) tuple from an identifier (PRIVATE).

        If the identifier ends in "/<start>-<end>" with integer coordinates,
        the name is the text before the final "/" and start and end are
        integers.  Otherwise the whole identifier is returned as the name,
        with start and end both None.
        """
        if "/" in identifier:
            name, start_end = identifier.rsplit("/", 1)
            if start_end.count("-") == 1:
                try:
                    start, end = start_end.split("-")
                    return name, int(start), int(end)
                except ValueError:
                    # Non-integers after final '/' - fall through
                    pass
        return identifier, None, None

    def _get_meta_data(self, identifier, meta_dict):
        """Take an identifier and returns dict of all meta-data matching it (PRIVATE).

        For example, given "Q9PN73_CAMJE/149-220" will return all matches to
        this or "Q9PN73_CAMJE" which the identifier without its /start-end
        suffix.

        In the example below, the suffix is required to match the AC, but must
        be removed to match the OS and OC meta-data::

            # STOCKHOLM 1.0
            #=GS Q9PN73_CAMJE/149-220 AC Q9PN73
            ...
            Q9PN73_CAMJE/149-220 NKA...
            ...
            #=GS Q9PN73_CAMJE OS Campylobacter jejuni
            #=GS Q9PN73_CAMJE OC Bacteria

        If the same feature is present under both keys, the entry stored
        under the name without the suffix is the one returned.

        This function will return an empty dictionary if no data is found.
        """
        name, _start, _end = self._identifier_split(identifier)
        if name == identifier:
            identifier_keys = [identifier]
        else:
            identifier_keys = [identifier, name]
        answer = {}
        for identifier_key in identifier_keys:
            answer.update(meta_dict.get(identifier_key, {}))
        return answer

    def _populate_meta_data(self, identifier, record):
        """Add meta-data to a SeqRecord's annotations dictionary (PRIVATE).

        This function applies the PFAM conventions.

        Raises ValueError if more than one AC (accession) line was given
        for the identifier.
        """
        seq_data = self._get_meta_data(identifier, self.seq_annotation)
        for feature, values in seq_data.items():
            # Note this dictionary contains lists!
            if feature == "AC":  # ACcession number
                if len(values) != 1:
                    # Raised explicitly (not asserted) so the check still
                    # runs when Python is started with -O.
                    raise ValueError(
                        "Expected exactly one AC (accession) line for %s, found %i"
                        % (identifier, len(values))
                    )
                record.annotations["accession"] = values[0]
            elif feature == "DE":  # DEscription
                record.description = "\n".join(values)
            elif feature == "DR":  # Database Reference
                # Should we try and parse the strings?
                record.dbxrefs = values
            elif feature in self.pfam_gs_mapping:
                record.annotations[self.pfam_gs_mapping[feature]] = ", ".join(values)
            else:
                # Not a recognised PFAM feature: keep it under a prefixed key.
                record.annotations["GS:" + feature] = ", ".join(values)

        # Now record the per-letter-annotations
        seq_col_data = self._get_meta_data(identifier, self.seq_col_annotation)
        for feature, markup in seq_col_data.items():
            # Note this dictionary contains strings!
            if feature in self.pfam_gr_mapping:
                record.letter_annotations[self.pfam_gr_mapping[feature]] = markup
            else:
                # Not a recognised PFAM feature: keep it under a prefixed key.
                record.letter_annotations["GR:" + feature] = markup
if __name__ == "__main__":
    # Executed as a script: run the doctests embedded in this module.
    from Bio import _utils

    _utils.run_doctest()