Spaces:
No application file
No application file
# Copyright 2008-2015 by Peter Cock. All rights reserved. | |
# | |
# This file is part of the Biopython distribution and governed by your | |
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
# Please see the LICENSE file that should have been included as part of this | |
# package. | |
"""Bio.SeqIO support for the "pir" (aka PIR or NBRF) file format. | |
This module is for reading and writing PIR or NBRF format files as | |
SeqRecord objects. | |
You are expected to use this module via the Bio.SeqIO functions, or if | |
the file contains a sequence alignment, optionally via Bio.AlignIO instead. | |
This format was introduced for the Protein Information Resource (PIR), a | |
project of the National Biomedical Research Foundation (NBRF). The PIR | |
database itself is now part of UniProt. | |
The file format is described online at: | |
http://www.ebi.ac.uk/help/pir_frame.html | |
http://www.cmbi.kun.nl/bioinf/tools/crab_pir.html (currently down) | |
An example file in this format would be:: | |
>P1;CRAB_ANAPL | |
ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN). | |
MDITIHNPLI RRPLFSWLAP SRIFDQIFGE HLQESELLPA SPSLSPFLMR | |
SPIFRMPSWL ETGLSEMRLE KDKFSVNLDV KHFSPEELKV KVLGDMVEIH | |
GKHEERQDEH GFIAREFNRK YRIPADVDPL TITSSLSLDG VLTVSAPRKQ | |
SDVPERSIPI TREEKPAIAG AQRK* | |
>P1;CRAB_BOVIN | |
ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN). | |
MDIAIHHPWI RRPFFPFHSP SRLFDQFFGE HLLESDLFPA STSLSPFYLR | |
PPSFLRAPSW IDTGLSEMRL EKDRFSVNLD VKHFSPEELK VKVLGDVIEV | |
HGKHEERQDE HGFISREFHR KYRIPADVDP LAITSSLSSD GVLTVNGPRK | |
QASGPERTIP ITREEKPAVT AAPKK* | |
Or, an example of a multiple sequence alignment:: | |
>P1;S27231 | |
rhodopsin - northern leopard frog | |
MNGTEGPNFY IPMSNKTGVV RSPFDYPQYY LAEPWKYSVL AAYMFLLILL GLPINFMTLY | |
VTIQHKKLRT PLNYILLNLG VCNHFMVLCG FTITMYTSLH GYFVFGQTGC YFEGFFATLG | |
GEIALWSLVV LAIERYIVVC KPMSNFRFGE NHAMMGVAFT WIMALACAVP PLFGWSRYIP | |
EGMQCSCGVD YYTLKPEVNN ESFVIYMFVV HFLIPLIIIS FCYGRLVCTV KEAAAQQQES | |
ATTQKAEKEV TRMVIIMVIF FLICWVPYAY VAFYIFTHQG SEFGPIFMTV PAFFAKSSAI | |
YNPVIYIMLN KQFRNCMITT LCCGKNPFGD DDASSAATSK TEATSVSTSQ VSPA* | |
>P1;I51200 | |
rhodopsin - African clawed frog | |
MNGTEGPNFY VPMSNKTGVV RSPFDYPQYY LAEPWQYSAL AAYMFLLILL GLPINFMTLF | |
VTIQHKKLRT PLNYILLNLV FANHFMVLCG FTVTMYTSMH GYFIFGPTGC YIEGFFATLG | |
GEVALWSLVV LAVERYIVVC KPMANFRFGE NHAIMGVAFT WIMALSCAAP PLFGWSRYIP | |
EGMQCSCGVD YYTLKPEVNN ESFVIYMFIV HFTIPLIVIF FCYGRLLCTV KEAAAQQQES | |
LTTQKAEKEV TRMVVIMVVF FLICWVPYAY VAFYIFTHQG SNFGPVFMTV PAFFAKSSAI | |
YNPVIYIVLN KQFRNCLITT LCCGKNPFGD EDGSSAATSK TEASSVSSSQ VSPA* | |
>P1;JN0120 | |
rhodopsin - Japanese lamprey | |
MNGTEGDNFY VPFSNKTGLA RSPYEYPQYY LAEPWKYSAL AAYMFFLILV GFPVNFLTLF | |
VTVQHKKLRT PLNYILLNLA MANLFMVLFG FTVTMYTSMN GYFVFGPTMC SIEGFFATLG | |
GEVALWSLVV LAIERYIVIC KPMGNFRFGN THAIMGVAFT WIMALACAAP PLVGWSRYIP | |
EGMQCSCGPD YYTLNPNFNN ESYVVYMFVV HFLVPFVIIF FCYGRLLCTV KEAAAAQQES | |
ASTQKAEKEV TRMVVLMVIG FLVCWVPYAS VAFYIFTHQG SDFGATFMTL PAFFAKSSAL | |
YNPVIYILMN KQFRNCMITT LCCGKNPLGD DE-SGASTSKT EVSSVSTSPV SPA* | |
As with the FASTA format, each record starts with a line beginning with a ">"
character. There is then a two letter sequence type (P1, F1, DL, DC, RL,
RC, or XX), a semicolon, and the identification code. The second line is a
free text description. The remaining lines contain the sequence itself,
terminating in an asterisk. Space separated blocks of ten letters as shown
above are typical.
Sequence codes and their meanings: | |
- P1 - Protein (complete) | |
- F1 - Protein (fragment) | |
- D1 - DNA (e.g. EMBOSS seqret output) | |
- DL - DNA (linear) | |
- DC - DNA (circular) | |
- RL - RNA (linear) | |
- RC - RNA (circular) | |
- N3 - tRNA | |
- N1 - Other functional RNA | |
- XX - Unknown | |
""" | |
from Bio.Seq import Seq | |
from Bio.SeqRecord import SeqRecord | |
from .Interfaces import _get_seq_string | |
from .Interfaces import SequenceIterator | |
from .Interfaces import SequenceWriter | |
# Maps each two-letter PIR sequence type code to the value stored in the
# "molecule_type" annotation of parsed records (None means no annotation is
# set). The keys double as the set of codes accepted by the parser and writer.
_pir_mol_type = {
    "P1": "protein",  # Protein (complete)
    "F1": "protein",  # Protein (fragment)
    "D1": "DNA",  # DNA (e.g. EMBOSS seqret output)
    "DL": "DNA",  # DNA (linear)
    "DC": "DNA",  # DNA (circular)
    "RL": "RNA",  # RNA (linear)
    "RC": "RNA",  # RNA (circular)
    "N3": "RNA",  # tRNA
    "N1": "RNA",  # Other functional RNA
    "XX": None,  # Unknown
}
class PirIterator(SequenceIterator):
    """Parser for PIR files."""

    def __init__(self, source):
        """Iterate over a PIR file and yield SeqRecord objects.

        source - file-like object or a path to a file.

        Examples
        --------
        >>> with open("NBRF/DMB_prot.pir") as handle:
        ...     for record in PirIterator(handle):
        ...         print("%s length %i" % (record.id, len(record)))
        HLA:HLA00489 length 263
        HLA:HLA00490 length 94
        HLA:HLA00491 length 94
        HLA:HLA00492 length 80
        HLA:HLA00493 length 175
        HLA:HLA01083 length 188

        """
        super().__init__(source, mode="t", fmt="Pir")

    def parse(self, handle):
        """Start parsing the file, and return a SeqRecord generator."""
        records = self.iterate(handle)
        return records

    def iterate(self, handle):
        """Iterate over the records in the PIR file.

        Yields one SeqRecord per record. The identifier from the header line
        is used for both ``id`` and ``name``, the second line of the record
        becomes the ``description``, the two-letter code is stored in
        ``annotations["PIR-type"]`` and, when the code maps to a molecule
        type, that is stored in ``annotations["molecule_type"]``.

        Raises ValueError if a header line does not start with ``>XX;``
        (XX being a known sequence type) or if the sequence, which may be
        empty, is not terminated by an asterisk.
        """
        # Skip any text before the first record (e.g. blank lines, comments)
        for line in handle:
            if line.startswith(">"):
                break
        else:
            return  # Premature end of file, or just empty?

        while True:
            # Here ``line`` is always the header line of the current record.
            pir_type = line[1:3]
            # Slice (not index) so that a truncated header such as ">P1" is
            # reported via the ValueError below rather than an IndexError.
            if pir_type not in _pir_mol_type or line[3:4] != ";":
                raise ValueError(
                    "Records should start with '>XX;' where XX is a valid sequence type"
                )
            identifier = line[4:].strip()
            description = handle.readline().strip()

            lines = []
            for line in handle:
                if line.startswith(">"):
                    # Start of the next record; keep it in ``line`` for the
                    # next pass of the outer loop.
                    break
                # Remove trailing whitespace, and any internal spaces
                lines.append(line.rstrip().replace(" ", ""))
            else:
                # Ran out of input: this is the last record.
                line = None
            seq = "".join(lines)
            # endswith() also covers a record with no sequence lines at all,
            # which would otherwise fail with an IndexError on seq[-1].
            if not seq.endswith("*"):
                # Note the * terminator is present on nucleotide sequences too,
                # it is not a stop codon!
                raise ValueError(
                    "Sequences in PIR files should include a * terminator!"
                )

            # Return the record and then continue...
            record = SeqRecord(
                Seq(seq[:-1]), id=identifier, name=identifier, description=description
            )
            record.annotations["PIR-type"] = pir_type
            molecule_type = _pir_mol_type[pir_type]
            if molecule_type:
                record.annotations["molecule_type"] = molecule_type
            yield record

            if line is None:
                return  # StopIteration
class PirWriter(SequenceWriter):
    """Class to write PIR format files."""

    def __init__(self, handle, wrap=60, record2title=None, code=None):
        """Create a PIR writer.

        Arguments:
         - handle - Handle to an output file, e.g. as returned
           by open(filename, "w")
         - wrap - Optional line length used to wrap sequence lines.
           Defaults to wrapping the sequence at 60 characters
           Use zero (or None) for no wrapping, giving a single
           long line for the sequence.
         - record2title - Optional function to return the text to be
           used for the title line of each record.  By default
           the record.id is used for the title line, while the
           following description line is built from record.name
           and record.description.
         - code - Optional sequence code; must be one of the keys of
           the module-level ``_pir_mol_type`` table (such as P1, F1,
           D1, DL, DC, RL, RC, N3 or XX). By default None is used,
           which means auto detection based on the molecule type
           in the record annotation.

        Raises ValueError if wrap is negative.

        You can either use::

            handle = open(filename, "w")
            writer = PirWriter(handle)
            writer.write_file(myRecords)
            handle.close()

        Or, follow the sequential file writer system, for example::

            handle = open(filename, "w")
            writer = PirWriter(handle)
            writer.write_header() # does nothing for PIR files
            ...
            Multiple writer.write_record() and/or writer.write_records() calls
            ...
            writer.write_footer() # does nothing for PIR files
            handle.close()

        """
        super().__init__(handle)
        # Zero and None both mean "do not wrap" and are stored as None.
        self.wrap = None
        if wrap:
            if wrap < 1:
                raise ValueError("wrap should be None, 0, or a positive integer")
            self.wrap = wrap
        self.record2title = record2title
        self.code = code

    def write_record(self, record):
        """Write a single PIR record to the file.

        The output is a ``>CODE;title`` header line, a description line, and
        the sequence (wrapped if ``self.wrap`` is set) terminated by ``*``.

        Raises TypeError if the sequence code (explicit or auto-detected) is
        not a key of ``_pir_mol_type``.
        """
        if self.record2title:
            title = self.clean(self.record2title(record))
        else:
            title = self.clean(record.id)

        if record.name and record.description:
            description = self.clean(record.name + " - " + record.description)
        elif record.name and not record.description:
            description = self.clean(record.name)
        else:
            description = self.clean(record.description)

        if self.code:
            code = self.code
        else:
            # Auto-detect the code from the molecule type annotation.
            molecule_type = record.annotations.get("molecule_type")
            if molecule_type is None:
                code = "XX"
            elif "DNA" in molecule_type:
                code = "D1"
            elif "RNA" in molecule_type:
                code = "RL"
            elif "protein" in molecule_type:
                code = "P1"
            else:
                code = "XX"

        if code not in _pir_mol_type:
            # Join the keys explicitly: concatenating a str with dict.keys()
            # would itself fail and hide the intended message.
            raise TypeError(
                f"Sequence code must be one of {', '.join(_pir_mol_type)}."
            )
        # Sanity checks: self.clean() is expected to have removed any line
        # breaks, which would otherwise corrupt the two header lines.
        assert "\n" not in title and "\r" not in title
        assert "\n" not in description and "\r" not in description

        self.handle.write(f">{code};{title}\n{description}\n")

        data = _get_seq_string(record)  # Catches sequence being None

        assert "\n" not in data
        assert "\r" not in data

        if self.wrap:
            # Split into fixed-width chunks; the * terminator goes directly
            # after the last chunk (or alone on a line for an empty sequence).
            chunks = [
                data[i : i + self.wrap] for i in range(0, len(data), self.wrap)
            ]
            self.handle.write("\n".join(chunks) + "*\n")
        else:
            self.handle.write(data + "*\n")
if __name__ == "__main__":
    # Only when executed directly as a script: run this module's doctests
    # quietly via Biopython's helper.
    from Bio._utils import run_doctest

    run_doctest(verbose=0)