Spaces:
No application file
No application file
File size: 10,295 Bytes
b7731cd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 |
# Copyright 2008-2015 by Peter Cock. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Bio.SeqIO support for the "pir" (aka PIR or NBRF) file format.
This module is for reading and writing PIR or NBRF format files as
SeqRecord objects.
You are expected to use this module via the Bio.SeqIO functions, or if
the file contains a sequence alignment, optionally via Bio.AlignIO instead.
This format was introduced for the Protein Information Resource (PIR), a
project of the National Biomedical Research Foundation (NBRF). The PIR
database itself is now part of UniProt.
The file format is described online at:
http://www.ebi.ac.uk/help/pir_frame.html
http://www.cmbi.kun.nl/bioinf/tools/crab_pir.html (currently down)
An example file in this format would be::
>P1;CRAB_ANAPL
ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN).
MDITIHNPLI RRPLFSWLAP SRIFDQIFGE HLQESELLPA SPSLSPFLMR
SPIFRMPSWL ETGLSEMRLE KDKFSVNLDV KHFSPEELKV KVLGDMVEIH
GKHEERQDEH GFIAREFNRK YRIPADVDPL TITSSLSLDG VLTVSAPRKQ
SDVPERSIPI TREEKPAIAG AQRK*
>P1;CRAB_BOVIN
ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN).
MDIAIHHPWI RRPFFPFHSP SRLFDQFFGE HLLESDLFPA STSLSPFYLR
PPSFLRAPSW IDTGLSEMRL EKDRFSVNLD VKHFSPEELK VKVLGDVIEV
HGKHEERQDE HGFISREFHR KYRIPADVDP LAITSSLSSD GVLTVNGPRK
QASGPERTIP ITREEKPAVT AAPKK*
Or, an example of a multiple sequence alignment::
>P1;S27231
rhodopsin - northern leopard frog
MNGTEGPNFY IPMSNKTGVV RSPFDYPQYY LAEPWKYSVL AAYMFLLILL GLPINFMTLY
VTIQHKKLRT PLNYILLNLG VCNHFMVLCG FTITMYTSLH GYFVFGQTGC YFEGFFATLG
GEIALWSLVV LAIERYIVVC KPMSNFRFGE NHAMMGVAFT WIMALACAVP PLFGWSRYIP
EGMQCSCGVD YYTLKPEVNN ESFVIYMFVV HFLIPLIIIS FCYGRLVCTV KEAAAQQQES
ATTQKAEKEV TRMVIIMVIF FLICWVPYAY VAFYIFTHQG SEFGPIFMTV PAFFAKSSAI
YNPVIYIMLN KQFRNCMITT LCCGKNPFGD DDASSAATSK TEATSVSTSQ VSPA*
>P1;I51200
rhodopsin - African clawed frog
MNGTEGPNFY VPMSNKTGVV RSPFDYPQYY LAEPWQYSAL AAYMFLLILL GLPINFMTLF
VTIQHKKLRT PLNYILLNLV FANHFMVLCG FTVTMYTSMH GYFIFGPTGC YIEGFFATLG
GEVALWSLVV LAVERYIVVC KPMANFRFGE NHAIMGVAFT WIMALSCAAP PLFGWSRYIP
EGMQCSCGVD YYTLKPEVNN ESFVIYMFIV HFTIPLIVIF FCYGRLLCTV KEAAAQQQES
LTTQKAEKEV TRMVVIMVVF FLICWVPYAY VAFYIFTHQG SNFGPVFMTV PAFFAKSSAI
YNPVIYIVLN KQFRNCLITT LCCGKNPFGD EDGSSAATSK TEASSVSSSQ VSPA*
>P1;JN0120
rhodopsin - Japanese lamprey
MNGTEGDNFY VPFSNKTGLA RSPYEYPQYY LAEPWKYSAL AAYMFFLILV GFPVNFLTLF
VTVQHKKLRT PLNYILLNLA MANLFMVLFG FTVTMYTSMN GYFVFGPTMC SIEGFFATLG
GEVALWSLVV LAIERYIVIC KPMGNFRFGN THAIMGVAFT WIMALACAAP PLVGWSRYIP
EGMQCSCGPD YYTLNPNFNN ESYVVYMFVV HFLVPFVIIF FCYGRLLCTV KEAAAAQQES
ASTQKAEKEV TRMVVLMVIG FLVCWVPYAS VAFYIFTHQG SDFGATFMTL PAFFAKSSAL
YNPVIYILMN KQFRNCMITT LCCGKNPLGD DE-SGASTSKT EVSSVSTSPV SPA*
As with the FASTA format, each record starts with a line beginning with ">"
character. There is then a two letter sequence type (P1, F1, DL, DC, RL,
RC, or XX), a semi colon, and the identification code. The second like is
free text description. The remaining lines contain the sequence itself,
terminating in an asterisk. Space separated blocks of ten letters as shown
above are typical.
Sequence codes and their meanings:
- P1 - Protein (complete)
- F1 - Protein (fragment)
- D1 - DNA (e.g. EMBOSS seqret output)
- DL - DNA (linear)
- DC - DNA (circular)
- RL - RNA (linear)
- RC - RNA (circular)
- N3 - tRNA
- N1 - Other functional RNA
- XX - Unknown
"""
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from .Interfaces import _get_seq_string
from .Interfaces import SequenceIterator
from .Interfaces import SequenceWriter
_pir_mol_type = {
"P1": "protein",
"F1": "protein",
"D1": "DNA",
"DL": "DNA",
"DC": "DNA",
"RL": "RNA",
"RC": "RNA",
"N3": "RNA",
"XX": None,
}
class PirIterator(SequenceIterator):
"""Parser for PIR files."""
def __init__(self, source):
"""Iterate over a PIR file and yield SeqRecord objects.
source - file-like object or a path to a file.
Examples
--------
>>> with open("NBRF/DMB_prot.pir") as handle:
... for record in PirIterator(handle):
... print("%s length %i" % (record.id, len(record)))
HLA:HLA00489 length 263
HLA:HLA00490 length 94
HLA:HLA00491 length 94
HLA:HLA00492 length 80
HLA:HLA00493 length 175
HLA:HLA01083 length 188
"""
super().__init__(source, mode="t", fmt="Pir")
def parse(self, handle):
"""Start parsing the file, and return a SeqRecord generator."""
records = self.iterate(handle)
return records
def iterate(self, handle):
"""Iterate over the records in the PIR file."""
# Skip any text before the first record (e.g. blank lines, comments)
for line in handle:
if line[0] == ">":
break
else:
return # Premature end of file, or just empty?
while True:
pir_type = line[1:3]
if pir_type not in _pir_mol_type or line[3] != ";":
raise ValueError(
"Records should start with '>XX;' where XX is a valid sequence type"
)
identifier = line[4:].strip()
description = handle.readline().strip()
lines = []
for line in handle:
if line[0] == ">":
break
# Remove trailing whitespace, and any internal spaces
lines.append(line.rstrip().replace(" ", ""))
else:
line = None
seq = "".join(lines)
if seq[-1] != "*":
# Note the * terminator is present on nucleotide sequences too,
# it is not a stop codon!
raise ValueError(
"Sequences in PIR files should include a * terminator!"
)
# Return the record and then continue...
record = SeqRecord(
Seq(seq[:-1]), id=identifier, name=identifier, description=description
)
record.annotations["PIR-type"] = pir_type
if _pir_mol_type[pir_type]:
record.annotations["molecule_type"] = _pir_mol_type[pir_type]
yield record
if line is None:
return # StopIteration
raise ValueError("Unrecognised PIR record format.")
class PirWriter(SequenceWriter):
"""Class to write PIR format files."""
def __init__(self, handle, wrap=60, record2title=None, code=None):
"""Create a PIR writer.
Arguments:
- handle - Handle to an output file, e.g. as returned
by open(filename, "w")
- wrap - Optional line length used to wrap sequence lines.
Defaults to wrapping the sequence at 60 characters
Use zero (or None) for no wrapping, giving a single
long line for the sequence.
- record2title - Optional function to return the text to be
used for the title line of each record. By default
a combination of the record.id, record.name and
record.description is used.
- code - Optional sequence code must be one of P1, F1,
D1, DL, DC, RL, RC, N3 and XX. By default None is used,
which means auto detection based on the molecule type
in the record annotation.
You can either use::
handle = open(filename, "w")
writer = PirWriter(handle)
writer.write_file(myRecords)
handle.close()
Or, follow the sequential file writer system, for example::
handle = open(filename, "w")
writer = PirWriter(handle)
writer.write_header() # does nothing for PIR files
...
Multiple writer.write_record() and/or writer.write_records() calls
...
writer.write_footer() # does nothing for PIR files
handle.close()
"""
super().__init__(handle)
self.wrap = None
if wrap:
if wrap < 1:
raise ValueError("wrap should be None, 0, or a positive integer")
self.wrap = wrap
self.record2title = record2title
self.code = code
def write_record(self, record):
"""Write a single PIR record to the file."""
if self.record2title:
title = self.clean(self.record2title(record))
else:
title = self.clean(record.id)
if record.name and record.description:
description = self.clean(record.name + " - " + record.description)
elif record.name and not record.description:
description = self.clean(record.name)
else:
description = self.clean(record.description)
if self.code:
code = self.code
else:
molecule_type = record.annotations.get("molecule_type")
if molecule_type is None:
code = "XX"
elif "DNA" in molecule_type:
code = "D1"
elif "RNA" in molecule_type:
code = "RL"
elif "protein" in molecule_type:
code = "P1"
else:
code = "XX"
if code not in _pir_mol_type:
raise TypeError(
"Sequence code must be one of " + _pir_mol_type.keys() + "."
)
assert "\n" not in title
assert "\r" not in description
self.handle.write(f">{code};{title}\n{description}\n")
data = _get_seq_string(record) # Catches sequence being None
assert "\n" not in data
assert "\r" not in data
if self.wrap:
line = ""
for i in range(0, len(data), self.wrap):
line += data[i : i + self.wrap] + "\n"
line = line[:-1] + "*\n"
self.handle.write(line)
else:
self.handle.write(data + "*\n")
if __name__ == "__main__":
from Bio._utils import run_doctest
run_doctest(verbose=0)
|