Spaces:

aakash0017
/

DrVai-Rag-Testing

No application file

App Files Files Community

DrVai-Rag-Testing / myenv /lib /python3.10 /site-packages /Bio /SeqIO /PirIO.py

aakash0017

Upload folder using huggingface_hub

b7731cd over 1 year ago

raw

history blame contribute delete

10.3 kB

	# Copyright 2008-2015 by Peter Cock. All rights reserved.
	#
	# This file is part of the Biopython distribution and governed by your
	# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
	# Please see the LICENSE file that should have been included as part of this
	# package.
	"""Bio.SeqIO support for the "pir" (aka PIR or NBRF) file format.

	This module is for reading and writing PIR or NBRF format files as
	SeqRecord objects.

	You are expected to use this module via the Bio.SeqIO functions, or if
	the file contains a sequence alignment, optionally via Bio.AlignIO instead.

	This format was introduced for the Protein Information Resource (PIR), a
	project of the National Biomedical Research Foundation (NBRF). The PIR
	database itself is now part of UniProt.

	The file format is described online at:
	http://www.ebi.ac.uk/help/pir_frame.html
	http://www.cmbi.kun.nl/bioinf/tools/crab_pir.html (currently down)

	An example file in this format would be::

	>P1;CRAB_ANAPL
	ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN).
	MDITIHNPLI RRPLFSWLAP SRIFDQIFGE HLQESELLPA SPSLSPFLMR
	SPIFRMPSWL ETGLSEMRLE KDKFSVNLDV KHFSPEELKV KVLGDMVEIH
	GKHEERQDEH GFIAREFNRK YRIPADVDPL TITSSLSLDG VLTVSAPRKQ
	SDVPERSIPI TREEKPAIAG AQRK*

	>P1;CRAB_BOVIN
	ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN).
	MDIAIHHPWI RRPFFPFHSP SRLFDQFFGE HLLESDLFPA STSLSPFYLR
	PPSFLRAPSW IDTGLSEMRL EKDRFSVNLD VKHFSPEELK VKVLGDVIEV
	HGKHEERQDE HGFISREFHR KYRIPADVDP LAITSSLSSD GVLTVNGPRK
	QASGPERTIP ITREEKPAVT AAPKK*

	Or, an example of a multiple sequence alignment::

	>P1;S27231
	rhodopsin - northern leopard frog
	MNGTEGPNFY IPMSNKTGVV RSPFDYPQYY LAEPWKYSVL AAYMFLLILL GLPINFMTLY
	VTIQHKKLRT PLNYILLNLG VCNHFMVLCG FTITMYTSLH GYFVFGQTGC YFEGFFATLG
	GEIALWSLVV LAIERYIVVC KPMSNFRFGE NHAMMGVAFT WIMALACAVP PLFGWSRYIP
	EGMQCSCGVD YYTLKPEVNN ESFVIYMFVV HFLIPLIIIS FCYGRLVCTV KEAAAQQQES
	ATTQKAEKEV TRMVIIMVIF FLICWVPYAY VAFYIFTHQG SEFGPIFMTV PAFFAKSSAI
	YNPVIYIMLN KQFRNCMITT LCCGKNPFGD DDASSAATSK TEATSVSTSQ VSPA*

	>P1;I51200
	rhodopsin - African clawed frog
	MNGTEGPNFY VPMSNKTGVV RSPFDYPQYY LAEPWQYSAL AAYMFLLILL GLPINFMTLF
	VTIQHKKLRT PLNYILLNLV FANHFMVLCG FTVTMYTSMH GYFIFGPTGC YIEGFFATLG
	GEVALWSLVV LAVERYIVVC KPMANFRFGE NHAIMGVAFT WIMALSCAAP PLFGWSRYIP
	EGMQCSCGVD YYTLKPEVNN ESFVIYMFIV HFTIPLIVIF FCYGRLLCTV KEAAAQQQES
	LTTQKAEKEV TRMVVIMVVF FLICWVPYAY VAFYIFTHQG SNFGPVFMTV PAFFAKSSAI
	YNPVIYIVLN KQFRNCLITT LCCGKNPFGD EDGSSAATSK TEASSVSSSQ VSPA*

	>P1;JN0120
	rhodopsin - Japanese lamprey
	MNGTEGDNFY VPFSNKTGLA RSPYEYPQYY LAEPWKYSAL AAYMFFLILV GFPVNFLTLF
	VTVQHKKLRT PLNYILLNLA MANLFMVLFG FTVTMYTSMN GYFVFGPTMC SIEGFFATLG
	GEVALWSLVV LAIERYIVIC KPMGNFRFGN THAIMGVAFT WIMALACAAP PLVGWSRYIP
	EGMQCSCGPD YYTLNPNFNN ESYVVYMFVV HFLVPFVIIF FCYGRLLCTV KEAAAAQQES
	ASTQKAEKEV TRMVVLMVIG FLVCWVPYAS VAFYIFTHQG SDFGATFMTL PAFFAKSSAL
	YNPVIYILMN KQFRNCMITT LCCGKNPLGD DE-SGASTSKT EVSSVSTSPV SPA*


	As with the FASTA format, each record starts with a line beginning with ">"
	character. There is then a two letter sequence type (P1, F1, DL, DC, RL,
	RC, or XX), a semi colon, and the identification code. The second like is
	free text description. The remaining lines contain the sequence itself,
	terminating in an asterisk. Space separated blocks of ten letters as shown
	above are typical.

	Sequence codes and their meanings:
	- P1 - Protein (complete)
	- F1 - Protein (fragment)
	- D1 - DNA (e.g. EMBOSS seqret output)
	- DL - DNA (linear)
	- DC - DNA (circular)
	- RL - RNA (linear)
	- RC - RNA (circular)
	- N3 - tRNA
	- N1 - Other functional RNA
	- XX - Unknown

	"""
	from Bio.Seq import Seq
	from Bio.SeqRecord import SeqRecord

	from .Interfaces import _get_seq_string
	from .Interfaces import SequenceIterator
	from .Interfaces import SequenceWriter


	_pir_mol_type = {
	"P1": "protein",
	"F1": "protein",
	"D1": "DNA",
	"DL": "DNA",
	"DC": "DNA",
	"RL": "RNA",
	"RC": "RNA",
	"N3": "RNA",
	"XX": None,
	}


	class PirIterator(SequenceIterator):
	"""Parser for PIR files."""

	def __init__(self, source):
	"""Iterate over a PIR file and yield SeqRecord objects.

	source - file-like object or a path to a file.

	Examples
	--------
	>>> with open("NBRF/DMB_prot.pir") as handle:
	... for record in PirIterator(handle):
	... print("%s length %i" % (record.id, len(record)))
	HLA:HLA00489 length 263
	HLA:HLA00490 length 94
	HLA:HLA00491 length 94
	HLA:HLA00492 length 80
	HLA:HLA00493 length 175
	HLA:HLA01083 length 188

	"""
	super().__init__(source, mode="t", fmt="Pir")

	def parse(self, handle):
	"""Start parsing the file, and return a SeqRecord generator."""
	records = self.iterate(handle)
	return records

	def iterate(self, handle):
	"""Iterate over the records in the PIR file."""
	# Skip any text before the first record (e.g. blank lines, comments)
	for line in handle:
	if line[0] == ">":
	break
	else:
	return # Premature end of file, or just empty?

	while True:
	pir_type = line[1:3]
	if pir_type not in _pir_mol_type or line[3] != ";":
	raise ValueError(
	"Records should start with '>XX;' where XX is a valid sequence type"
	)
	identifier = line[4:].strip()
	description = handle.readline().strip()

	lines = []
	for line in handle:
	if line[0] == ">":
	break
	# Remove trailing whitespace, and any internal spaces
	lines.append(line.rstrip().replace(" ", ""))
	else:
	line = None
	seq = "".join(lines)
	if seq[-1] != "*":
	# Note the * terminator is present on nucleotide sequences too,
	# it is not a stop codon!
	raise ValueError(
	"Sequences in PIR files should include a * terminator!"
	)

	# Return the record and then continue...
	record = SeqRecord(
	Seq(seq[:-1]), id=identifier, name=identifier, description=description
	)
	record.annotations["PIR-type"] = pir_type
	if _pir_mol_type[pir_type]:
	record.annotations["molecule_type"] = _pir_mol_type[pir_type]
	yield record

	if line is None:
	return # StopIteration
	raise ValueError("Unrecognised PIR record format.")


	class PirWriter(SequenceWriter):
	"""Class to write PIR format files."""

	def __init__(self, handle, wrap=60, record2title=None, code=None):
	"""Create a PIR writer.

	Arguments:
	- handle - Handle to an output file, e.g. as returned
	by open(filename, "w")
	- wrap - Optional line length used to wrap sequence lines.
	Defaults to wrapping the sequence at 60 characters
	Use zero (or None) for no wrapping, giving a single
	long line for the sequence.
	- record2title - Optional function to return the text to be
	used for the title line of each record. By default
	a combination of the record.id, record.name and
	record.description is used.
	- code - Optional sequence code must be one of P1, F1,
	D1, DL, DC, RL, RC, N3 and XX. By default None is used,
	which means auto detection based on the molecule type
	in the record annotation.

	You can either use::

	handle = open(filename, "w")
	writer = PirWriter(handle)
	writer.write_file(myRecords)
	handle.close()

	Or, follow the sequential file writer system, for example::

	handle = open(filename, "w")
	writer = PirWriter(handle)
	writer.write_header() # does nothing for PIR files
	...
	Multiple writer.write_record() and/or writer.write_records() calls
	...
	writer.write_footer() # does nothing for PIR files
	handle.close()

	"""
	super().__init__(handle)
	self.wrap = None
	if wrap:
	if wrap < 1:
	raise ValueError("wrap should be None, 0, or a positive integer")
	self.wrap = wrap
	self.record2title = record2title
	self.code = code

	def write_record(self, record):
	"""Write a single PIR record to the file."""
	if self.record2title:
	title = self.clean(self.record2title(record))
	else:
	title = self.clean(record.id)

	if record.name and record.description:
	description = self.clean(record.name + " - " + record.description)
	elif record.name and not record.description:
	description = self.clean(record.name)
	else:
	description = self.clean(record.description)

	if self.code:
	code = self.code
	else:
	molecule_type = record.annotations.get("molecule_type")
	if molecule_type is None:
	code = "XX"
	elif "DNA" in molecule_type:
	code = "D1"
	elif "RNA" in molecule_type:
	code = "RL"
	elif "protein" in molecule_type:
	code = "P1"
	else:
	code = "XX"

	if code not in _pir_mol_type:
	raise TypeError(
	"Sequence code must be one of " + _pir_mol_type.keys() + "."
	)
	assert "\n" not in title
	assert "\r" not in description

	self.handle.write(f">{code};{title}\n{description}\n")

	data = _get_seq_string(record) # Catches sequence being None

	assert "\n" not in data
	assert "\r" not in data

	if self.wrap:
	line = ""
	for i in range(0, len(data), self.wrap):
	line += data[i : i + self.wrap] + "\n"
	line = line[:-1] + "*\n"
	self.handle.write(line)
	else:
	self.handle.write(data + "*\n")


	if __name__ == "__main__":
	from Bio._utils import run_doctest

	run_doctest(verbose=0)