Spaces:

aakash0017
/

DrVai-Rag-Testing

No application file

App Files Files Community

DrVai-Rag-Testing / myenv /lib /python3.10 /site-packages /Bio /UniProt /GOA.py

aakash0017

Upload folder using huggingface_hub

b7731cd over 1 year ago

raw

history blame contribute delete

15.8 kB

	#!/usr/bin/env python
	# Copyright 2013, 2016 by Iddo Friedberg [email protected]. All rights reserved.
	# Copyright 2020 by Sergio Valqui. All rights reserved.
	#
	# This file is part of the Biopython distribution and governed by your
	# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
	# Please see the LICENSE file that should have been included as part of this
	# package.
	"""Parsers for the GAF, GPA and GPI formats from UniProt-GOA.

	UniProt-GOA README + GAF format description:
	ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/UNIPROT/README

	Gene Association File, GAF formats:
	http://geneontology.org/docs/go-annotation-file-gaf-format-2.2/
	http://geneontology.org/docs/go-annotation-file-gaf-format-2.1/
	http://geneontology.org/docs/go-annotation-file-gaf-format-2.0/

	Gene Product Association Data (GPA format) README:
	http://geneontology.org/docs/gene-product-association-data-gpad-format/

	Gene Product Information (GPI format) README:
	http://geneontology.org/docs/gene-product-information-gpi-format/

	GO annotation files are located here:
	ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/
	"""


	import copy

	# GAF: GO Annotation Format
	#
	# Column names of a GAF version 2.0 record, in file order: the fifteen
	# GAF 1.0 columns followed by two additional trailing columns.

	GAF20FIELDS = [
	    "DB",
	    "DB_Object_ID",
	    "DB_Object_Symbol",
	    "Qualifier",
	    "GO_ID",
	    "DB:Reference",
	    "Evidence",
	    "With",
	    "Aspect",
	    "DB_Object_Name",
	    "Synonym",
	    "DB_Object_Type",
	    "Taxon_ID",
	    "Date",
	    "Assigned_By",
	    "Annotation_Extension",
	    "Gene_Product_Form_ID",
	]

	# Column names of a GAF version 1.0 record, in file order (15 columns).
	GAF10FIELDS = [
	    "DB",
	    "DB_Object_ID",
	    "DB_Object_Symbol",
	    "Qualifier",
	    "GO_ID",
	    "DB:Reference",
	    "Evidence",
	    "With",
	    "Aspect",
	    "DB_Object_Name",
	    "Synonym",
	    "DB_Object_Type",
	    "Taxon_ID",
	    "Date",
	    "Assigned_By",
	]

	# Column names of a GPA version 1.0 record, in file order (12 columns).
	# NOTE: these keys are spelled "Evidence code" (with a space) and
	# "Assigned_by" (lower-case "by"); they are dictionary keys seen by callers.
	GPA10FIELDS = [
	    "DB",
	    "DB_Object_ID",
	    "Qualifier",
	    "GO_ID",
	    "DB:Reference",
	    "Evidence code",
	    "With",
	    "Interacting_taxon_ID",
	    "Date",
	    "Assigned_by",
	    "Annotation_Extension",
	    "Spliceform_ID",
	]

	# Column names of a GPA version 1.1 record, in file order (12 columns).
	# NOTE: the key "Annotation Extension" contains a space here, whereas the
	# GPA 1.0 list uses "Annotation_Extension"; the spelling is a dictionary
	# key seen by callers, so it is kept exactly as is.
	GPA11FIELDS = [
	    "DB",
	    "DB_Object_ID",
	    "Qualifier",
	    "GO_ID",
	    "DB:Reference",
	    "ECO_Evidence_code",
	    "With",
	    "Interacting_taxon_ID",
	    "Date",
	    "Assigned_by",
	    "Annotation Extension",
	    "Annotation_Properties",
	]

	# Column names of a GPI version 1.0 record, in file order (11 columns).
	GPI10FIELDS = [
	    "DB",
	    "DB_subset",
	    "DB_Object_ID",
	    "DB_Object_Symbol",
	    "DB_Object_Name",
	    "DB_Object_Synonym",
	    "DB_Object_Type",
	    "Taxon",
	    "Annotation_Target_Set",
	    "Annotation_Completed",
	    "Parent_Object_ID",
	]

	# Column names of a GPI version 1.1 record, in file order (9 columns).
	# Unlike the GPI 1.0 and 1.2 lists, this one has no leading "DB" column.
	GPI11FIELDS = [
	    "DB_Object_ID",
	    "DB_Object_Symbol",
	    "DB_Object_Name",
	    "DB_Object_Synonym",
	    "DB_Object_Type",
	    "Taxon",
	    "Parent_Object_ID",
	    "DB_Xref",
	    "Gene_Product_Properties",
	]

	# Column names of a GPI version 1.2 record, in file order (10 columns):
	# the GPI 1.1 columns preceded by a "DB" column.
	GPI12FIELDS = [
	    "DB",
	    "DB_Object_ID",
	    "DB_Object_Symbol",
	    "DB_Object_Name",
	    "DB_Object_Synonym",
	    "DB_Object_Type",
	    "Taxon",
	    "Parent_Object_ID",
	    "DB_Xref",
	    "Gene_Product_Properties",
	]


	def _gpi10iterator(handle):
	    """Read GPI 1.0 format files (PRIVATE).

	    This iterator is used to read a gp_information.goa_uniprot
	    file which is in the GPI 1.0 format.

	    Lines starting with "!" and lines that contain no tab are skipped.
	    Every other line is split on tabs and yielded as a dict keyed by
	    GPI10FIELDS; the DB_Object_Synonym and Annotation_Target_Set columns
	    are further split on "|" into lists of strings.
	    """
	    for inline in handle:
	        # startswith() also copes with an empty string, unlike inline[0].
	        if inline.startswith("!"):
	            continue
	        inrec = inline.rstrip("\n").split("\t")
	        if len(inrec) == 1:
	            # No tab at all: blank line or something that is not a record.
	            continue
	        inrec[5] = inrec[5].split("|")  # DB_Object_Synonym(s)
	        inrec[8] = inrec[8].split("|")  # Annotation_Target_Set
	        yield dict(zip(GPI10FIELDS, inrec))


	def _gpi11iterator(handle):
	    """Read GPI 1.1 format files (PRIVATE).

	    This iterator is used to read a gp_information.goa_uniprot
	    file which is in the GPI 1.1 format.

	    Lines starting with "!" and lines that contain no tab are skipped.
	    Every other line is split on tabs and yielded as a dict keyed by
	    GPI11FIELDS; the DB_Object_Name, DB_Object_Synonym, DB_Xref and
	    Gene_Product_Properties columns are further split on "|" into lists.
	    """
	    for inline in handle:
	        # startswith() also copes with an empty string, unlike inline[0].
	        if inline.startswith("!"):
	            continue
	        inrec = inline.rstrip("\n").split("\t")
	        if len(inrec) == 1:
	            # No tab at all: blank line or something that is not a record.
	            continue
	        inrec[2] = inrec[2].split("|")  # DB_Object_Name
	        inrec[3] = inrec[3].split("|")  # DB_Object_Synonym(s)
	        inrec[7] = inrec[7].split("|")  # DB_Xref(s)
	        inrec[8] = inrec[8].split("|")  # Properties
	        yield dict(zip(GPI11FIELDS, inrec))


	def _gpi12iterator(handle):
	    """Read GPI 1.2 format files (PRIVATE).

	    This iterator is used to read a gp_information.goa_uniprot
	    file which is in the GPI 1.2 format.

	    Lines starting with "!" and lines that contain no tab are skipped.
	    Every other line is split on tabs and yielded as a dict keyed by
	    GPI12FIELDS; the DB_Object_Name, DB_Object_Synonym, DB_Xref and
	    Gene_Product_Properties columns are further split on "|" into lists.
	    """
	    for inline in handle:
	        # startswith() also copes with an empty string, unlike inline[0].
	        if inline.startswith("!"):
	            continue
	        inrec = inline.rstrip("\n").split("\t")
	        if len(inrec) == 1:
	            # No tab at all: blank line or something that is not a record.
	            continue
	        inrec[3] = inrec[3].split("|")  # DB_Object_Name
	        inrec[4] = inrec[4].split("|")  # DB_Object_Synonym(s)
	        inrec[8] = inrec[8].split("|")  # DB_Xref(s)
	        inrec[9] = inrec[9].split("|")  # Properties
	        yield dict(zip(GPI12FIELDS, inrec))


	def gpi_iterator(handle):
	    """Read GPI format files.

	    This function should be called to read a
	    gp_information.goa_uniprot file. The first line of the handle is
	    consumed and must be a "!gpi-version: X.Y" header; an iterator for
	    GPI 1.0, 1.1 or 1.2 records is returned accordingly.

	    Raises NotImplementedError for a "!gpi-version: 2.1" header and
	    ValueError for any other unrecognised first line.
	    """
	    inline = handle.readline()
	    version = inline.strip()
	    if version == "!gpi-version: 1.2":
	        return _gpi12iterator(handle)
	    if version == "!gpi-version: 1.1":
	        return _gpi11iterator(handle)
	    if version == "!gpi-version: 1.0":
	        return _gpi10iterator(handle)
	    if version == "!gpi-version: 2.1":
	        raise NotImplementedError("Sorry, parsing GPI version 2 not implemented yet.")
	    raise ValueError(f"Unknown GPI version {inline}\n")


	def _gpa10iterator(handle):
	    """Read GPA 1.0 format files (PRIVATE).

	    This iterator is used to read a gp_association.*
	    file which is in the GPA 1.0 format. Do not call directly. Rather,
	    use the gpa_iterator function.

	    Lines starting with "!" and lines that contain no tab are skipped.
	    Every other line is split on tabs and yielded as a dict keyed by
	    GPA10FIELDS; the Qualifier, DB:Reference, With and
	    Annotation_Extension columns are further split on "|" into lists.
	    """
	    for inline in handle:
	        # startswith() also copes with an empty string, unlike inline[0].
	        if inline.startswith("!"):
	            continue
	        inrec = inline.rstrip("\n").split("\t")
	        if len(inrec) == 1:
	            # No tab at all: blank line or something that is not a record.
	            continue
	        inrec[2] = inrec[2].split("|")  # Qualifier
	        inrec[4] = inrec[4].split("|")  # DB:Reference(s)
	        inrec[6] = inrec[6].split("|")  # With
	        inrec[10] = inrec[10].split("|")  # Annotation extension
	        yield dict(zip(GPA10FIELDS, inrec))


	def _gpa11iterator(handle):
	    """Read GPA 1.1 format files (PRIVATE).

	    This iterator is used to read a gp_association.goa_uniprot
	    file which is in the GPA 1.1 format. Do not call directly. Rather
	    use the gpa_iterator function.

	    Lines starting with "!" and lines that contain no tab are skipped.
	    Every other line is split on tabs and yielded as a dict keyed by
	    GPA11FIELDS; the Qualifier, DB:Reference, With and annotation
	    extension columns are further split on "|" into lists.
	    """
	    for inline in handle:
	        # startswith() also copes with an empty string, unlike inline[0].
	        if inline.startswith("!"):
	            continue
	        inrec = inline.rstrip("\n").split("\t")
	        if len(inrec) == 1:
	            # No tab at all: blank line or something that is not a record.
	            continue
	        inrec[2] = inrec[2].split("|")  # Qualifier
	        inrec[4] = inrec[4].split("|")  # DB:Reference(s)
	        inrec[6] = inrec[6].split("|")  # With
	        inrec[10] = inrec[10].split("|")  # Annotation extension
	        yield dict(zip(GPA11FIELDS, inrec))


	def gpa_iterator(handle):
	    """Read GPA format files.

	    This function should be called to read a gp_association.* file.
	    The first line of the handle is consumed and must be a
	    "!gpa-version: X.Y" header; a GPA 1.1 or a GPA 1.0 record iterator
	    is returned as needed.

	    Raises ValueError if the first line is not a recognised version header.
	    """
	    inline = handle.readline()
	    version = inline.strip()
	    if version == "!gpa-version: 1.1":
	        return _gpa11iterator(handle)
	    if version == "!gpa-version: 1.0":
	        return _gpa10iterator(handle)
	    raise ValueError(f"Unknown GPA version {inline}\n")


	def _gaf20iterator(handle):
	    """Read GAF 2.x format files, one record at a time (PRIVATE).

	    Lines starting with "!" and lines that contain no tab are skipped.
	    Every other line is split on tabs and yielded as a dict keyed by
	    GAF20FIELDS; the Qualifier, DB:Reference, With, Synonym and Taxon_ID
	    columns are further split on "|" into lists of strings.
	    """
	    for inline in handle:
	        # startswith() also copes with an empty string, unlike inline[0].
	        if inline.startswith("!"):
	            continue
	        inrec = inline.rstrip("\n").split("\t")
	        if len(inrec) == 1:
	            # No tab at all: blank line or something that is not a record.
	            continue
	        inrec[3] = inrec[3].split("|")  # Qualifier
	        inrec[5] = inrec[5].split("|")  # DB:reference(s)
	        inrec[7] = inrec[7].split("|")  # With || From
	        inrec[10] = inrec[10].split("|")  # Synonym
	        inrec[12] = inrec[12].split("|")  # Taxon
	        yield dict(zip(GAF20FIELDS, inrec))


	def _gaf10iterator(handle):
	    """Read GAF 1.0 format files, one record at a time (PRIVATE).

	    Lines starting with "!" and lines that contain no tab are skipped.
	    Every other line is split on tabs and yielded as a dict keyed by
	    GAF10FIELDS; the Qualifier, DB:Reference, With, Synonym and Taxon_ID
	    columns are further split on "|" into lists of strings.
	    """
	    for inline in handle:
	        # startswith() also copes with an empty string, unlike inline[0].
	        if inline.startswith("!"):
	            continue
	        inrec = inline.rstrip("\n").split("\t")
	        if len(inrec) == 1:
	            # No tab at all: blank line or something that is not a record.
	            continue
	        inrec[3] = inrec[3].split("|")  # Qualifier
	        inrec[5] = inrec[5].split("|")  # DB:reference(s)
	        inrec[7] = inrec[7].split("|")  # With || From
	        inrec[10] = inrec[10].split("|")  # Synonym
	        inrec[12] = inrec[12].split("|")  # Taxon
	        yield dict(zip(GAF10FIELDS, inrec))


	def _gaf10byproteiniterator(handle):
	    """Read GAF 1.0 files, grouping consecutive records by protein (PRIVATE).

	    Lines starting with "!" and lines that contain no tab are skipped.
	    Every other line is parsed into a dict keyed by GAF10FIELDS, with the
	    Qualifier, DB:Reference, With, Synonym and Taxon_ID columns split on
	    "|" into lists. Each yielded item is a list of all consecutive
	    records sharing the same DB_Object_ID, including the last group in
	    the file.
	    """
	    cur_id = None
	    id_rec_list = []
	    for inline in handle:
	        # startswith() also copes with an empty string, unlike inline[0].
	        if inline.startswith("!"):
	            continue
	        inrec = inline.rstrip("\n").split("\t")
	        if len(inrec) == 1:
	            # No tab at all: blank line or something that is not a record.
	            continue
	        inrec[3] = inrec[3].split("|")  # Qualifier
	        inrec[5] = inrec[5].split("|")  # DB:reference(s)
	        inrec[7] = inrec[7].split("|")  # With || From
	        inrec[10] = inrec[10].split("|")  # Synonym
	        inrec[12] = inrec[12].split("|")  # Taxon
	        cur_rec = dict(zip(GAF10FIELDS, inrec))
	        rec_id = cur_rec["DB_Object_ID"]
	        if cur_id and rec_id != cur_id:
	            # A new protein starts: hand out the finished group and begin
	            # a fresh list with the current record.
	            finished, id_rec_list = id_rec_list, [cur_rec]
	            cur_id = rec_id
	            yield finished
	        else:
	            cur_id = rec_id
	            id_rec_list.append(cur_rec)
	    # The group still being accumulated at end of input would otherwise
	    # be silently dropped.
	    if id_rec_list:
	        yield id_rec_list


	def _gaf20byproteiniterator(handle):
	    """Read GAF 2.x files, grouping consecutive records by protein (PRIVATE).

	    Lines starting with "!" and lines that contain no tab are skipped.
	    Every other line is parsed into a dict keyed by GAF20FIELDS, with the
	    Qualifier, DB:Reference, With, Synonym and Taxon_ID columns split on
	    "|" into lists. Each yielded item is a list of all consecutive
	    records sharing the same DB_Object_ID, including the last group in
	    the file.
	    """
	    cur_id = None
	    id_rec_list = []
	    for inline in handle:
	        # startswith() also copes with an empty string, unlike inline[0].
	        if inline.startswith("!"):
	            continue
	        inrec = inline.rstrip("\n").split("\t")
	        if len(inrec) == 1:
	            # No tab at all: blank line or something that is not a record.
	            continue
	        inrec[3] = inrec[3].split("|")  # Qualifier
	        inrec[5] = inrec[5].split("|")  # DB:reference(s)
	        inrec[7] = inrec[7].split("|")  # With || From
	        inrec[10] = inrec[10].split("|")  # Synonym
	        inrec[12] = inrec[12].split("|")  # Taxon
	        cur_rec = dict(zip(GAF20FIELDS, inrec))
	        rec_id = cur_rec["DB_Object_ID"]
	        if cur_id and rec_id != cur_id:
	            # A new protein starts: hand out the finished group and begin
	            # a fresh list with the current record.
	            finished, id_rec_list = id_rec_list, [cur_rec]
	            cur_id = rec_id
	            yield finished
	        else:
	            cur_id = rec_id
	            id_rec_list.append(cur_rec)
	    # The group still being accumulated at end of input would otherwise
	    # be silently dropped.
	    if id_rec_list:
	        yield id_rec_list


	def gafbyproteiniterator(handle):
	    """Iterate over records in a gene association file.

	    Returns a list of all consecutive records with the same DB_Object_ID
	    This function should be called to read a
	    gene_association.goa_uniprot file. The first line of the handle is
	    consumed and must be a "!gaf-version: X.Y" header; a gaf 2.0 or a
	    gaf 1.0 by-protein iterator is returned as needed.

	    GAF 2.1 and 2.2 files are read with the GAF 2.0 iterator. The change
	    from 2.1 to 2.2 is that the Qualifier field is no longer optional;
	    as no such check is done here, the 2.0 parser can be reused.

	    Raises ValueError if the first line is not a recognised version header.
	    """
	    inline = handle.readline()
	    version = inline.strip()
	    if version in ("!gaf-version: 2.0", "!gaf-version: 2.1", "!gaf-version: 2.2"):
	        # TODO: dedicated handling for GAF 2.1 / 2.2.
	        return _gaf20byproteiniterator(handle)
	    if version == "!gaf-version: 1.0":
	        return _gaf10byproteiniterator(handle)
	    raise ValueError(f"Unknown GAF version {inline}\n")


	def gafiterator(handle):
	    """Iterate over a GAF 1.0 or 2.x file.

	    This function should be called to read a
	    gene_association.goa_uniprot file. The first line of the handle is
	    consumed and must be a "!gaf-version: X.Y" header; a gaf 2.x or a
	    gaf 1.0 iterator is returned as needed.

	    GAF 2.1 and 2.2 files are read with the GAF 2.0 iterator. The change
	    from 2.1 to 2.2 is that the Qualifier field is no longer optional;
	    as no such check is done here, the 2.0 parser can be reused.

	    Raises ValueError if the first line is not a recognised version header.

	    Example: open, read, iterate and filter results.

	    Original data file has been trimmed to ~600 rows.

	    Original source ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/YEAST/goa_yeast.gaf.gz

	    >>> from Bio.UniProt.GOA import gafiterator, record_has
	    >>> Evidence = {'Evidence': set(['ND'])}
	    >>> Synonym = {'Synonym': set(['YA19A_YEAST', 'YAL019W-A'])}
	    >>> Taxon_ID = {'Taxon_ID': set(['taxon:559292'])}
	    >>> with open('UniProt/goa_yeast.gaf', 'r') as handle:
	    ...     for rec in gafiterator(handle):
	    ...         if record_has(rec, Taxon_ID) and record_has(rec, Evidence) and record_has(rec, Synonym):
	    ...             for key in ('DB_Object_Name', 'Evidence', 'Synonym', 'Taxon_ID'):
	    ...                 print(rec[key])
	    ...
	    Putative uncharacterized protein YAL019W-A
	    ND
	    ['YA19A_YEAST', 'YAL019W-A']
	    ['taxon:559292']
	    Putative uncharacterized protein YAL019W-A
	    ND
	    ['YA19A_YEAST', 'YAL019W-A']
	    ['taxon:559292']
	    Putative uncharacterized protein YAL019W-A
	    ND
	    ['YA19A_YEAST', 'YAL019W-A']
	    ['taxon:559292']

	    """
	    inline = handle.readline()
	    version = inline.strip()
	    if version in ("!gaf-version: 2.0", "!gaf-version: 2.1", "!gaf-version: 2.2"):
	        # TODO: dedicated handling for GAF 2.1 / 2.2.
	        return _gaf20iterator(handle)
	    if version == "!gaf-version: 1.0":
	        return _gaf10iterator(handle)
	    raise ValueError(f"Unknown GAF version {inline}\n")


	def writerec(outrec, handle, fields=GAF20FIELDS):
	    """Write a single UniProt-GOA record to an output stream.

	    Caller should know the format version. Default: gaf-2.0

	    outrec is a dict holding a value for every name in fields. Values
	    are written in the order given by fields, separated by tabs and
	    terminated by a newline; a value that is a list is written as its
	    items joined by "|". No header line is written.

	    Raises KeyError if outrec lacks one of the requested fields.
	    """
	    columns = []
	    for field in fields:
	        value = outrec[field]
	        # Multi-valued columns are held as lists and serialised with the
	        # "|" separator; this applies to the last column as well.
	        columns.append("|".join(value) if isinstance(value, list) else value)
	    handle.write("\t".join(columns) + "\n")


	def writebyproteinrec(outprotrec, handle, fields=GAF20FIELDS):
	    """Write a list of GAF records to an output stream.

	    Caller should know the format version. Default: gaf-2.0

	    Each record in outprotrec is written with writerec, using the same
	    handle and fields. Typically the list is one yielded by
	    gafbyproteiniterator, which contains all consecutive lines with the
	    same DB_Object_ID.
	    """
	    for outrec in outprotrec:
	        writerec(outrec, handle, fields=fields)


	def record_has(inrec, fieldvals):
	    """Accept a record, and a dictionary of field values.

	    The format is {'field_name': set([val1, val2])}.
	    If, for any field named in fieldvals, the record's value (or one of
	    its values, when the record holds a list) is in the corresponding
	    set, the function returns True. Otherwise, returns False.

	    Raises KeyError if a field named in fieldvals is missing from inrec.
	    """
	    for field, wanted in fieldvals.items():
	        value = inrec[field]
	        # A plain string is a single value; anything else is treated as a
	        # collection of values.
	        present = {value} if isinstance(value, str) else set(value)
	        if present & wanted:
	            return True
	    return False


	# Run this module's doctests when it is executed as a script.
	if __name__ == "__main__":
	    from Bio._utils import run_doctest

	    run_doctest(verbose=0)