aakash0017's picture
Upload folder using huggingface_hub
b7731cd
# Copyright 1999 by Jeffrey Chang. All rights reserved.
# Copyright 2000 by Jeffrey Chang. All rights reserved.
# Revisions Copyright 2007 by Peter Cock. All rights reserved.
# Revisions Copyright 2009 by Michiel de Hoon. All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
"""Parser for the prosite dat file from Prosite at ExPASy.
See https://www.expasy.org/prosite/
Tested with:
- Release 20.43, 10-Feb-2009
- Release 2017_03 of 15-Mar-2017.
Functions:
- read Reads a Prosite file containing one Prosite record
- parse Iterates over records in a Prosite file.
Classes:
- Record Holds Prosite data.
"""
def parse(handle):
"""Parse Prosite records.
This function is for parsing Prosite files containing multiple
records.
Arguments:
- handle - handle to the file.
"""
while True:
record = __read(handle)
if not record:
break
yield record
def read(handle):
"""Read one Prosite record.
This function is for parsing Prosite files containing
exactly one record.
Arguments:
- handle - handle to the file.
"""
record = __read(handle)
# We should have reached the end of the record by now
remainder = handle.read()
if remainder:
raise ValueError("More than one Prosite record found")
return record
class Record:
"""Holds information from a Prosite record.
Main attributes:
- name ID of the record. e.g. ADH_ZINC
- type Type of entry. e.g. PATTERN, MATRIX, or RULE
- accession e.g. PS00387
- created Date the entry was created. (MMM-YYYY for releases
before January 2017, DD-MMM-YYYY since January 2017)
- data_update Date the 'primary' data was last updated.
- info_update Date data other than 'primary' data was last updated.
- pdoc ID of the PROSITE DOCumentation.
- description Free-format description.
- pattern The PROSITE pattern. See docs.
- matrix List of strings that describes a matrix entry.
- rules List of rule definitions (from RU lines). (strings)
- prorules List of prorules (from PR lines). (strings)
NUMERICAL RESULTS:
- nr_sp_release SwissProt release.
- nr_sp_seqs Number of seqs in that release of Swiss-Prot. (int)
- nr_total Number of hits in Swiss-Prot. tuple of (hits, seqs)
- nr_positive True positives. tuple of (hits, seqs)
- nr_unknown Could be positives. tuple of (hits, seqs)
- nr_false_pos False positives. tuple of (hits, seqs)
- nr_false_neg False negatives. (int)
- nr_partial False negatives, because they are fragments. (int)
COMMENTS:
- cc_taxo_range Taxonomic range. See docs for format
- cc_max_repeat Maximum number of repetitions in a protein
- cc_site Interesting site. list of tuples (pattern pos, desc.)
- cc_skip_flag Can this entry be ignored?
- cc_matrix_type
- cc_scaling_db
- cc_author
- cc_ft_key
- cc_ft_desc
- cc_version version number (introduced in release 19.0)
The following are all lists if tuples (swiss-prot accession, swiss-prot name).
DATA BANK REFERENCES:
- dr_positive
- dr_false_neg
- dr_false_pos
- dr_potential Potential hits, but fingerprint region not yet available.
- dr_unknown Could possibly belong
- pdb_structs List of PDB entries.
"""
def __init__(self):
"""Initialize the class."""
self.name = ""
self.type = ""
self.accession = ""
self.created = ""
self.data_update = ""
self.info_update = ""
self.pdoc = ""
self.description = ""
self.pattern = ""
self.matrix = []
self.rules = []
self.prorules = []
self.postprocessing = []
self.nr_sp_release = ""
self.nr_sp_seqs = ""
self.nr_total = (None, None)
self.nr_positive = (None, None)
self.nr_unknown = (None, None)
self.nr_false_pos = (None, None)
self.nr_false_neg = None
self.nr_partial = None
self.cc_taxo_range = ""
self.cc_max_repeat = ""
self.cc_site = []
self.cc_skip_flag = ""
self.dr_positive = []
self.dr_false_neg = []
self.dr_false_pos = []
self.dr_potential = []
self.dr_unknown = []
self.pdb_structs = []
# Everything below are private functions
def __read(handle):
import re
record = None
for line in handle:
keyword, value = line[:2], line[5:].rstrip()
if keyword == "ID":
record = Record()
cols = value.split("; ")
if len(cols) != 2:
raise ValueError(f"I don't understand identification line\n{line}")
record.name = cols[0]
record.type = cols[1].rstrip(".") # don't want '.'
elif keyword == "AC":
record.accession = value.rstrip(";")
elif keyword == "DT":
# e.g. from January 2017,
# DT 01-APR-1990 CREATED; 01-APR-1990 DATA UPDATE; 01-APR-1990 INFO UPDATE.
# Older files had brackets round the date descriptions and used MMM-YYYY
dates = value.rstrip(".").split("; ")
if dates[0].endswith((" (CREATED)", " CREATED")):
# Remove last word
record.created = dates[0].rsplit(" ", 1)[0]
else:
raise ValueError(f"I don't understand date line\n{line}")
if dates[1].endswith((" (DATA UPDATE)", " DATA UPDATE")):
# Remove last two words
record.data_update = dates[1].rsplit(" ", 2)[0]
else:
raise ValueError(f"I don't understand date line\n{line}")
if dates[2].endswith((" (INFO UPDATE)", " INFO UPDATE")):
# Remove last two words
record.info_update = dates[2].rsplit(" ", 2)[0]
else:
raise ValueError(f"I don't understand date line\n{line}")
elif keyword == "DE":
record.description = value
elif keyword == "PA":
record.pattern += value
elif keyword == "MA":
record.matrix.append(value)
elif keyword == "PP":
record.postprocessing.extend(value.split(";"))
elif keyword == "RU":
record.rules.append(value)
elif keyword == "NR":
cols = value.split(";")
for col in cols:
if not col:
continue
qual, data = (word.lstrip() for word in col.split("="))
if qual == "/RELEASE":
release, seqs = data.split(",")
record.nr_sp_release = release
record.nr_sp_seqs = int(seqs)
elif qual == "/FALSE_NEG":
record.nr_false_neg = int(data)
elif qual == "/PARTIAL":
record.nr_partial = int(data)
elif qual in ["/TOTAL", "/POSITIVE", "/UNKNOWN", "/FALSE_POS"]:
m = re.match(r"(\d+)\((\d+)\)", data)
if not m:
raise Exception(f"Broken data {data} in comment line\n{line!r}")
hits = tuple(map(int, m.groups()))
if qual == "/TOTAL":
record.nr_total = hits
elif qual == "/POSITIVE":
record.nr_positive = hits
elif qual == "/UNKNOWN":
record.nr_unknown = hits
elif qual == "/FALSE_POS":
record.nr_false_pos = hits
else:
raise ValueError(f"Unknown qual {qual} in comment line\n{line!r}")
elif keyword == "CC":
# Expect CC lines like this:
# CC /TAXO-RANGE=??EPV; /MAX-REPEAT=2;
# Can (normally) split on ";" and then on "="
cols = value.split(";")
for col in cols:
if not col or col[:17] == "Automatic scaling":
# DNAJ_2 in Release 15 has a non-standard comment line:
# CC Automatic scaling using reversed database
# Throw it away. (Should I keep it?)
continue
if col.count("=") == 0:
# Missing qualifier! Can we recover gracefully?
# For example, from Bug 2403, in PS50293 have:
# CC /AUTHOR=K_Hofmann; N_Hulo
continue
qual, data = (word.lstrip() for word in col.split("="))
if qual == "/TAXO-RANGE":
record.cc_taxo_range = data
elif qual == "/MAX-REPEAT":
record.cc_max_repeat = data
elif qual == "/SITE":
pos, desc = data.split(",")
record.cc_site.append((int(pos), desc))
elif qual == "/SKIP-FLAG":
record.cc_skip_flag = data
elif qual == "/MATRIX_TYPE":
record.cc_matrix_type = data
elif qual == "/SCALING_DB":
record.cc_scaling_db = data
elif qual == "/AUTHOR":
record.cc_author = data
elif qual == "/FT_KEY":
record.cc_ft_key = data
elif qual == "/FT_DESC":
record.cc_ft_desc = data
elif qual == "/VERSION":
record.cc_version = data
else:
raise ValueError(f"Unknown qual {qual} in comment line\n{line!r}")
elif keyword == "DR":
refs = value.split(";")
for ref in refs:
if not ref:
continue
acc, name, type = (word.strip() for word in ref.split(","))
if type == "T":
record.dr_positive.append((acc, name))
elif type == "F":
record.dr_false_pos.append((acc, name))
elif type == "N":
record.dr_false_neg.append((acc, name))
elif type == "P":
record.dr_potential.append((acc, name))
elif type == "?":
record.dr_unknown.append((acc, name))
else:
raise ValueError(f"I don't understand type flag {type}")
elif keyword == "3D":
cols = value.split()
for id in cols:
record.pdb_structs.append(id.rstrip(";"))
elif keyword == "PR":
rules = value.split(";")
record.prorules.extend(rules)
elif keyword == "DO":
record.pdoc = value.rstrip(";")
elif keyword == "//":
if not record:
# Then this was the copyright statement
continue
break
else:
raise ValueError(f"Unknown keyword {keyword} found")
else:
return
if not record:
raise ValueError("Unexpected end of stream.")
return record