Spaces:
No application file
No application file
# Copyright 2010-2011, 2013-2014, 2016-2018 by Peter Cock. All rights reserved. | |
# | |
# This file is part of the Biopython distribution and governed by your | |
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
# Please see the LICENSE file that should have been included as part of this | |
# package. | |
"""Provides code to access the TogoWS integrated webservices of DBCLS, Japan.
This module aims to make the TogoWS (from DBCLS, Japan) easier to use. See: | |
http://togows.dbcls.jp/ | |
The TogoWS REST service provides simple access to a range of databases, acting | |
as a proxy to shield you from all the different provider APIs. This works using | |
simple URLs (which this module will construct for you). For more details, see | |
http://togows.dbcls.jp/site/en/rest.html | |
The functionality is somewhat similar to Biopython's Bio.Entrez module which | |
provides access to the NCBI's Entrez Utilities (E-Utils) which also covers a | |
wide range of databases. | |
Currently TogoWS does not provide any usage guidelines (unlike the NCBI whose | |
requirements are reasonably clear). To avoid risking overloading the service, | |
Biopython will only allow three calls per second. | |
The TogoWS SOAP service offers a more complex API for calling web services | |
(essentially calling remote functions) provided by DDBJ, KEGG and PDBj. For | |
example, this allows you to run a remote BLAST search at the DDBJ. This is | |
not yet covered by this module, however there are lots of Python examples | |
on the TogoWS website using the SOAPpy python library. See: | |
http://togows.dbcls.jp/site/en/soap.html | |
http://soapy.sourceforge.net/ | |
""" | |
import io | |
import time | |
from urllib.request import urlopen | |
from urllib.parse import quote | |
# Root address of the TogoWS REST service; every request URL starts with it.
_BASE_URL = "http://togows.dbcls.jp"

# Caches of the capability lists reported by TogoWS, filled on first use.
# The two name lists stay None until they have been downloaded:
_entry_db_names = None
_search_db_names = None
# Per-database lists, keyed by database name:
_entry_db_fields = {}
_entry_db_formats = {}
# [input format, output format] pairs, empty until downloaded:
_convert_formats = []
def _get_fields(url):
    """Query a TogoWS URL for a plain text list of values (PRIVATE).

    Arguments:
     - url - full URL (string) to fetch via the module's _open function.

    Returns the whitespace separated tokens of the response as a list of
    strings (an empty list if the response holds only whitespace).
    """
    # The with-statement closes the handle even if reading fails part way.
    with _open(url) as handle:
        return handle.read().split()
def _get_entry_dbs():
    """Fetch the list of database names listed under /entry (PRIVATE)."""
    listing_url = f"{_BASE_URL}/entry"
    return _get_fields(listing_url)
def _get_entry_fields(db):
    """Fetch the list of field names listed for an entry database (PRIVATE)."""
    listing_url = f"{_BASE_URL}/entry/{db}?fields"
    return _get_fields(listing_url)
def _get_entry_formats(db):
    """Fetch the list of format names listed for an entry database (PRIVATE)."""
    listing_url = f"{_BASE_URL}/entry/{db}?formats"
    return _get_fields(listing_url)
def _get_convert_formats():
    """Fetch the listed conversions, each split on "." into a list (PRIVATE)."""
    listing = _get_fields(f"{_BASE_URL}/convert/")
    return [name.split(".") for name in listing]
def entry(db, id, format=None, field=None):
    """Call TogoWS 'entry' to fetch a record.

    Arguments:
     - db - database (string), see list below.
     - id - identifier (string) or several identifiers (either as a list or
       tuple of strings, or a single string with comma separators).
     - format - return data file format (string), options depend on the
       database e.g. "xml", "json", "gff", "fasta", "ttl" (RDF Turtle)
     - field - specific field from within the database record (string)
       e.g. "au" or "authors" for pubmed.

    At the time of writing, this includes the following::

        KEGG: compound, drug, enzyme, genes, glycan, orthology, reaction,
              module, pathway
        DDBj: ddbj, dad, pdb
        NCBI: nuccore, nucest, nucgss, nucleotide, protein, gene, onim,
              homologue, snp, mesh, pubmed
        EBI:  embl, uniprot, uniparc, uniref100, uniref90, uniref50

    For the current list, please see http://togows.dbcls.jp/entry/

    Returns the text handle opened on the constructed entry URL.

    Raises ValueError if the database is not in the list reported by
    TogoWS, or if a given field or format is not listed for that database.

    This function is essentially equivalent to the NCBI Entrez service
    EFetch, available in Biopython as Bio.Entrez.efetch(...), but that
    does not offer field extraction.
    """
    # Only the database name list is rebound here. The per-database field
    # and format caches are dictionaries updated in place, so they need no
    # global declaration.
    global _entry_db_names
    if _entry_db_names is None:
        _entry_db_names = _get_entry_dbs()
    if db not in _entry_db_names:
        raise ValueError(
            f"TogoWS entry fetch does not officially support database '{db}'."
        )
    if field:
        try:
            fields = _entry_db_fields[db]
        except KeyError:
            fields = _get_entry_fields(db)
            _entry_db_fields[db] = fields
        if db == "pubmed" and field == "ti" and "title" in fields:
            # Backwards compatibility fix for TogoWS change Nov/Dec 2013
            field = "title"
            import warnings

            warnings.warn(
                "TogoWS dropped 'pubmed' field alias 'ti', please use 'title' instead."
            )
        if field not in fields:
            raise ValueError(
                "TogoWS entry fetch does not explicitly support "
                f"field '{field}' for database '{db}'. "
                f"Only: {', '.join(sorted(fields))}"
            )
    if format:
        try:
            formats = _entry_db_formats[db]
        except KeyError:
            formats = _get_entry_formats(db)
            _entry_db_formats[db] = formats
        if format not in formats:
            raise ValueError(
                "TogoWS entry fetch does not explicitly support "
                f"format '{format}' for database '{db}'. "
                f"Only: {', '.join(sorted(formats))}"
            )

    # Several identifiers are sent as one comma separated string; accept a
    # tuple as well as a list so neither reaches quote() unjoined.
    if isinstance(id, (list, tuple)):
        id = ",".join(id)
    url = _BASE_URL + f"/entry/{db}/{quote(id)}"
    if field:
        url += "/" + field
    if format:
        url += "." + format
    return _open(url)
def search_count(db, query):
    """Call TogoWS search count to see how many matches a search gives.

    Arguments:
     - db - database (string), see http://togows.dbcls.jp/search
     - query - search term (string)

    Returns the number of matches as an integer.

    Raises ValueError if TogoWS returns no data, or data which cannot be
    parsed as an integer. A database missing from the list reported by
    TogoWS only triggers a warning.

    You could then use the count to download a large set of search results in
    batches using the offset and limit options to Bio.TogoWS.search(). In
    general however the Bio.TogoWS.search_iter() function is simpler to use.
    """
    global _search_db_names
    if _search_db_names is None:
        _search_db_names = _get_fields(_BASE_URL + "/search")
    if db not in _search_db_names:
        # TODO - Make this a ValueError? Right now despite the HTML website
        # claiming to, the "gene" or "ncbi-gene" don't work and are not listed.
        import warnings

        warnings.warn(
            "TogoWS search does not officially support database '%s'. "
            "See %s/search/ for options." % (db, _BASE_URL)
        )
    url = _BASE_URL + f"/search/{db}/{quote(query)}/count"
    # The with-statement closes the handle even if reading fails part way.
    with _open(url) as handle:
        data = handle.read()
    if not data:
        raise ValueError(f"TogoWS returned no data from URL {url}")
    try:
        return int(data.strip())
    except ValueError:
        raise ValueError(f"Expected an integer from URL {url}, got: {data!r}") from None
def search_iter(db, query, limit=None, batch=100):
    """Call TogoWS search iterating over the results (generator function).

    Arguments:
     - db - database (string), see http://togows.dbcls.jp/search
     - query - search term (string)
     - limit - optional upper bound on number of search results
     - batch - number of search results to pull back each time talk to
       TogoWS (currently limited to 100).

    You would use this function within a for loop, e.g.

    >>> from Bio import TogoWS
    >>> for id in TogoWS.search_iter("pubmed", "diabetes+human", limit=10):
    ...     print("PubMed ID: %s" %id) # maybe fetch data with entry?
    PubMed ID: ...

    Internally this first calls the Bio.TogoWS.search_count() and then
    uses Bio.TogoWS.search() to get the results in batches.

    Raises RuntimeError if a batch does not hold the requested number of
    results, or if results from the previous batch are returned again.
    """
    count = search_count(db, query)
    if not count:
        return
    # NOTE - We leave it to TogoWS to enforce any upper bound on each
    # batch, they currently return an HTTP 400 Bad Request if above 100.
    remain = count
    if limit is not None:
        remain = min(remain, limit)
    offset = 1  # They don't use zero based counting
    prev_ids = []  # Just cache the last batch for error checking
    while remain:
        batch = min(batch, remain)
        # Close each response once read, so that a long iteration does not
        # leave one open handle behind per batch.
        with search(db, query, offset, batch) as handle:
            ids = handle.read().strip().split()
        # Raised explicitly (not asserted) so the check survives python -O.
        if len(ids) != batch:
            raise RuntimeError("Got %i, expected %i" % (len(ids), batch))
        if ids == prev_ids:
            raise RuntimeError("Same search results for previous offset")
        for identifier in ids:
            if identifier in prev_ids:
                raise RuntimeError(f"Result {identifier} was in previous batch")
            yield identifier
        offset += batch
        remain -= batch
        prev_ids = ids
def search(db, query, offset=None, limit=None, format=None):
    """Call TogoWS search.

    This is a low level wrapper for the TogoWS search function, which
    can return results in several formats. In general, the search_iter
    function is more suitable for end users.

    Arguments:
     - db - database (string), see http://togows.dbcls.jp/search/
     - query - search term (string)
     - offset, limit - optional integers specifying which result to start
       from (1 based) and the number of results to return. Give both or
       neither.
     - format - return data file format (string), e.g. "json", "ttl" (RDF)
       By default plain text is returned, one result per line.

    Returns the text handle opened on the constructed search URL.

    Raises ValueError if only one of offset and limit is given, or if
    either is not an integer of at least one.

    At the time of writing, TogoWS applies a default count limit of 100
    search results, and this is an upper bound. To access more results,
    use the offset argument or the search_iter(...) function.

    TogoWS supports a long list of databases, including many from the NCBI
    (e.g. "ncbi-pubmed" or "pubmed", "ncbi-genbank" or "genbank", and
    "ncbi-taxonomy"), EBI (e.g. "ebi-embl" or "embl", "ebi-uniprot" or
    "uniprot", "ebi-go"), and KEGG (e.g. "kegg-compound" or "compound").
    For the current list, see http://togows.dbcls.jp/search/

    The NCBI provide the Entrez Search service (ESearch) which is similar,
    available in Biopython as the Bio.Entrez.esearch() function.

    See also the function Bio.TogoWS.search_count() which returns the number
    of matches found, and the Bio.TogoWS.search_iter() function which allows
    you to iterate over the search results (taking care of batching for you).
    """
    global _search_db_names
    if _search_db_names is None:
        _search_db_names = _get_fields(f"{_BASE_URL}/search")
    if db not in _search_db_names:
        # TODO - Make this a ValueError? Right now despite the HTML website
        # claiming to, the "gene" or "ncbi-gene" don't work and are not listed.
        import warnings

        warnings.warn(
            f"TogoWS search does not explicitly support database '{db}'. "
            f"See {_BASE_URL}/search/ for options."
        )

    url = f"{_BASE_URL}/search/{db}/{quote(query)}"

    # The result window is all-or-nothing: reject a lone offset or limit.
    if (offset is None) != (limit is None):
        raise ValueError("Expect BOTH offset AND limit to be provided (or neither)")

    if offset is not None:
        try:
            offset = int(offset)
        except ValueError:
            raise ValueError(
                f"Offset should be an integer (at least one), not {offset!r}"
            ) from None
        try:
            limit = int(limit)
        except ValueError:
            raise ValueError(
                f"Limit should be an integer (at least one), not {limit!r}"
            ) from None
        if offset <= 0:
            raise ValueError(f"Offset should be at least one, not {offset}")
        if limit <= 0:
            raise ValueError(f"Count should be at least one, not {limit}")
        url = f"{url}/{offset},{limit}"

    if format:
        url = f"{url}.{format}"
    return _open(url)
def convert(data, in_format, out_format):
    """Call TogoWS for file format conversion.

    Arguments:
     - data - string or handle containing input record(s)
     - in_format - string describing the input file format (e.g. "genbank")
     - out_format - string describing the requested output format
       (e.g. "fasta")

    Returns the text handle opened on the conversion URL, to which the
    data is passed as the request payload.

    Raises ValueError if the conversion is not in the list reported by
    TogoWS.

    For a list of supported conversions (e.g. "genbank" to "fasta"), see
    http://togows.dbcls.jp/convert/

    Note that Biopython has built in support for conversion of sequence and
    alignment file formats (functions Bio.SeqIO.convert and
    Bio.AlignIO.convert)
    """
    global _convert_formats
    if not _convert_formats:
        _convert_formats = _get_convert_formats()

    requested = [in_format, out_format]
    if requested not in _convert_formats:
        listing = "\n".join("%s -> %s" % tuple(pair) for pair in _convert_formats)
        raise ValueError(f"Unsupported conversion. Choose from:\n{listing}")

    # TODO - Should we just accept a string not a handle? What about a filename?
    try:
        # A handle: send its contents.
        payload = data.read()
    except AttributeError:
        # Anything without a usable read() is sent as given (a string).
        payload = data

    return _open(f"{_BASE_URL}/convert/{in_format}.{out_format}", post=payload)
def _open(url, post=None):
    """Open a text handle to a TogoWS URL, limiting the call rate (PRIVATE).

    Arguments:
     - url - the complete URL (string) to open.
     - post - optional payload (string or bytes). If given and non-empty
       it is sent as the request data, otherwise a plain request is made.

    Returns a UTF-8 decoding text handle wrapping the response, with the
    response's URL copied onto it as the ``url`` attribute.

    Open a handle to TogoWS, will raise an IOError if it encounters an error.

    In the absence of clear guidelines, this function enforces a limit of
    "up to three queries per second" to avoid abusing the TogoWS servers.
    """
    delay = 0.333333333  # one third of a second
    current = time.time()
    # Time still to wait so that calls are at least ``delay`` apart.
    wait = _open.previous + delay - current
    if wait > 0:
        time.sleep(wait)
        _open.previous = current + wait
    else:
        _open.previous = current

    if post:
        # urlopen needs bytes. Encode text, but pass through a payload which
        # is already bytes (e.g. read from a handle opened in binary mode).
        if isinstance(post, str):
            post = post.encode()
        handle = urlopen(url, post)
    else:
        handle = urlopen(url)

    # We now trust TogoWS to have set an HTTP error code, that
    # suffices for my current unit tests. Previously we would
    # examine the start of the data returned back.
    text_handle = io.TextIOWrapper(handle, encoding="UTF-8")
    text_handle.url = handle.url
    return text_handle


# Time of the most recent request. Zero means the first call never waits.
_open.previous = 0
if __name__ == "__main__":
    # Executed as a script: run this module's doctests with quiet output.
    from Bio._utils import run_doctest as _run_doctest

    _run_doctest(verbose=0)