File size: 10,967 Bytes
b7731cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
# Copyright 2014 by Kevin Wu.
# Revisions copyright 2014 by Peter Cock.
# All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.

"""Provides code to access the REST-style KEGG online API.

This module aims to make the KEGG online REST-style API easier to use. See:
https://www.kegg.jp/kegg/rest/keggapi.html

The KEGG REST-style API provides simple access to a range of KEGG databases.
This works using simple URLs (which this module will construct for you),
with any errors indicated via HTTP error levels.

The functionality is somewhat similar to Biopython's Bio.TogoWS and Bio.Entrez
modules.

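Currently KEGG does not provide any usage guidelines (unlike the NCBI whose
requirements are reasonably clear). Note that the functions in this module
send each request immediately and do not limit the call rate themselves, so
to avoid risking overloading the service callers should space out their own
requests (for example, to no more than three calls per second).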

References:
Kanehisa, M. and Goto, S.; KEGG: Kyoto Encyclopedia of Genes and Genomes.
Nucleic Acids Res. 28, 29-34 (2000).

"""

import io
from urllib.request import urlopen


def _q(op, arg1, arg2=None, arg3=None):
    """Open a KEGG REST URL assembled from the given path components.

    The request path always starts with ``op/arg1``. ``arg2`` is appended
    when it is truthy, and ``arg3`` is appended only when both ``arg2`` and
    ``arg3`` are truthy.

    Returns the raw (binary) response object when ``arg2`` is "image".
    Otherwise returns a UTF-8 text wrapper around the response, with the
    response's ``url`` copied onto the wrapper as a ``url`` attribute.
    """
    path = f"{op}/{arg1}"
    if arg2:
        path = f"{path}/{arg2}"
        # A third component is only ever used together with a second one.
        if arg3:
            path = f"{path}/{arg3}"
    response = urlopen("https://rest.kegg.jp/%s" % path)

    # Image data is binary, so it must not be wrapped in a text decoder.
    if arg2 == "image":
        return response

    text_handle = io.TextIOWrapper(response, encoding="UTF-8")
    text_handle.url = response.url
    return text_handle


# https://www.kegg.jp/kegg/rest/keggapi.html
def kegg_info(database):
    """KEGG info - Displays the current statistics of a given database.

    database - database or organism (string)

    The database argument can be a KEGG database name (e.g. 'pathway' or
    its official abbreviation, 'path'), or a KEGG organism code or T number
    (e.g. 'hsa' or 'T01001' for human).

    A valid list of organism codes and their T numbers can be obtained
    via kegg_info('organism') or https://rest.kegg.jp/list/organism

    Returns the handle produced for the "info/<database>" request.
    """
    # TODO - return a string (rather than the handle?)
    # TODO - cache and validate the organism code / T numbers?
    # TODO - can we parse the somewhat formatted output?
    #
    # https://rest.kegg.jp/info/<database>
    #
    # <database> = pathway | brite | module | disease | drug | environ |
    #              ko | genome |<org> | compound | glycan | reaction |
    #              rpair | rclass | enzyme | genomes | genes | ligand | kegg
    # <org> = KEGG organism code or T number
    handle = _q("info", database)
    return handle


def kegg_list(database, org=None):
    """KEGG list - Entry list for database, or specified database entries.

    database - database or organism (string), or a list of database
               entries (at most 100) which are joined with "+".
    org - optional organism (string), see below.

    For the pathway and module databases the optional organism can be
    used to restrict the results.

    Raises ValueError if an organism is given together with any other
    non-empty database string, or if more than 100 entries are listed.

    Returns the handle for the request.
    """
    # TODO - split into two functions (dbentries seems separate)?
    #
    #  https://rest.kegg.jp/list/<database>/<org>
    #
    #  <database> = pathway | module
    #  <org> = KEGG organism code
    accepts_org = database in ("pathway", "module")
    if accepts_org and org:
        return _q("list", database, org)

    # Any other named database cannot be combined with an organism.
    if isinstance(database, str) and database and org:
        raise ValueError("Invalid database arg for kegg list request.")

    # https://rest.kegg.jp/list/<database>
    #
    # <database> = pathway | brite | module | disease | drug | environ |
    #              ko | genome | <org> | compound | glycan | reaction |
    #              rpair | rclass | enzyme | organism
    # <org> = KEGG organism code or T number
    #
    #
    # https://rest.kegg.jp/list/<dbentries>
    #
    # <dbentries> = KEGG database entries involving the following <database>
    # <database> = pathway | brite | module | disease | drug | environ |
    #              ko | genome | <org> | compound | glycan | reaction |
    #              rpair | rclass | enzyme
    # <org> = KEGG organism code or T number
    if isinstance(database, list):
        if len(database) > 100:
            raise ValueError("Maximum number of databases is 100 for kegg list query")
        database = "+".join(database)

    return _q("list", database)


def kegg_find(database, query, option=None):
    """KEGG find - Data search.

    Finds entries with matching query keywords or other query data in
    a given database.

    database - database or organism (string)
    query - search terms (string, or list of strings which are joined
            with "+" when no option is given)
    option - search option (string), see below.

    For the compound and drug database, set option to the string 'formula',
    'exact_mass' or 'mol_weight' to search on that field only. The
    chemical formula search is a partial match irrespective of the order
    of atoms given. The exact mass (or molecular weight) is checked by
    rounding off to the same decimal place as the query data. A range of
    values may also be specified with the minus(-) sign.

    Raises ValueError for any other truthy option, or for an option used
    with a database other than compound or drug.

    Returns the handle for the request.
    """
    # TODO - return list of tuples?
    #
    # https://rest.kegg.jp/find/<database>/<query>/<option>
    #
    # <database> = compound | drug
    # <option> = formula | exact_mass | mol_weight
    is_field_search = database in ["compound", "drug"] and option in [
        "formula",
        "exact_mass",
        "mol_weight",
    ]
    if is_field_search:
        return _q("find", database, query, option)

    if option:
        raise ValueError("Invalid option arg for kegg find request.")

    # https://rest.kegg.jp/find/<database>/<query>
    #
    # <database> = pathway | module | disease | drug | environ | ko |
    #              genome | <org> | compound | glycan | reaction | rpair |
    #              rclass | enzyme | genes | ligand
    # <org> = KEGG organism code or T number
    keywords = "+".join(query) if isinstance(query, list) else query
    return _q("find", database, keywords)


def kegg_get(dbentries, option=None):
    """KEGG get - Data retrieval.

    dbentries - Identifiers (single string, or list of strings), see below.
    option - One of "aaseq", "ntseq", "mol", "kcf", "image", "kgml",
             "json" (string)

    The input is limited up to 10 entries.
    The input is limited to one pathway entry with the image or kgml option.
    The input is limited to one compound/glycan/drug entry with the image option.

    Raises ValueError if more than 10 entries are listed or if the option
    is not one of the accepted values.

    Returns a handle.
    """
    if isinstance(dbentries, list):
        if len(dbentries) > 10:
            raise ValueError("Maximum number of dbentries is 10 for kegg get query")
        # Multiple entries are sent as a single "+"-separated path component.
        dbentries = "+".join(dbentries)

    # https://rest.kegg.jp/get/<dbentries>[/<option>]
    #
    # <dbentries> = KEGG database entries involving the following <database>
    # <database> = pathway | brite | module | disease | drug | environ |
    #              ko | genome | <org> | compound | glycan | reaction |
    #              rpair | rclass | enzyme
    # <org> = KEGG organism code or T number
    #
    # <option> = aaseq | ntseq | mol | kcf | image | kgml | json
    accepted_options = ("aaseq", "ntseq", "mol", "kcf", "image", "kgml", "json")
    if option in accepted_options:
        return _q("get", dbentries, option)

    if option:
        raise ValueError("Invalid option arg for kegg get request.")

    return _q("get", dbentries)


def kegg_conv(target_db, source_db, option=None):
    """KEGG conv - convert KEGG identifiers to/from outside identifiers.

    Arguments:
     - target_db - Target database
     - source_db - source database or database entries (a string, or a
       list of strings which are joined with "+")
     - option - Can be "turtle" or "n-triple" (string).

    Raises ValueError if the option is not recognised, or if the
    target_db / source_db combination is not one of the supported
    conversions described below.

    Returns the handle for the request.
    """
    # https://rest.kegg.jp/conv/<target_db>/<source_db>[/<option>]
    #
    # (<target_db> <source_db>) = (<kegg_db> <outside_db>) |
    #                             (<outside_db> <kegg_db>)
    #
    # For gene identifiers:
    # <kegg_db> = <org>
    # <org> = KEGG organism code or T number
    # <outside_db> = ncbi-gi | ncbi-geneid | uniprot
    #
    # For chemical substance identifiers:
    # <kegg_db> = drug | compound | glycan
    # <outside_db> = pubchem | chebi
    #
    # <option> = turtle | n-triple
    #
    # https://rest.kegg.jp/conv/<target_db>/<dbentries>[/<option>]
    #
    # For gene identifiers:
    # <dbentries> = database entries involving the following <database>
    # <database> = <org> | ncbi-gi | ncbi-geneid | uniprot
    # <org> = KEGG organism code or T number
    #
    # For chemical substance identifiers:
    # <dbentries> = database entries involving the following <database>
    # <database> = drug | compound | glycan | pubchem | chebi
    #
    # <option> = turtle | n-triple
    if option and option not in ["turtle", "n-triple"]:
        raise ValueError("Invalid option arg for kegg conv request.")

    if isinstance(source_db, list):
        source_db = "+".join(source_db)

    gene_outside_dbs = ("ncbi-gi", "ncbi-geneid", "uniprot")
    chemical_kegg_dbs = ("drug", "compound", "glycan")
    # The outside chemical databases are PubChem and ChEBI; "glycan" is a
    # KEGG database and must not be treated as an outside one.
    chemical_outside_dbs = ("pubchem", "chebi")

    supported = (
        target_db in gene_outside_dbs
        or source_db in gene_outside_dbs
        or (target_db in chemical_kegg_dbs and source_db in chemical_outside_dbs)
        or (target_db in chemical_outside_dbs and source_db in chemical_kegg_dbs)
    )
    if not supported:
        raise ValueError("Bad argument target_db or source_db for kegg conv request.")

    if option:
        return _q("conv", target_db, source_db, option)
    return _q("conv", target_db, source_db)


def kegg_link(target_db, source_db, option=None):
    """KEGG link - find related entries by using database cross-references.

    target_db - Target database
    source_db - source database or database entries (a string, or a list
                of strings which are joined with "+")
    option - Can be "turtle" or "n-triple" (string).

    Raises ValueError if the option is not recognised.

    Returns the handle for the request.
    """
    # https://rest.kegg.jp/link/<target_db>/<source_db>[/<option>]
    #
    # <target_db> = <database>
    # <source_db> = <database>
    #
    # <database> = pathway | brite | module | ko | genome | <org> | compound |
    #              glycan | reaction | rpair | rclass | enzyme | disease |
    #              drug | dgroup | environ
    #
    # <option> = turtle | n-triple
    # https://rest.kegg.jp/link/<target_db>/<dbentries>[/<option>]
    #
    # <dbentries> = KEGG database entries involving the following <database>
    # <database> = pathway | brite | module | ko | genome | <org> | compound |
    #              glycan | reaction | rpair | rclass | enzyme | disease |
    #              drug | dgroup | environ | genes
    #
    # <option> = turtle | n-triple

    if option and option not in ["turtle", "n-triple"]:
        # Name the operation that actually failed (link, not conv).
        raise ValueError("Invalid option arg for kegg link request.")

    if isinstance(source_db, list):
        source_db = "+".join(source_db)

    if option:
        return _q("link", target_db, source_db, option)
    return _q("link", target_db, source_db)