Update mcp/ncbi.py
Browse files- mcp/ncbi.py +90 -29
mcp/ncbi.py
CHANGED
@@ -1,35 +1,96 @@
|
|
1 |
-
|
2 |
-
"""
|
3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
# ---------- Public helpers ----------
|
22 |
-
async def search_gene(term: str, retmax: int = 5) -> List[Dict]:
    """Look up Entrez Gene summary records matching *term*.

    Issues an ESearch request for at most *retmax* gene IDs and, when any
    are found, a single ESummary request for all of them.

    Returns:
        The values of the ESummary ``result`` mapping with the first entry
        dropped, or ``[]`` when the search yields no IDs.
    """
    search_params = {"db": "gene", "term": term, "retmode": "json", "retmax": retmax}
    found = await _get("esearch.fcgi", search_params)
    gene_ids = found["esearchresult"]["idlist"]
    if gene_ids:
        id_csv = ",".join(gene_ids)
        details = await _get("esummary.fcgi", {"db": "gene", "id": id_csv, "retmode": "json"})
        records = list(details["result"].values())
        return records[1:]  # first key is 'uids'
    return []
|
30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
async def get_mesh_definition(term: str) -> str:
|
32 |
-
"""Return MeSH term
|
33 |
-
|
34 |
-
|
35 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""MedGenesis – NCBI E‑utilities helper (async, cached).
|
3 |
+
|
4 |
+
Supports:
|
5 |
+
• `search_gene(term)` → quick gene symbol/name hits via ESearch + ESummary
|
6 |
+
• `get_mesh_definition(term)`→ first MeSH definition string via ESummary
|
7 |
+
|
8 |
+
New features
|
9 |
+
~~~~~~~~~~~~
|
10 |
+
* Central `_request()` with exponential‑backoff retry (waits 2 s, then doubles the delay after each failed attempt).
|
11 |
+
* 12‑hour LRU caches for both public helpers (API quota‑friendly).
|
12 |
+
* Respects optional `BIO_KEY` env to boost rate limits.
|
13 |
+
* Handles single‑item edge cases (ESummary returns dict not list).
|
14 |
"""
|
15 |
+
from __future__ import annotations
|
16 |
+
|
17 |
+
import os, asyncio, httpx, xmltodict
|
18 |
+
from functools import lru_cache
|
19 |
+
from typing import List, Dict, Any
|
20 |
+
|
21 |
+
_API_KEY = os.getenv("BIO_KEY") # optional but raises quota if set
|
22 |
+
_BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
|
23 |
+
_TIMEOUT = 15
|
24 |
+
|
25 |
+
# ---------------------------------------------------------------------
|
26 |
+
# Internal request helper with retry
|
27 |
+
# ---------------------------------------------------------------------
|
28 |
+
async def _request(endpoint: str, params: Dict[str, Any], *, retries: int = 3) -> httpx.Response:
    """GET ``_BASE + endpoint`` with retries and exponential backoff.

    Args:
        endpoint: E-utilities script name appended to ``_BASE``
            (e.g. ``"esearch.fcgi"``).
        params: Query parameters. The mapping is copied, so the caller's
            dict is never modified; ``api_key`` is added to the copy when
            ``_API_KEY`` is set.
        retries: Total number of attempts. Values below 1 are treated as 1.

    Returns:
        The first response with status 200, or the final attempt's response
        if ``raise_for_status()`` does not reject it.

    Raises:
        httpx.HTTPStatusError: The final attempt returned an error status.
        httpx.TransportError: The final attempt failed at the transport
            level (timeout, connection error, ...).
    """
    # Work on a copy: injecting the API key must not leak into the caller's dict.
    query = dict(params)
    if _API_KEY:
        query["api_key"] = _API_KEY

    attempts = max(1, retries)
    url = f"{_BASE}{endpoint}"
    delay = 2

    # One client for all attempts so the connection pool is reused.
    async with httpx.AsyncClient(timeout=_TIMEOUT) as cli:
        for attempt in range(attempts):
            is_last = attempt == attempts - 1
            try:
                resp = await cli.get(url, params=query)
            except httpx.TransportError:
                # Transient network failures are retried like bad statuses.
                if is_last:
                    raise
            else:
                if resp.status_code == 200:
                    return resp
                if is_last:
                    # Out of attempts: surface the error without a pointless
                    # final sleep; hand back the response if it isn't one.
                    resp.raise_for_status()
                    return resp
            await asyncio.sleep(delay)
            delay *= 2
|
41 |
|
42 |
+
|
43 |
+
# ---------------------------------------------------------------------
|
44 |
+
# Gene search (ESearch → ESummary) – cached 12 h
|
45 |
+
# ---------------------------------------------------------------------
|
46 |
+
# Result cache for search_gene: (term, retmax) -> (expiry, gene records).
# functools.lru_cache cannot be used on a coroutine function: it would cache
# the coroutine object, and awaiting that a second time raises RuntimeError.
_GENE_CACHE: Dict[Any, Any] = {}
_GENE_CACHE_TTL = 12 * 60 * 60  # seconds
_GENE_CACHE_MAX = 512


async def search_gene(term: str, *, retmax: int = 5) -> List[Dict]:
    """Return list of gene summary dicts for *term* (Entrez Gene db).

    Runs ESearch to resolve *term* to at most *retmax* gene IDs, then one
    ESummary call for those IDs. Results (including empty ones) are kept in
    an in-process cache for 12 hours, bounded to 512 entries.

    Args:
        term: Search expression passed to ESearch.
        retmax: Maximum number of IDs requested from ESearch.

    Returns:
        A new list holding the ESummary ``result`` entries other than
        ``uids``; ``[]`` when nothing matches. The record dicts themselves
        are shared with the cache, so treat them as read-only.

    Raises:
        Whatever ``_request`` raises when a request ultimately fails.
    """
    import time  # function-scope: the module's import block does not provide it

    key = (term, retmax)
    now = time.monotonic()
    cached = _GENE_CACHE.get(key)
    if cached is not None and cached[0] > now:
        return list(cached[1])

    es_params = {
        "db": "gene",
        "term": term,
        "retmode": "json",
        "retmax": retmax,
    }
    es_resp = await _request("esearch.fcgi", es_params)
    ids = es_resp.json().get("esearchresult", {}).get("idlist", [])

    genes: List[Dict] = []
    if ids:
        sum_params = {"db": "gene", "id": ",".join(ids), "retmode": "json"}
        sum_resp = await _request("esummary.fcgi", sum_params)
        data = sum_resp.json().get("result", {})
        # 'uids' is the index of returned IDs, not a gene record; skip it.
        genes = [v for k, v in data.items() if k != "uids"]

    # Re-insert at the end and evict the oldest entries once the bound is hit.
    _GENE_CACHE.pop(key, None)
    while len(_GENE_CACHE) >= _GENE_CACHE_MAX:
        del _GENE_CACHE[next(iter(_GENE_CACHE))]
    _GENE_CACHE[key] = (now + _GENE_CACHE_TTL, genes)
    return list(genes)
|
65 |
+
|
66 |
+
|
67 |
+
# ---------------------------------------------------------------------
|
68 |
+
# MeSH definition – cached 12 h
|
69 |
+
# ---------------------------------------------------------------------
|
70 |
+
# Result cache for get_mesh_definition: term -> (expiry, definition).
# functools.lru_cache cannot be used on a coroutine function: it would cache
# the coroutine object, and awaiting that a second time raises RuntimeError.
_MESH_CACHE: Dict[Any, Any] = {}
_MESH_CACHE_TTL = 12 * 60 * 60  # seconds
_MESH_CACHE_MAX = 512


async def get_mesh_definition(term: str) -> str:
    """Return first MeSH definition string for *term* or ''.

    ESummary looks records up by UID, so *term* is first resolved to a MeSH
    UID with ESearch and that UID is then summarised. Results (including
    empty ones) are kept in an in-process cache for 12 hours, bounded to
    512 entries.

    Args:
        term: Search expression passed to ESearch against the ``mesh`` db.

    Returns:
        The record's scope note when present, otherwise its first
        ``ds_meshterms`` entry, otherwise ``""``.

    Raises:
        Whatever ``_request`` raises when a request ultimately fails.
    """
    import time  # function-scope: the module's import block does not provide it

    now = time.monotonic()
    cached = _MESH_CACHE.get(term)
    if cached is not None and cached[0] > now:
        return cached[1]

    search_params = {
        "db": "mesh",
        "term": term,
        "retmode": "json",
        "retmax": 1,
    }
    search_resp = await _request("esearch.fcgi", search_params)
    ids = search_resp.json().get("esearchresult", {}).get("idlist", [])

    definition = ""
    if ids:
        sum_params = {"db": "mesh", "id": ids[0], "retmode": "json"}
        sum_resp = await _request("esummary.fcgi", sum_params)
        data = sum_resp.json().get("result", {})
        recs = [v for k, v in data.items() if k != "uids"]
        if recs:
            record = recs[0]
            # NOTE(review): 'ds_scopenote' is expected to carry the definition
            # text; confirm against a live response. The previous behaviour
            # (first 'ds_meshterms' entry) is kept as the fallback.
            definition = record.get("ds_scopenote") or ""
            if not definition:
                mesh_terms = record.get("ds_meshterms") or [""]
                definition = mesh_terms[0]

    # Re-insert at the end and evict the oldest entries once the bound is hit.
    _MESH_CACHE.pop(term, None)
    while len(_MESH_CACHE) >= _MESH_CACHE_MAX:
        del _MESH_CACHE[next(iter(_MESH_CACHE))]
    _MESH_CACHE[term] = (now + _MESH_CACHE_TTL, definition)
    return definition
|
85 |
+
|
86 |
+
|
87 |
+
# ---------------------------------------------------------------------
|
88 |
+
# CLI demo
|
89 |
+
# ---------------------------------------------------------------------
|
90 |
+
if __name__ == "__main__":
    async def _demo():
        """Call both public helpers once and print a short summary of each."""
        genes = await search_gene("TP53", retmax=3)
        print(f"Gene hits: {len(genes)} – {genes[0]['name'] if genes else 'None'}")

        definition = await get_mesh_definition("glioblastoma")
        # Only the first 80 characters are shown.
        print("MeSH def:", definition[:80], "…")

    asyncio.run(_demo())
|