mgbam committed on
Commit
f65e3d6
·
verified ·
1 Parent(s): 1974999

Update mcp/ncbi.py

Browse files
Files changed (1) hide show
  1. mcp/ncbi.py +90 -29
mcp/ncbi.py CHANGED
@@ -1,35 +1,96 @@
1
- # mcp/ncbi.py
2
- """
3
- NCBI E-utilities helpers – Gene, Protein, ClinVar, MeSH.
 
 
 
 
 
 
 
 
 
 
4
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
- import os
7
- import httpx
8
- from typing import List, Dict
9
-
10
- NCBI_KEY = os.getenv("BIO_KEY") # optional but increases rate limits
11
- BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
12
-
13
- async def _get(endpoint: str, params: Dict) -> Dict:
14
- if NCBI_KEY:
15
- params["api_key"] = NCBI_KEY
16
- async with httpx.AsyncClient(timeout=20) as client:
17
- r = await client.get(f"{BASE}{endpoint}", params=params)
18
- r.raise_for_status()
19
- return r.json() if r.headers["Content-Type"].startswith("application/json") else r.text
20
-
21
- # ---------- Public helpers ----------
22
- async def search_gene(term: str, retmax: int = 5) -> List[Dict]:
23
- """Return basic gene info (ID + name/symbol) by search term."""
24
- data = await _get("esearch.fcgi", {"db": "gene", "term": term, "retmode": "json", "retmax": retmax})
25
- ids = data["esearchresult"]["idlist"]
26
  if not ids:
27
  return []
28
- summary = await _get("esummary.fcgi", {"db": "gene", "id": ",".join(ids), "retmode": "json"})
29
- return list(summary["result"].values())[1:] # first key is 'uids'
30
 
 
 
 
 
 
 
 
 
 
 
 
31
  async def get_mesh_definition(term: str) -> str:
32
- """Return MeSH term definition (first record)."""
33
- text = await _get("esummary.fcgi", {"db": "mesh", "term": term, "retmode": "json", "retmax": 1})
34
- recs = list(text["result"].values())[1:]
35
- return recs[0].get("ds_meshterms", [""])[0] if recs else ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """MedGenesis – NCBI E‑utilities helper (async, cached).
3
+
4
+ Supports:
5
+ • `search_gene(term)` → quick gene symbol/name hits via ESearch + ESummary
6
+ • `get_mesh_definition(term)`→ first MeSH definition string via ESummary
7
+
8
+ New features
9
+ ~~~~~~~~~~~~
10
+ * Central `_request()` with exponential‑backoff retry (2×/4×).
11
+ * 12‑hour LRU caches for both public helpers (API quota‑friendly).
12
+ * Respects optional `BIO_KEY` env to boost rate limits.
13
+ * Handles single‑item edge cases (ESummary returns dict not list).
14
  """
15
+ from __future__ import annotations
16
+
17
+ import os, asyncio, httpx, xmltodict
18
+ from functools import lru_cache
19
+ from typing import List, Dict, Any
20
+
21
_API_KEY = os.getenv("BIO_KEY")  # optional; when set it is sent as ``api_key`` to raise the NCBI quota
_BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
_TIMEOUT = 15  # handed to httpx.AsyncClient(timeout=...)

# ---------------------------------------------------------------------
# Internal request helper with retry
# ---------------------------------------------------------------------
async def _request(endpoint: str, params: Dict[str, Any], *, retries: int = 3) -> httpx.Response:
    """GET ``_BASE + endpoint`` and return the first HTTP 200 response.

    Args:
        endpoint: E-utilities script name appended to ``_BASE``
            (for example ``"esearch.fcgi"``).
        params: Query parameters. The mapping is copied, so the caller's
            dict is never modified (``api_key`` is added to the copy only).
        retries: Total number of attempts; must be at least 1.

    Returns:
        The ``httpx.Response`` of the first attempt that answered 200. If
        every attempt answered something else and ``raise_for_status()``
        does not treat that status as an error, the last response is
        returned.

    Raises:
        ValueError: If ``retries`` is smaller than 1.
        httpx.HTTPStatusError: If the final attempt ended with an error
            status.
    """
    if retries < 1:
        raise ValueError("retries must be >= 1")

    # Work on a copy: callers may reuse their dict, and the API key should
    # not leak into it.
    query = dict(params)
    if _API_KEY:
        query["api_key"] = _API_KEY

    url = f"{_BASE}{endpoint}"
    delay = 2
    # One client for all attempts instead of a new connection pool per try.
    async with httpx.AsyncClient(timeout=_TIMEOUT) as cli:
        for attempt in range(1, retries + 1):
            resp = await cli.get(url, params=query)
            if resp.status_code == 200:
                return resp
            # Back off (2 s, 4 s, ...) only when another attempt follows;
            # sleeping after the last failure would just delay the error.
            if attempt < retries:
                await asyncio.sleep(delay)
                delay *= 2

    resp.raise_for_status()
    # Non-200 status that raise_for_status() accepts: hand the response back
    # rather than implicitly returning None.
    return resp
41
 
42
+
43
+ # ---------------------------------------------------------------------
44
+ # Gene search (ESearch → ESummary) – cached 12 h
45
+ # ---------------------------------------------------------------------
46
_GENE_CACHE_TTL = 12 * 60 * 60  # seconds; the 12 h lifetime the module documents
_GENE_CACHE_MAX = 512  # same bound the former lru_cache(maxsize=512) used
_gene_cache: dict = {}  # (term, retmax) -> (monotonic timestamp, list of summaries)


async def search_gene(term: str, *, retmax: int = 5) -> List[Dict]:
    """Return list of gene summary dicts for *term* (Entrez Gene db).

    Runs ESearch to obtain gene IDs, then ESummary to fetch their summaries.
    Results are cached per ``(term, retmax)`` for 12 hours.

    ``functools.lru_cache`` cannot be used here: on an ``async def`` it
    stores the coroutine object, and awaiting that object a second time
    raises ``RuntimeError``. The awaited *result* is cached instead.

    Args:
        term: Search expression sent to ESearch.
        retmax: Maximum number of IDs requested from ESearch.

    Returns:
        One summary dict per hit, or an empty list when ESearch finds no IDs.
        A fresh list is returned on every call so callers may mutate it
        without touching the cache (the dicts themselves are shared).
    """
    import time  # local import: the module header does not import ``time``

    key = (term, retmax)
    now = time.monotonic()
    cached = _gene_cache.get(key)
    if cached is not None and now - cached[0] < _GENE_CACHE_TTL:
        return list(cached[1])

    es_params = {
        "db": "gene",
        "term": term,
        "retmode": "json",
        "retmax": retmax,
    }
    es_resp = await _request("esearch.fcgi", es_params)
    ids = es_resp.json().get("esearchresult", {}).get("idlist", [])

    genes: List[Dict] = []
    if ids:
        sum_params = {"db": "gene", "id": ",".join(ids), "retmode": "json"}
        sum_resp = await _request("esummary.fcgi", sum_params)
        data = sum_resp.json().get("result", {})
        # 'uids' is the index list, not a record; skip it
        genes = [v for k, v in data.items() if k != "uids"]

    # Bound the cache: drop the oldest inserted entry once it is full.
    if key not in _gene_cache and len(_gene_cache) >= _GENE_CACHE_MAX:
        _gene_cache.pop(next(iter(_gene_cache)))
    _gene_cache[key] = (now, genes)
    return list(genes)


# Keep the ``cache_clear()`` hook that the lru_cache wrapper used to expose.
search_gene.cache_clear = _gene_cache.clear
65
+
66
+
67
+ # ---------------------------------------------------------------------
68
+ # MeSH definition – cached 12 h
69
+ # ---------------------------------------------------------------------
70
_MESH_CACHE_TTL = 12 * 60 * 60  # seconds; the 12 h lifetime the module documents
_MESH_CACHE_MAX = 512  # same bound the former lru_cache(maxsize=512) used
_mesh_cache: dict = {}  # term -> (monotonic timestamp, definition string)


async def get_mesh_definition(term: str) -> str:
    """Return first MeSH definition string for *term* or ''.

    ESummary looks records up by ``id``, not by ``term``, so the term is
    first resolved to a MeSH UID with ESearch and the summary is then
    requested for that UID. Results are cached per term for 12 hours.

    ``functools.lru_cache`` cannot be used here: on an ``async def`` it
    stores the coroutine object, and awaiting that object a second time
    raises ``RuntimeError``. The awaited *result* is cached instead.

    Args:
        term: MeSH search expression.

    Returns:
        The record's scope note when present, otherwise its first entry in
        ``ds_meshterms``, otherwise ``""`` (also when nothing matches).
    """
    import time  # local import: the module header does not import ``time``

    now = time.monotonic()
    cached = _mesh_cache.get(term)
    if cached is not None and now - cached[0] < _MESH_CACHE_TTL:
        return cached[1]

    search_params = {
        "db": "mesh",
        "term": term,
        "retmode": "json",
        "retmax": 1,
    }
    search_resp = await _request("esearch.fcgi", search_params)
    ids = search_resp.json().get("esearchresult", {}).get("idlist", [])

    definition = ""
    if ids:
        params = {"db": "mesh", "id": ids[0], "retmode": "json"}
        resp = await _request("esummary.fcgi", params)
        data = resp.json().get("result", {})
        recs = [v for k, v in data.items() if k != "uids"]
        if recs:
            rec = recs[0]
            # NOTE(review): ``ds_scopenote`` is presumed to carry the MeSH
            # scope note in the ESummary JSON - confirm against a live
            # response. The original ``ds_meshterms[0]`` stays as fallback.
            terms = rec.get("ds_meshterms") or [""]  # also guards an empty list
            definition = rec.get("ds_scopenote") or terms[0]

    # Bound the cache: drop the oldest inserted entry once it is full.
    if term not in _mesh_cache and len(_mesh_cache) >= _MESH_CACHE_MAX:
        _mesh_cache.pop(next(iter(_mesh_cache)))
    _mesh_cache[term] = (now, definition)
    return definition


# Keep the ``cache_clear()`` hook that the lru_cache wrapper used to expose.
get_mesh_definition.cache_clear = _mesh_cache.clear
85
+
86
+
87
+ # ---------------------------------------------------------------------
88
+ # CLI demo
89
+ # ---------------------------------------------------------------------
90
if __name__ == "__main__":

    async def _demo():
        """Call both public helpers once against the live API and print a summary."""
        hits = await search_gene("TP53", retmax=3)
        if hits:
            first_name = hits[0]['name']
        else:
            first_name = 'None'
        print(f"Gene hits: {len(hits)} – {first_name}")

        definition = await get_mesh_definition("glioblastoma")
        preview = definition[:80]  # keep the console output to one short line
        print("MeSH def:", preview, "…")

    asyncio.run(_demo())