mgbam committed on
Commit
25f3b01
·
verified ·
1 Parent(s): 3b4ba75

Update mcp/pubmed.py

Browse files
Files changed (1) hide show
  1. mcp/pubmed.py +124 -85
mcp/pubmed.py CHANGED
@@ -1,85 +1,124 @@
1
- # mcp/pubmed.py
2
-
3
- import httpx
4
- import xmltodict
5
- import os
6
-
7
- PUBMED_ESEARCH = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
8
- PUBMED_EFETCH = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
9
- PUB_KEY = os.environ.get("PUB_KEY")
10
-
11
async def fetch_pubmed(query: str, max_results: int = 5):
    """Fetch PubMed articles matching *query* via ESearch followed by EFetch.

    Args:
        query: Search term, sent as the ESearch ``term`` parameter.
        max_results: Sent as the ESearch ``retmax`` parameter.

    Returns:
        A list of dicts with the keys ``title``, ``authors``, ``summary``,
        ``link``, ``published`` and ``source``. Empty when ESearch yields
        no IDs.

    Raises:
        httpx.HTTPStatusError: If either request returns a 4xx/5xx status.
    """

    def text_of(node) -> str:
        # xmltodict yields a dict (text under "#text") for elements carrying
        # attributes or inline markup, and None for empty elements.
        if isinstance(node, dict):
            node = node.get("#text")
        return "" if node is None else str(node)

    def listify(node) -> list:
        # A lone child element is parsed as a single value, not a list.
        if node is None:
            return []
        return node if isinstance(node, list) else [node]

    # Only send api_key when one is configured; a None value would otherwise
    # be serialised as an empty ``api_key=`` query parameter.
    key_param = {"api_key": PUB_KEY} if PUB_KEY else {}

    async with httpx.AsyncClient() as client:
        search_resp = await client.get(
            PUBMED_ESEARCH,
            params={
                "db": "pubmed",
                "term": query,
                "retmax": max_results,
                "retmode": "json",
                **key_param,
            },
        )
        search_resp.raise_for_status()
        ids = (search_resp.json().get("esearchresult") or {}).get("idlist") or []
        if not ids:
            return []

        fetch_resp = await client.get(
            PUBMED_EFETCH,
            params={
                "db": "pubmed",
                "id": ",".join(ids),
                "retmode": "xml",
                **key_param,
            },
        )
        fetch_resp.raise_for_status()

    article_set = (xmltodict.parse(fetch_resp.text) or {}).get("PubmedArticleSet") or {}
    results = []
    for entry in listify(article_set.get("PubmedArticle")):
        citation = entry["MedlineCitation"]
        art = citation["Article"]

        # Publication year: prefer ArticleDate, fall back to the journal issue.
        dates = listify(art.get("ArticleDate"))
        published = ""
        if dates and isinstance(dates[0], dict):
            published = text_of(dates[0].get("Year"))
        if not published:
            issue = (art.get("Journal") or {}).get("JournalIssue") or {}
            pubdate = issue.get("PubDate") or {}
            published = text_of(pubdate.get("Year") or pubdate.get("MedlineDate"))

        # Authors: keep entries that have both name parts, as before, but
        # report "Unknown" rather than an empty string when none qualify.
        people = listify((art.get("AuthorList") or {}).get("Author"))
        authors = ", ".join(
            f"{p['LastName']} {p['ForeName']}".strip()
            for p in people
            if isinstance(p, dict) and p.get("LastName") and p.get("ForeName")
        ) or "Unknown"

        # Abstract: one or more AbstractText segments, each str/dict/None.
        segments = listify((art.get("Abstract") or {}).get("AbstractText"))
        summary = " ".join(filter(None, map(text_of, segments)))

        pmid = text_of(citation["PMID"])

        results.append({
            "title": text_of(art.get("ArticleTitle")),
            "authors": authors,
            "summary": summary,
            "link": f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
            "published": published,
            "source": "PubMed"
        })
    return results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """MedGenesis – PubMed async fetcher (NCBI E-utilities).
3
+
4
+ Improvements
5
+ ~~~~~~~~~~~~
6
+ * Uses **ESearch → EFetch** pipeline with an explicit request timeout.
7
+ * Accepts optional `retmax` but caps at 25 to respect fair‑use.
8
+ * Caches parsed EFetch results in-process (ids string as key).
9
+ * Robust date / author / abstract extraction handles edge‑cases.
10
+ * Returns list of dicts ready for `schemas.Paper`.
11
+ """
12
+ from __future__ import annotations
13
+
14
+ import asyncio, os, time, xmltodict, httpx
15
+ from functools import lru_cache
16
+ from typing import List, Dict
17
+
18
+ _ESEARCH = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
19
+ _EFETCH = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
20
+ _API_KEY = os.getenv("PUB_KEY") # optional but higher rate limits if set
21
+
22
+ _TIMEOUT = 15
23
+ _MAX_RET = 25 # absolute hard‑cap
24
+
25
+ # ---------------------------------------------------------------------
26
+ # Helpers
27
+ # ---------------------------------------------------------------------
28
+
29
async def _esearch(query: str, retmax: int) -> List[str]:
    """Run an ESearch against the ``pubmed`` database and return matching IDs.

    Args:
        query: Search term, sent as the ``term`` parameter.
        retmax: Requested number of IDs; clamped to the range 0.._MAX_RET.

    Returns:
        The ``idlist`` from the JSON response, or ``[]`` when the query is
        blank, the clamped ``retmax`` is 0, or the payload has no ID list.

    Raises:
        httpx.HTTPStatusError: If the response status is 4xx/5xx.
    """
    limit = max(0, min(retmax, _MAX_RET))
    if limit == 0 or not query.strip():
        # Nothing to search for / nothing requested: skip the round trip.
        return []

    params = {
        "db": "pubmed",
        "term": query,
        "retmax": limit,
        "retmode": "json",
    }
    if _API_KEY:
        params["api_key"] = _API_KEY

    async with httpx.AsyncClient(timeout=_TIMEOUT) as cli:
        r = await cli.get(_ESEARCH, params=params)
        r.raise_for_status()

    # Tolerate payloads without "esearchresult" instead of raising KeyError.
    return (r.json().get("esearchresult") or {}).get("idlist") or []
42
+
43
+
44
# Parsed EFetch results keyed by the comma-separated ID string. Each value is
# a ``(stored_at, articles)`` pair; ``stored_at`` is a ``time.monotonic()``
# reading. ``functools.lru_cache`` is not usable here: on an ``async def`` it
# would cache the coroutine object, which can only be awaited once.
_EFETCH_CACHE: dict = {}
_EFETCH_CACHE_TTL = 12 * 60 * 60  # seconds
_EFETCH_CACHE_MAX = 128


async def _efetch(ids: str) -> List[Dict]:
    """Fetch and parse PubMed records for a comma-separated ID string.

    Results are cached in-process per ``ids`` string for
    ``_EFETCH_CACHE_TTL`` seconds; at most ``_EFETCH_CACHE_MAX`` entries are
    kept, evicting the oldest insertion first.

    Args:
        ids: Comma-separated PubMed IDs, sent as the EFetch ``id`` parameter.

    Returns:
        A list of ``PubmedArticle`` dicts as produced by ``xmltodict``;
        empty when the response contains none.

    Raises:
        httpx.HTTPStatusError: If the response status is 4xx/5xx.
    """
    cached = _EFETCH_CACHE.get(ids)
    if cached is not None and time.monotonic() - cached[0] < _EFETCH_CACHE_TTL:
        return cached[1]

    params = {
        "db": "pubmed",
        "id": ids,
        "retmode": "xml",
    }
    if _API_KEY:
        params["api_key"] = _API_KEY

    async with httpx.AsyncClient(timeout=_TIMEOUT) as cli:
        r = await cli.get(_EFETCH, params=params)
        r.raise_for_status()

    # An empty <PubmedArticleSet/> parses to None, so guard with `or`.
    article_set = (xmltodict.parse(r.text) or {}).get("PubmedArticleSet") or {}
    parsed = article_set.get("PubmedArticle") or []
    # A single article is parsed as a dict rather than a one-element list.
    articles = parsed if isinstance(parsed, list) else [parsed]

    # Drop any stale entry first so the refreshed one counts as newest.
    _EFETCH_CACHE.pop(ids, None)
    while len(_EFETCH_CACHE) >= _EFETCH_CACHE_MAX:
        _EFETCH_CACHE.pop(next(iter(_EFETCH_CACHE)))
    _EFETCH_CACHE[ids] = (time.monotonic(), articles)
    return articles
60
+
61
+
62
+ # ---------------------------------------------------------------------
63
+ # Public API
64
+ # ---------------------------------------------------------------------
65
+
66
def _node_text(node) -> str:
    """Flatten an xmltodict node (str, dict with ``#text``, or None) to text."""
    if isinstance(node, dict):
        # Elements with attributes or inline markup parse to a dict.
        node = node.get("#text")
    return "" if node is None else str(node)


def _node_list(node) -> list:
    """Wrap a lone xmltodict child in a list; map a missing child to ``[]``."""
    if node is None:
        return []
    return node if isinstance(node, list) else [node]


async def fetch_pubmed(query: str, *, max_results: int = 5) -> List[Dict]:
    """Search PubMed for *query* and return the hits as simple dicts.

    Args:
        query: Search term handed to ``_esearch``.
        max_results: Requested number of papers (``_esearch`` caps it at
            ``_MAX_RET``).

    Returns:
        One dict per article with the keys ``title``, ``authors``,
        ``summary``, ``link``, ``published`` and ``source``. Empty when the
        search yields no IDs.
    """
    ids = await _esearch(query, max_results)
    if not ids:
        return []

    articles = await _efetch(",".join(ids))
    results: List[Dict] = []

    for art in articles:
        citation = art["MedlineCitation"]
        meta = citation["Article"]
        pmid = _node_text(citation["PMID"])

        # Title: may be a dict (inline markup) or None (empty element).
        title = _node_text(meta.get("ArticleTitle")) or "[No title]"

        # Authors: keep entries that have a LastName; ForeName is optional.
        authors_raw = _node_list((meta.get("AuthorList") or {}).get("Author"))
        authors = ", ".join(
            f"{a.get('LastName') or ''} {a.get('ForeName') or ''}".strip()
            for a in authors_raw
            if isinstance(a, dict) and a.get("LastName")
        ) or "Unknown"

        # Abstract: one or more AbstractText segments, each str/dict/None.
        segments = _node_list((meta.get("Abstract") or {}).get("AbstractText"))
        summary = " ".join(filter(None, map(_node_text, segments)))

        # Published: prefer ArticleDate, fall back to the journal issue date.
        published = ""
        art_dates = _node_list(meta.get("ArticleDate"))
        if art_dates and isinstance(art_dates[0], dict):
            published = _node_text(art_dates[0].get("Year"))
        if not published:
            issue = (meta.get("Journal") or {}).get("JournalIssue") or {}
            pubdate = issue.get("PubDate") or {}
            published = _node_text(
                pubdate.get("Year") or pubdate.get("MedlineDate")
            )

        results.append({
            "title": title,
            "authors": authors,
            "summary": summary,
            "link": f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
            "published": published,
            "source": "PubMed",
        })

    return results