Update mcp/pubmed.py
Browse files- mcp/pubmed.py +124 -85
mcp/pubmed.py
CHANGED
@@ -1,85 +1,124 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""MedGenesis – PubMed async fetcher (NCBI E-utilities).
|
3 |
+
|
4 |
+
Improvements
|
5 |
+
~~~~~~~~~~~~
|
6 |
+
* Uses **ESearch → EFetch** pipeline with sane timeouts & retries.
|
7 |
+
* Accepts optional `retmax` but caps at 25 to respect fair‑use.
|
8 |
+
* Caches EFetch XML for 12 h via `lru_cache` (ids string as key).
|
9 |
+
* Robust date / author / abstract extraction handles edge‑cases.
|
10 |
+
* Returns list of dicts ready for `schemas.Paper`.
|
11 |
+
"""
|
12 |
+
from __future__ import annotations
|
13 |
+
|
14 |
+
import asyncio, os, time, xmltodict, httpx
|
15 |
+
from functools import lru_cache
|
16 |
+
from typing import List, Dict
|
17 |
+
|
18 |
+
_ESEARCH = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
|
19 |
+
_EFETCH = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
|
20 |
+
_API_KEY = os.getenv("PUB_KEY") # optional but higher rate limits if set
|
21 |
+
|
22 |
+
_TIMEOUT = 15
|
23 |
+
_MAX_RET = 25 # absolute hard‑cap
|
24 |
+
|
25 |
+
# ---------------------------------------------------------------------
|
26 |
+
# Helpers
|
27 |
+
# ---------------------------------------------------------------------
|
28 |
+
|
29 |
+
async def _esearch(query: str, retmax: int) -> List[str]:
    """Return a list of PMIDs matching *query* via the NCBI ESearch endpoint.

    Parameters
    ----------
    query:
        Raw PubMed search term.
    retmax:
        Requested number of IDs; hard-capped at ``_MAX_RET`` (fair-use).

    Returns
    -------
    list[str]
        PMIDs (possibly empty when the query matches nothing).

    Raises
    ------
    httpx.HTTPError
        When all retry attempts fail (transport error or non-2xx status).

    Notes
    -----
    The module docstring promises "timeouts & retries"; this implements the
    retry half — up to 3 attempts with linear backoff.
    """
    params = {
        "db"     : "pubmed",
        "term"   : query,
        "retmax" : min(retmax, _MAX_RET),   # never exceed the module hard-cap
        "retmode": "json",
    }
    if _API_KEY:
        # Optional key raises NCBI's per-second rate limit.
        params["api_key"] = _API_KEY

    last_err: Exception | None = None
    async with httpx.AsyncClient(timeout=_TIMEOUT) as cli:
        for attempt in range(3):
            try:
                r = await cli.get(_ESEARCH, params=params)
                r.raise_for_status()
                return r.json()["esearchresult"].get("idlist", [])
            except httpx.HTTPError as err:      # covers transport + status errors
                last_err = err
                await asyncio.sleep(0.5 * (attempt + 1))  # linear backoff
    raise last_err  # all attempts exhausted
|
42 |
+
|
43 |
+
|
44 |
+
# Manual TTL cache: ids-string -> (fetch_timestamp, parsed article list).
# NOTE: functools.lru_cache CANNOT wrap a coroutine function — it would cache
# the coroutine object itself, which raises "cannot reuse already awaited
# coroutine" on the second hit.  A plain dict with a timestamp gives the
# intended 12 h caching behaviour safely.
_EFETCH_TTL_S = 12 * 60 * 60
_EFETCH_CACHE: Dict[str, tuple] = {}


async def _efetch(ids: str) -> List[Dict]:
    """Fetch XML for comma-separated *ids*, return list of article dict chunks.

    Results are cached in-process for 12 hours, keyed on the ids string.

    Raises
    ------
    httpx.HTTPError
        On transport failure or non-2xx response.
    """
    cached = _EFETCH_CACHE.get(ids)
    if cached is not None and (time.time() - cached[0]) < _EFETCH_TTL_S:
        return cached[1]

    params = {
        "db"     : "pubmed",
        "id"     : ids,
        "retmode": "xml",
    }
    if _API_KEY:
        params["api_key"] = _API_KEY
    async with httpx.AsyncClient(timeout=_TIMEOUT) as cli:
        r = await cli.get(_EFETCH, params=params)
        r.raise_for_status()
        xml = r.text

    # xmltodict returns a dict for a single article, a list for several —
    # normalise to a list either way.
    parsed = xmltodict.parse(xml).get("PubmedArticleSet", {}).get("PubmedArticle", [])
    articles = parsed if isinstance(parsed, list) else [parsed]
    _EFETCH_CACHE[ids] = (time.time(), articles)
    return articles
|
60 |
+
|
61 |
+
|
62 |
+
# ---------------------------------------------------------------------
|
63 |
+
# Public API
|
64 |
+
# ---------------------------------------------------------------------
|
65 |
+
|
66 |
+
def _text(node) -> str:
    """Collapse an xmltodict node (str | dict | None) into plain text.

    xmltodict yields a ``{"#text": ...}`` dict when an element carries
    attributes or inline markup (common for titles/abstract sections).
    """
    if isinstance(node, dict):
        return str(node.get("#text", "")).strip()
    return str(node).strip() if node is not None else ""


def _parse_article(art: Dict) -> Dict | None:
    """Convert one xmltodict ``PubmedArticle`` chunk into a flat paper dict.

    Returns ``None`` for malformed records (e.g. entries without an
    ``Article`` body), so the caller can skip them instead of crashing.
    """
    citation = art.get("MedlineCitation") or {}
    meta = citation.get("Article")
    if not isinstance(meta, dict):
        return None  # defensive: book/corrigendum records lack "Article"

    pmid = citation.get("PMID", "")
    pmid = pmid.get("#text") if isinstance(pmid, dict) else str(pmid)

    # Title — may be a plain string OR a dict when it carries inline markup.
    title = _text(meta.get("ArticleTitle")) or "[No title]"

    # Authors -----------------------------------------------------
    # Empty <AuthorList/> elements map to None, hence the `or {}` guard.
    authors_raw = (meta.get("AuthorList") or {}).get("Author", [])
    if isinstance(authors_raw, dict):
        authors_raw = [authors_raw]
    authors = ", ".join(
        f"{a.get('LastName', '')} {a.get('ForeName', '')}".strip()
        for a in authors_raw
        if isinstance(a, dict) and a.get("LastName")  # skip CollectiveName etc.
    ) or "Unknown"

    # Abstract ----------------------------------------------------
    abstr = (meta.get("Abstract") or {}).get("AbstractText", "")
    if isinstance(abstr, list):
        # Structured abstracts arrive as a list of labelled sections.
        summary = " ".join(
            seg.get("#text", str(seg)) if isinstance(seg, dict) else str(seg)
            for seg in abstr
        )
    elif isinstance(abstr, dict):
        summary = abstr.get("#text", "")
    else:
        summary = abstr or ""

    # Published date ---------------------------------------------
    # Prefer the electronic ArticleDate; fall back to the journal PubDate.
    published = ""
    art_date = meta.get("ArticleDate")
    if isinstance(art_date, dict):
        published = art_date.get("Year", "")
    elif isinstance(art_date, list) and art_date:
        published = art_date[0].get("Year", "")
    if not published:
        journal = meta.get("Journal") or {}
        issue = journal.get("JournalIssue") or {}
        pubdate = issue.get("PubDate") or {}
        published = pubdate.get("Year") or pubdate.get("MedlineDate", "")

    return {
        "title"    : title,
        "authors"  : authors,
        "summary"  : summary,
        "link"     : f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
        "published": published,
        "source"   : "PubMed",
    }


async def fetch_pubmed(query: str, *, max_results: int = 5) -> List[Dict]:
    """Return latest PubMed papers for *query* as simple dicts.

    Parameters
    ----------
    query:
        PubMed search term.
    max_results:
        Desired number of papers (capped upstream at 25).

    Returns
    -------
    list[dict]
        One dict per paper with keys ``title``, ``authors``, ``summary``,
        ``link``, ``published``, ``source`` — ready for ``schemas.Paper``.
        Malformed records are silently skipped.
    """
    ids = await _esearch(query, max_results)
    if not ids:
        return []

    articles = await _efetch(",".join(ids))
    results: List[Dict] = []
    for art in articles:
        paper = _parse_article(art)
        if paper is not None:
            results.append(paper)
    return results
|