#!/usr/bin/env python3 """ MedGenesis – dual‑LLM asynchronous orchestrator ============================================== • Accepts `llm` argument ("openai" | "gemini"), defaults to "openai". • Harvests literature (PubMed + arXiv) → extracts keywords. • Fans‑out to open APIs for genes, trials, safety, ontology: – **MyGene.info** for live gene annotations – **ClinicalTrials.gov v2** for recruiting & completed studies – UMLS / openFDA / DisGeNET / MeSH (existing helpers) – Optional Open Targets & DrugCentral via `multi_enrich` if needed. • Returns a single JSON‑serialisable dict consumed by the Streamlit UI. """ from __future__ import annotations import asyncio from typing import Any, Dict, List # ── Literature fetchers ───────────────────────────────────────────── from mcp.arxiv import fetch_arxiv from mcp.pubmed import fetch_pubmed # ── NLP & legacy enrichers ───────────────────────────────────────── from mcp.nlp import extract_keywords from mcp.umls import lookup_umls from mcp.openfda import fetch_drug_safety from mcp.ncbi import search_gene, get_mesh_definition from mcp.disgenet import disease_to_genes # ── Modern high‑throughput APIs ──────────────────────────────────── from mcp.mygene import fetch_gene_info # MyGene.info from mcp.ctgov import search_trials_v2 # ClinicalTrials.gov v2 # from mcp.targets import fetch_ot_associations # (optional future use) # ── LLM utilities ───────────────────────────────────────────────── from mcp.openai_utils import ai_summarize, ai_qa from mcp.gemini import gemini_summarize, gemini_qa # ------------------------------------------------------------------ # LLM router # ------------------------------------------------------------------ def _get_llm(llm: str): """Return (summarize_fn, qa_fn) based on requested engine.""" if llm and llm.lower() == "gemini": return gemini_summarize, gemini_qa return ai_summarize, ai_qa # default → OpenAI # ------------------------------------------------------------------ # Helper: batch NCBI / MeSH / DisGeNET enrichment for keyword list # ------------------------------------------------------------------ async def _enrich_ncbi_mesh_disg(keys: List[str]) -> Dict[str, Any]: jobs = [search_gene(k) for k in keys] + \ [get_mesh_definition(k) for k in keys] + \ [disease_to_genes(k) for k in keys] results = await asyncio.gather(*jobs, return_exceptions=True) genes, mesh_defs, disg_links = [], [], [] n = len(keys) for idx, res in enumerate(results): if isinstance(res, Exception): continue bucket = idx // n # 0 = gene, 1 = mesh, 2 = disg if bucket == 0: genes.extend(res) elif bucket == 1: mesh_defs.append(res) else: disg_links.extend(res) return {"genes": genes, "meshes": mesh_defs, "disgenet": disg_links} # ------------------------------------------------------------------ # Main orchestrator # ------------------------------------------------------------------ async def orchestrate_search(query: str, *, llm: str = "openai") -> Dict[str, Any]: """Master async pipeline – returns dict consumed by UI.""" # 1) Literature -------------------------------------------------- arxiv_task = asyncio.create_task(fetch_arxiv(query)) pubmed_task = asyncio.create_task(fetch_pubmed(query)) papers = sum(await asyncio.gather(arxiv_task, pubmed_task), []) # 2) Keyword extraction ----------------------------------------- corpus = " ".join(p["summary"] for p in papers) keywords = extract_keywords(corpus)[:8] # 3) Fan‑out enrichment ----------------------------------------- umls_tasks = [lookup_umls(k) for k in keywords] fda_tasks = [fetch_drug_safety(k) for k in keywords] ncbi_task = asyncio.create_task(_enrich_ncbi_mesh_disg(keywords)) mygene_task = asyncio.create_task(fetch_gene_info(query)) # top gene hit trials_task = asyncio.create_task(search_trials_v2(query, max_n=20)) umls, fda, ncbi_data, mygene, trials = await asyncio.gather( asyncio.gather(*umls_tasks, return_exceptions=True), asyncio.gather(*fda_tasks, return_exceptions=True), ncbi_task, mygene_task, trials_task, ) # 4) LLM summary ------------------------------------------------- summarize_fn, _ = _get_llm(llm) ai_summary = await summarize_fn(corpus) # 5) Assemble payload ------------------------------------------- return { "papers" : papers, "umls" : umls, "drug_safety" : fda, "ai_summary" : ai_summary, "llm_used" : llm.lower(), # Gene & variant context "genes" : (ncbi_data["genes"] or []) + ([mygene] if mygene else []), "mesh_defs" : ncbi_data["meshes"], "gene_disease" : ncbi_data["disgenet"], # Clinical trials "clinical_trials": trials, } # ------------------------------------------------------------------ async def answer_ai_question(question: str, *, context: str, llm: str = "openai") -> Dict[str, str]: """One‑shot follow‑up QA using selected engine.""" _, qa_fn = _get_llm(llm) return {"answer": await qa_fn(question, context)}