mgbam committed (verified)
Commit ff77b73 · 1 Parent(s): 834aa30

Rename pubmed_utils.py to pubmed_rag.py

Files changed (2)
  1. pubmed_rag.py +195 -0
  2. pubmed_utils.py +0 -84
pubmed_rag.py ADDED
@@ -0,0 +1,195 @@
+ import requests
+ import nltk
+ nltk.download("punkt")
+ from nltk.tokenize import sent_tokenize
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+
+ from transformers import pipeline, AutoTokenizer, AutoModel
+ from sentence_transformers import SentenceTransformer
+ import os
+ import faiss
+ import numpy as np
+ import json
+
+ from config import (
+     PUBMED_EMAIL,
+     MAX_PUBMED_RESULTS,
+     DEFAULT_SUMMARIZATION_CHUNK,
+     VECTORDB_PATH,
+     EMBEDDING_MODEL_NAME
+ )
+
+ ###############################################################################
+ #                        SUMMARIZATION & EMBEDDINGS                           #
+ ###############################################################################
+
+ summarizer = pipeline(
+     "summarization",
+     model="facebook/bart-large-cnn",
+     tokenizer="facebook/bart-large-cnn",
+ )
+
+ embed_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
+
+ ###############################################################################
+ #                          PUBMED UTIL FUNCTIONS                              #
+ ###############################################################################
+
+ def search_pubmed(query, max_results=MAX_PUBMED_RESULTS):
+     """
+     Search PubMed for PMIDs matching a query. Returns a list of PMIDs.
+     """
+     url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
+     params = {
+         "db": "pubmed",
+         "term": query,
+         "retmax": max_results,
+         "retmode": "json",
+         "tool": "AdvancedMedicalAI",
+         "email": PUBMED_EMAIL
+     }
+     resp = requests.get(url, params=params)
+     resp.raise_for_status()
+     data = resp.json()
+     return data.get("esearchresult", {}).get("idlist", [])
+
+ def fetch_abstract(pmid):
+     """
+     Fetches an abstract for a single PMID via EFetch.
+     """
+     url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
+     params = {
+         "db": "pubmed",
+         "id": pmid,
+         "retmode": "text",
+         "rettype": "abstract",
+         "tool": "AdvancedMedicalAI",
+         "email": PUBMED_EMAIL
+     }
+     resp = requests.get(url, params=params)
+     resp.raise_for_status()
+     return resp.text.strip()
+
+ def fetch_pubmed_abstracts(pmids):
+     """
+     Parallel fetch for multiple PMIDs. Returns dict {pmid: text}.
+     """
+     results = {}
+     with ThreadPoolExecutor(max_workers=min(len(pmids), 5)) as executor:
+         future_to_pmid = {executor.submit(fetch_abstract, pmid): pmid for pmid in pmids}
+         for future in as_completed(future_to_pmid):
+             pmid = future_to_pmid[future]
+             try:
+                 results[pmid] = future.result()
+             except Exception as e:
+                 results[pmid] = f"Error fetching PMID {pmid}: {str(e)}"
+     return results
+
+ ###############################################################################
+ #                         SUMMARIZE & CHUNK TEXT                              #
+ ###############################################################################
+
+ def chunk_and_summarize(raw_text, chunk_size=DEFAULT_SUMMARIZATION_CHUNK):
+     """
+     Splits large text into chunks by sentences, then summarizes each chunk, merging results.
+     """
+     sentences = sent_tokenize(raw_text)
+     chunks = []
+     current_chunk = []
+     current_length = 0
+
+     for sent in sentences:
+         token_count = len(sent.split())
+         if current_length + token_count > chunk_size:
+             chunks.append(" ".join(current_chunk))
+             current_chunk = []
+             current_length = 0
+         current_chunk.append(sent)
+         current_length += token_count
+
+     if current_chunk:
+         chunks.append(" ".join(current_chunk))
+
+     summary_list = []
+     for c in chunks:
+         summ = summarizer(c, max_length=100, min_length=30, do_sample=False)[0]["summary_text"]
+         summary_list.append(summ)
+     return " ".join(summary_list)
+
+ ###############################################################################
+ #                    SIMPLE VECTOR STORE (FAISS) FOR RAG                      #
+ ###############################################################################
+
+ def create_or_load_faiss_index():
+     """
+     Creates a new FAISS index or loads from disk if it exists.
+     """
+     index_path = os.path.join(VECTORDB_PATH, "faiss_index.bin")
+     meta_path = os.path.join(VECTORDB_PATH, "faiss_meta.json")
+
+     if not os.path.exists(VECTORDB_PATH):
+         os.makedirs(VECTORDB_PATH)
+
+     if os.path.exists(index_path) and os.path.exists(meta_path):
+         # Load existing index
+         index = faiss.read_index(index_path)
+         with open(meta_path, "r") as f:
+             meta_data = json.load(f)
+         return index, meta_data
+     else:
+         # Create new index
+         index = faiss.IndexFlatL2(embed_model.get_sentence_embedding_dimension())
+         meta_data = {}
+         return index, meta_data
+
+ def save_faiss_index(index, meta_data):
+     """
+     Saves the FAISS index and metadata to disk.
+     """
+     index_path = os.path.join(VECTORDB_PATH, "faiss_index.bin")
+     meta_path = os.path.join(VECTORDB_PATH, "faiss_meta.json")
+
+     faiss.write_index(index, index_path)
+     with open(meta_path, "w") as f:
+         json.dump(meta_data, f)
+
+ def upsert_documents(docs):
+     """
+     Takes in a dict of {pmid: text}, embeds and upserts them into the FAISS index.
+     Each doc is stored in 'meta_data' with pmid as key.
+     """
+     index, meta_data = create_or_load_faiss_index()
+
+     texts = list(docs.values())
+     pmids = list(docs.keys())
+
+     embeddings = embed_model.encode(texts, convert_to_numpy=True)
+     index.add(embeddings)
+
+     # Maintain a simple meta_data: { int_id: { 'pmid': X, 'text': Y } }
+     # Where int_id is the row in the index
+     start_id = len(meta_data)
+     for i, pmid in enumerate(pmids):
+         meta_data[str(start_id + i)] = {"pmid": pmid, "text": texts[i]}
+
+     save_faiss_index(index, meta_data)
+
+ def semantic_search(query, top_k=3):
+     """
+     Embeds 'query' and searches the FAISS index for top_k similar docs.
+     Returns a list of dict with 'pmid' and 'text'.
+     """
+     index, meta_data = create_or_load_faiss_index()
+
+     query_embedding = embed_model.encode([query], convert_to_numpy=True)
+     distances, indices = index.search(query_embedding, top_k)
+
+     results = []
+     for dist, idx_list in zip(distances, indices):
+         for d, i in zip(dist, idx_list):
+             # i is row in the index, look up meta_data
+             doc_info = meta_data[str(i)]
+             results.append({"pmid": doc_info["pmid"], "text": doc_info["text"], "score": float(d)})
+     # Sort by ascending distance => best match first
+     results.sort(key=lambda x: x["score"])
+     return results
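
The new module pulls its settings from config.py, which is not part of this diff. A minimal sketch of values that would satisfy the imports above — the constant names come from the diff, but every concrete value here is an illustrative assumption, not something defined by this commit:

# config.py — hypothetical example values; adjust for your own deployment
PUBMED_EMAIL = "you@example.org"           # contact email requested by NCBI E-utilities
MAX_PUBMED_RESULTS = 5                     # default retmax passed to ESearch
DEFAULT_SUMMARIZATION_CHUNK = 512          # approximate word budget per summarization chunk
VECTORDB_PATH = "vectordb"                 # directory holding faiss_index.bin / faiss_meta.json
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"  # any sentence-transformers model name
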
pubmed_utils.py DELETED
@@ -1,84 +0,0 @@
- import requests
- from concurrent.futures import ThreadPoolExecutor, as_completed
- from transformers import pipeline
- from config import PUBMED_EMAIL, CHUNK_SIZE
-
- # Summarization pipeline
- summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
-
-
- def search_pubmed(query, max_results=5):
-     """
-     Search PubMed for PMIDs matching the query.
-     """
-     url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
-     params = {
-         "db": "pubmed",
-         "term": query,
-         "retmax": max_results,
-         "retmode": "json",
-         "tool": "MedicalAI",
-         "email": PUBMED_EMAIL,
-     }
-     response = requests.get(url, params=params)
-     response.raise_for_status()
-     return response.json().get("esearchresult", {}).get("idlist", [])
-
-
- def fetch_abstract(pmid):
-     """
-     Fetch abstract for a given PubMed ID.
-     """
-     url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
-     params = {
-         "db": "pubmed",
-         "id": pmid,
-         "retmode": "text",
-         "rettype": "abstract",
-         "tool": "MedicalAI",
-         "email": PUBMED_EMAIL,
-     }
-     response = requests.get(url, params=params)
-     response.raise_for_status()
-     return response.text.strip()
-
-
- def fetch_pubmed_abstracts(pmids):
-     """
-     Fetch multiple PubMed abstracts concurrently.
-     """
-     results = {}
-     with ThreadPoolExecutor(max_workers=5) as executor:
-         future_to_pmid = {executor.submit(fetch_abstract, pmid): pmid for pmid in pmids}
-         for future in as_completed(future_to_pmid):
-             pmid = future_to_pmid[future]
-             try:
-                 results[pmid] = future.result()
-             except Exception as e:
-                 results[pmid] = f"Error fetching PMID {pmid}: {str(e)}"
-     return results
-
-
- def summarize_text(text, chunk_size=CHUNK_SIZE):
-     """
-     Summarize long text using a chunking strategy.
-     """
-     sentences = text.split(". ")
-     chunks = []
-     current_chunk = []
-     current_length = 0
-
-     for sentence in sentences:
-         tokens = len(sentence.split())
-         if current_length + tokens > chunk_size:
-             chunks.append(" ".join(current_chunk))
-             current_chunk = []
-             current_length = 0
-         current_chunk.append(sentence)
-         current_length += tokens
-
-     if current_chunk:
-         chunks.append(" ".join(current_chunk))
-
-     summaries = [summarizer(chunk, max_length=100, min_length=30)[0]["summary_text"] for chunk in chunks]
-     return " ".join(summaries)