Update app.py
app.py
CHANGED
@@ -9,12 +9,15 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 import streamlit as st
 import pandas as pd
 
+# Set page config FIRST, before any other Streamlit calls:
+st.set_page_config(page_title="Enhanced RAG + PubMed", layout="wide")
+
 # NLP
 import nltk
 nltk.download('punkt')
 from nltk.tokenize import sent_tokenize
 
-#
+# Transformers for summarization
 from transformers import pipeline
 
 # Optional: OpenAI and Google Generative AI
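Moving `st.set_page_config()` up to import time is the heart of this change: Streamlit requires it to be the first Streamlit command executed in the script and raises `StreamlitAPIException` otherwise. A minimal standalone sketch of the rule:

```python
import streamlit as st

# Must be the very first st.* call in the script, and may only run once;
# any earlier st.* call makes Streamlit raise StreamlitAPIException.
st.set_page_config(page_title="Enhanced RAG + PubMed", layout="wide")

st.title("Every other Streamlit call can safely follow")
```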
@@ -24,30 +27,17 @@ import google.generativeai as genai
 ###############################################################################
 #                              CONFIG & ENV                                   #
 ###############################################################################
-
-In your Hugging Face Space:
-1. Add environment secrets:
-   - OPENAI_API_KEY (if using OpenAI)
-   - GEMINI_API_KEY (if using Google PaLM/Gemini)
-   - MY_PUBMED_EMAIL (to identify yourself to NCBI)
-2. In requirements.txt, install:
-   - streamlit
-   - requests
-   - nltk
-   - transformers
-   - torch
-   - openai (if using OpenAI)
-   - google-generativeai (if using Gemini)
-   - pandas
-"""
+
 
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
 GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
 MY_PUBMED_EMAIL = os.getenv("MY_PUBMED_EMAIL", "[email protected]")
 
+# Configure OpenAI if key is provided
 if OPENAI_API_KEY:
     openai.api_key = OPENAI_API_KEY
 
+# Configure Google PaLM / Gemini if key is provided
 if GEMINI_API_KEY:
     genai.configure(api_key=GEMINI_API_KEY)
 
@@ -58,12 +48,12 @@ if GEMINI_API_KEY:
 def load_summarizer():
     """
     Load a summarization model (e.g., BART, PEGASUS, T5).
-    For a more concise summarization, consider
+    For a more concise summarization, consider 'google/pegasus-xsum'.
     For a balanced approach, 'facebook/bart-large-cnn' is popular.
     """
     return pipeline(
-        "summarization",
-        model="facebook/bart-large-cnn",
+        "summarization",
+        model="facebook/bart-large-cnn",
         tokenizer="facebook/bart-large-cnn"
     )
 
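For reference, the pipeline returned here is invoked elsewhere in app.py with explicit length bounds; a self-contained usage sketch with placeholder input text, using the same checkpoint as above:

```python
from transformers import pipeline

summarizer = pipeline(
    "summarization",
    model="facebook/bart-large-cnn",
    tokenizer="facebook/bart-large-cnn",
)

text = (
    "Background: Statins are widely prescribed for cardiovascular prevention. "
    "Methods: We pooled data from twelve randomized trials. "
    "Results: Treatment reduced major events with no excess in serious harms."
)
out = summarizer(text, max_length=100, min_length=30, do_sample=False)
print(out[0]["summary_text"])
```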
@@ -109,11 +99,9 @@ def fetch_one_abstract(pmid):
     resp = requests.get(base_url, params=params)
     resp.raise_for_status()
     raw_text = resp.text.strip()
-
-    # If there's no clear text returned, mark as empty
+
     if not raw_text:
         return (pmid, "No abstract text found.")
-
     return (pmid, raw_text)
 
 def fetch_pubmed_abstracts(pmids):
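The hunk does not show how `base_url` and `params` are built; for a plain-text abstract fetch they would plausibly be the standard NCBI EFetch endpoint and its documented parameters. A hedged sketch with a hypothetical PMID (the exact values in app.py may differ):

```python
import os
import requests

MY_PUBMED_EMAIL = os.getenv("MY_PUBMED_EMAIL", "")
pmid = "31452104"  # hypothetical example PMID

base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
params = {
    "db": "pubmed",            # query the PubMed database
    "id": pmid,                # one or more comma-separated PMIDs
    "rettype": "abstract",     # return the abstract section
    "retmode": "text",         # as plain text rather than XML
    "email": MY_PUBMED_EMAIL,  # identifies the caller to NCBI, per their usage policy
}
resp = requests.get(base_url, params=params)
resp.raise_for_status()
print(resp.text.strip()[:200])
```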
@@ -122,6 +110,9 @@ def fetch_pubmed_abstracts(pmids):
     Returns {pmid: abstract_text}.
     """
     abstracts_map = {}
+    if not pmids:
+        return abstracts_map
+
     with ThreadPoolExecutor(max_workers=min(len(pmids), 5)) as executor:
         future_to_pmid = {executor.submit(fetch_one_abstract, pmid): pmid for pmid in pmids}
         for future in as_completed(future_to_pmid):
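The new early return is more than tidiness: with an empty `pmids` list, `min(len(pmids), 5)` evaluates to 0, and `ThreadPoolExecutor` rejects a zero worker count. A standalone demonstration of the failure the guard prevents:

```python
from concurrent.futures import ThreadPoolExecutor

pmids = []
try:
    with ThreadPoolExecutor(max_workers=min(len(pmids), 5)) as executor:
        pass
except ValueError as exc:
    print(exc)  # "max_workers must be greater than 0"
```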
@@ -142,10 +133,9 @@ def chunk_and_summarize(abstract_text, chunk_size=512):
     then summarizes each chunk with the Hugging Face pipeline.
     Returns a combined summary for the entire abstract.
     """
-    # We first split by sentences
     sentences = sent_tokenize(abstract_text)
     chunks = []
-
+
     current_chunk = []
     current_length = 0
     for sent in sentences:
@@ -155,6 +145,7 @@ def chunk_and_summarize(abstract_text, chunk_size=512):
             chunks.append(" ".join(current_chunk))
             current_chunk = []
             current_length = 0
+
         current_chunk.append(sent)
         current_length += tokens_in_sent
 
@@ -162,18 +153,16 @@ def chunk_and_summarize(abstract_text, chunk_size=512):
     if current_chunk:
         chunks.append(" ".join(current_chunk))
 
-    # Summarize each chunk to avoid hitting token or length constraints
     summarized_pieces = []
     for c in chunks:
         summary_out = summarizer(
             c,
-            max_length=100,
+            max_length=100,  # Tweak for desired summary length
             min_length=30,
             do_sample=False
         )
         summarized_pieces.append(summary_out[0]['summary_text'])
-
-    # Combine partial summaries into one final text
+
     final_summary = " ".join(summarized_pieces)
     return final_summary.strip()
 
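`tokens_in_sent` is computed on a line outside this diff; a common approximation is a per-sentence word count, which is what the sketch below assumes (app.py may instead use the model tokenizer):

```python
import nltk

nltk.download("punkt", quiet=True)
from nltk.tokenize import sent_tokenize, word_tokenize

abstract_text = (
    "Aspirin reduced event rates in the treatment arm. "
    "Adverse effects were mild and transient."
)
for sent in sent_tokenize(abstract_text):
    tokens_in_sent = len(word_tokenize(sent))  # rough per-sentence token count
    print(tokens_in_sent, sent)
```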
@@ -218,17 +207,17 @@ def gemini_chat(system_prompt, user_message, model_name="models/chat-bison-001",
 ###############################################################################
 def build_system_prompt_with_refs(pmids, summarized_map):
     """
-    Creates a system prompt that includes the summarized abstracts alongside
-    labeled references.
+    Creates a system prompt that includes the summarized abstracts alongside
+    labeled references (e.g., [Ref1]) so the LLM can cite them in the final answer.
     """
-    # Example of labeling references: [Ref1], [Ref2], etc.
     system_context = (
         "You have access to the following summarized PubMed articles. "
-        "When relevant, cite them
+        "When relevant, cite them using their reference label.\n\n"
     )
     for idx, pmid in enumerate(pmids, start=1):
         ref_label = f"[Ref{idx}]"
         system_context += f"{ref_label} (PMID {pmid}): {summarized_map[pmid]}\n\n"
+
     system_context += "Use this contextual info to provide a concise, evidence-based answer."
     return system_context
 
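To make the reference format concrete, here is the prompt shape the updated function produces, driven inline with hypothetical PMIDs and summaries:

```python
pmids = ["11111111", "22222222"]  # hypothetical PMIDs
summarized_map = {
    "11111111": "Drug X lowered HbA1c in a 12-week randomized trial.",
    "22222222": "A meta-analysis found moderate-quality evidence for therapy Y.",
}

# Same construction as build_system_prompt_with_refs, inlined for illustration:
system_context = (
    "You have access to the following summarized PubMed articles. "
    "When relevant, cite them using their reference label.\n\n"
)
for idx, pmid in enumerate(pmids, start=1):
    system_context += f"[Ref{idx}] (PMID {pmid}): {summarized_map[pmid]}\n\n"
system_context += "Use this contextual info to provide a concise, evidence-based answer."
print(system_context)
```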
@@ -236,12 +225,13 @@ def build_system_prompt_with_refs(pmids, summarized_map):
 #                             STREAMLIT APP                                   #
 ###############################################################################
 def main():
-    st.set_page_config(
+    # From here on, we do NOT call st.set_page_config() again (to avoid the error).
     st.title("Enhanced RAG + PubMed: Production-Ready Medical Insights")
 
     st.markdown("""
-    **Welcome** to an advanced demonstration of **Retrieval-Augmented Generation (RAG)**
-    using PubMed E-utilities, Hugging Face Summarization, and optional LLM calls
+    **Welcome** to an advanced demonstration of **Retrieval-Augmented Generation (RAG)**
+    using PubMed E-utilities, Hugging Face Summarization, and optional LLM calls
+    (OpenAI or Gemini).
 
     This version includes:
     - **Parallel** fetching for multiple PMIDs
@@ -261,7 +251,6 @@ def main():
         height=120
     )
 
-    # Sidebar or columns for parameters
     col1, col2 = st.columns(2)
     with col1:
         max_papers = st.slider(
@@ -284,7 +273,10 @@ def main():
         min_value=256,
         max_value=1024,
         value=512,
-        help=
+        help=(
+            "Larger chunks produce fewer summarization calls, but risk token limits. "
+            "Smaller chunks produce more robust summaries."
+        )
     )
 
     if st.button("Run Enhanced RAG Pipeline"):
@@ -295,12 +287,12 @@ def main():
         # 1. PubMed Search
         with st.spinner("Searching PubMed..."):
             pmids = search_pubmed(query=user_query, max_results=max_papers)
-
+
         if not pmids:
             st.error("No matching PubMed results. Try a different query.")
             return
 
-        # 2. Fetch
+        # 2. Fetch & Summarize
         with st.spinner("Fetching and summarizing abstracts..."):
             abstracts_map = fetch_pubmed_abstracts(pmids)
             summarized_map = {}
@@ -318,8 +310,8 @@ def main():
             st.write(summarized_map[pmid])
             st.write("---")
 
-        # 4. Build
-        st.subheader("Final Answer")
+        # 4. Build Prompt & Generate Final Answer
+        st.subheader("RAG-Enhanced Final Answer")
         system_prompt = build_system_prompt_with_refs(pmids, summarized_map)
 
         with st.spinner("Generating final answer..."):
@@ -331,23 +323,24 @@ def main():
         st.write(answer)
         st.success("RAG Pipeline Complete.")
 
-    # Production
+    # Production notes:
     st.markdown("---")
     st.markdown("""
-    ### Production-Ready Enhancements
+    ### Production-Ready Enhancements
    1. **Vector Databases & Advanced Retrieval**
-       - For large-scale usage, index PubMed articles in a vector DB
+       - For large-scale usage, index PubMed articles in a vector DB to quickly retrieve relevant passages.
    2. **Citation Parsing**
-       - Automatically detect which
+       - Automatically detect which chunk or article contributed to each sentence for more precise referencing.
    3. **Multi-Lingual**
-       - Integrate translation pipelines for non-English queries or abstracts.
+       - Integrate translation pipelines for non-English queries or abstracts to expand global reach.
    4. **Rate Limiting**
-       - Respect NCBI's ~3 requests/sec guideline if
-    5. **
-       -
-    6. **
-       -
+       - Respect NCBI's ~3 requests/sec guideline if scaling up usage.
+    5. **Logging & Monitoring**
+       - In production, set up robust logging/observability for success/failure rates.
+    6. **Security & Privacy**
+       - Currently only uses public info. If patient data is included, ensure HIPAA/GDPR compliance.
     """)
 
+
 if __name__ == "__main__":
     main()
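On point 4 in the list above: NCBI's E-utilities allow roughly 3 requests/second without an API key (about 10/second with one). A minimal client-side throttle the parallel fetcher could call before each request, sketched with only the standard library:

```python
import threading
import time

class RateLimiter:
    """Block callers so at most max_per_sec calls proceed per second."""

    def __init__(self, max_per_sec: float) -> None:
        self._min_interval = 1.0 / max_per_sec
        self._lock = threading.Lock()
        self._last_call = 0.0

    def wait(self) -> None:
        with self._lock:
            now = time.monotonic()
            delay = self._last_call + self._min_interval - now
            if delay > 0:
                time.sleep(delay)
            self._last_call = time.monotonic()

ncbi_limiter = RateLimiter(3.0)
# Inside fetch_one_abstract, call ncbi_limiter.wait() before requests.get(...).
```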