mgbam committed
Commit 520f2f0 · verified · 1 Parent(s): 50a3ce2

Update app.py

Files changed (1)
  1. app.py +168 -208
app.py CHANGED
@@ -26,7 +26,9 @@ import spacy
 import spacy.cli
 import PyPDF2
 
-# Ensure spaCy model is downloaded
+# =========================
+# 1) SpaCy Model Download
+# =========================
 try:
     nlp = spacy.load("en_core_web_sm")
 except OSError:
@@ -34,27 +36,46 @@ except OSError:
     spacy.cli.download("en_core_web_sm")
     nlp = spacy.load("en_core_web_sm")
 
-# Logging
+# =========================
+# 2) Logging Setup
+# =========================
 logger.add("error_logs.log", rotation="1 MB", level="ERROR")
 
-# Load environment variables
+# =========================
+# 3) Environment Vars
+# =========================
 load_dotenv()
 HUGGINGFACE_TOKEN = os.getenv("HF_TOKEN")
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+BIOPORTAL_API_KEY = os.getenv("BIOPORTAL_API_KEY")  # <--- NEW for BioPortal
 ENTREZ_EMAIL = os.getenv("ENTREZ_EMAIL")
 
 if not HUGGINGFACE_TOKEN or not OPENAI_API_KEY:
     logger.error("Missing Hugging Face or OpenAI credentials.")
     raise ValueError("Missing credentials for Hugging Face or OpenAI.")
 
-# Hugging Face & OpenAI
+if not BIOPORTAL_API_KEY:
+    logger.warning("No BioPortal API Key found. BioPortal queries may fail.")
+
+# =========================
+# 4) Hugging Face Login
+# =========================
 login(HUGGINGFACE_TOKEN)
+
+# =========================
+# 5) OpenAI Client
+# =========================
 client = OpenAI(api_key=OPENAI_API_KEY)
 
+# =========================
+# 6) Device (CPU/GPU)
+# =========================
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 logger.info(f"Using device: {device}")
 
+# =========================
+# 7) Models Setup
+# =========================
 MODEL_NAME = "mgbam/bert-base-finetuned-mgbam"
 try:
     model = AutoModelForSequenceClassification.from_pretrained(
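For reference, the configuration this hunk reads could come from a .env file shaped like the following (the variable names are taken from the diff; the values are placeholders, and only BIOPORTAL_API_KEY is optional, since the new check merely logs a warning):

    HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxx
    OPENAI_API_KEY=sk-xxxxxxxxxxxxxxxxxxxx
    BIOPORTAL_API_KEY=xxxxxxxxxxxxxxxx
    ENTREZ_EMAIL=you@example.com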
@@ -67,7 +88,6 @@ except Exception as e:
     logger.error(f"Model load error: {e}")
     raise
 
-# Model: Translation
 try:
     translation_model_name = "Helsinki-NLP/opus-mt-en-fr"
     translation_model = MarianMTModel.from_pretrained(
@@ -85,16 +105,21 @@ LANGUAGE_MAP: Dict[str, Tuple[str, str]] = {
     "French to English": ("fr", "en"),
 }
 
-# API endpoints
+# =========================
+# 8) API Endpoints
+# =========================
 PUBMED_SEARCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
 PUBMED_FETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
 EUROPE_PMC_BASE_URL = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"
+BIOPORTAL_API_BASE = "https://data.bioontology.org"
+CROSSREF_API_URL = "https://api.crossref.org/works"
 
 ##########################################################
 #                    HELPER FUNCTIONS                    #
 ##########################################################
 
-def safe_json_parse(text: str) -> Union[Dict, None]:
+def safe_json_parse(text: str) -> Union[Dict[str, Any], None]:
+    """Parse JSON string into Python dictionary safely."""
     try:
         return json.loads(text)
     except json.JSONDecodeError as e:
@@ -102,7 +127,7 @@ def safe_json_parse(text: str) -> Union[Dict, None]:
         return None
 
 def parse_pubmed_xml(xml_data: str) -> List[Dict[str, Any]]:
-    """Parse PubMed XML and return structured articles."""
+    """Parse PubMed XML into structured articles."""
     root = ET.fromstring(xml_data)
     articles = []
     for article in root.findall(".//PubmedArticle"):
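The body of parse_pubmed_xml continues outside this hunk. As orientation only, a minimal ElementTree sketch of this kind of extraction might look as follows; ArticleTitle and AbstractText are standard PubMed XML elements, not necessarily the exact set app.py reads:

    import xml.etree.ElementTree as ET

    def sketch_parse_pubmed_xml(xml_data: str) -> list:
        # Collect one dict per <PubmedArticle> element.
        root = ET.fromstring(xml_data)
        articles = []
        for article in root.findall(".//PubmedArticle"):
            title = article.findtext(".//ArticleTitle", default="")
            abstract = " ".join(
                (node.text or "") for node in article.findall(".//AbstractText")
            )
            articles.append({"title": title, "abstract": abstract})
        return articles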
@@ -134,6 +159,7 @@ def parse_pubmed_xml(xml_data: str) -> List[Dict[str, Any]]:
 ##########################################################
 
 async def fetch_articles_by_nct_id(nct_id: str) -> Dict[str, Any]:
+    """Europe PMC by NCT ID."""
     params = {"query": nct_id, "format": "json"}
     async with httpx.AsyncClient() as client_http:
         try:
@@ -145,6 +171,7 @@ async def fetch_articles_by_nct_id(nct_id: str) -> Dict[str, Any]:
             return {"error": str(e)}
 
 async def fetch_articles_by_query(query_params: str) -> Dict[str, Any]:
+    """Europe PMC by JSON query."""
     parsed_params = safe_json_parse(query_params)
     if not parsed_params or not isinstance(parsed_params, dict):
         return {"error": "Invalid JSON."}
@@ -160,6 +187,7 @@ async def fetch_articles_by_query(query_params: str) -> Dict[str, Any]:
             return {"error": str(e)}
 
 async def fetch_pubmed_by_query(query_params: str) -> Dict[str, Any]:
+    """PubMed by JSON query."""
     parsed_params = safe_json_parse(query_params)
     if not parsed_params or not isinstance(parsed_params, dict):
         return {"error": "Invalid JSON for PubMed."}
@@ -174,31 +202,34 @@ async def fetch_pubmed_by_query(query_params: str) -> Dict[str, Any]:
 
     async with httpx.AsyncClient() as client_http:
         try:
-            search_response = await client_http.get(PUBMED_SEARCH_URL, params=search_params)
-            search_response.raise_for_status()
-            search_data = search_response.json()
+            # 1) search
+            search_resp = await client_http.get(PUBMED_SEARCH_URL, params=search_params)
+            search_resp.raise_for_status()
+            search_data = search_resp.json()
             id_list = search_data.get("esearchresult", {}).get("idlist", [])
             if not id_list:
                 return {"result": ""}
 
+            # 2) fetch
             fetch_params = {
                 "db": "pubmed",
                 "id": ",".join(id_list),
                 "retmode": "xml",
                 "email": ENTREZ_EMAIL,
             }
-            fetch_response = await client_http.get(PUBMED_FETCH_URL, params=fetch_params)
-            fetch_response.raise_for_status()
-            return {"result": fetch_response.text}
+            fetch_resp = await client_http.get(PUBMED_FETCH_URL, params=fetch_params)
+            fetch_resp.raise_for_status()
+            return {"result": fetch_resp.text}
         except Exception as e:
             logger.error(f"Error fetching PubMed articles: {e}")
             return {"error": str(e)}
 
 async def fetch_crossref_by_query(query_params: str) -> Dict[str, Any]:
+    """Crossref by JSON query."""
     parsed_params = safe_json_parse(query_params)
     if not parsed_params or not isinstance(parsed_params, dict):
         return {"error": "Invalid JSON for Crossref."}
-    CROSSREF_API_URL = "https://api.crossref.org/works"
+
     async with httpx.AsyncClient() as client_http:
         try:
             response = await client_http.get(CROSSREF_API_URL, params=parsed_params)
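The renamed search/fetch pair follows the NCBI E-utilities flow: esearch resolves a text query to a list of PMIDs, then efetch retrieves the XML records for those PMIDs. The keys copied into search_params are built above this hunk, so as a hedged illustration only, a query for the "Fetch PubMed by Query" action might be passed as JSON like:

    query_params = '{"term": "glioblastoma immunotherapy", "retmax": "10"}'

term and retmax are standard esearch parameters; whether app.py forwards exactly these names is not visible in the hunk.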
@@ -209,7 +240,41 @@ async def fetch_crossref_by_query(query_params: str) -> Dict[str, Any]:
             return {"error": str(e)}
 
 ##########################################################
-#                     CORE FUNCTIONS                     #
+#                  BIOPORTAL INTEGRATION                 #
+##########################################################
+
+async def fetch_bioportal_by_query(query_params: str) -> Dict[str, Any]:
+    """
+    Fetch from BioPortal using JSON query parameters.
+    Expects something like: {"q": "cancer"}
+    See: https://data.bioontology.org/documentation
+    """
+    if not BIOPORTAL_API_KEY:
+        return {"error": "No BioPortal API Key set. Cannot fetch BioPortal data."}
+
+    parsed_params = safe_json_parse(query_params)
+    if not parsed_params or not isinstance(parsed_params, dict):
+        return {"error": "Invalid JSON for BioPortal."}
+
+    search_term = parsed_params.get("q", "")
+    if not search_term:
+        return {"error": "No 'q' found in JSON. Provide a search term."}
+
+    url = f"{BIOPORTAL_API_BASE}/search"
+    headers = {"Authorization": f"apikey token={BIOPORTAL_API_KEY}"}
+    req_params = {"q": search_term}
+
+    async with httpx.AsyncClient() as client_http:
+        try:
+            resp = await client_http.get(url, params=req_params, headers=headers)
+            resp.raise_for_status()
+            return resp.json()
+        except Exception as e:
+            logger.error(f"Error fetching BioPortal data: {e}")
+            return {"error": str(e)}
+
+##########################################################
+#                       CORE LOGIC                       #
 ##########################################################
 
 def summarize_text(text: str) -> str:
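A minimal usage sketch for the new fetch_bioportal_by_query, assuming the JSON shape its docstring documents ({"q": ...}) and a BIOPORTAL_API_KEY in the environment:

    import asyncio, json

    result = asyncio.run(fetch_bioportal_by_query(json.dumps({"q": "melanoma"})))
    # On success, hits are listed under "collection" (see the handler below);
    # on failure the dict carries an "error" key, so .get() stays safe.
    for item in result.get("collection", [])[:5]:
        print(item.get("prefLabel"), item.get("@id"))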
@@ -307,187 +372,105 @@ def perform_named_entity_recognition(text: str) -> str:
         return "Named Entity Recognition failed."
 
 ##########################################################
-#                 ENHANCED EDA FUNCTIONS                 #
+#           FILE PARSING (TXT, PDF, CSV, EXCEL)          #
 ##########################################################
 
-def perform_enhanced_eda(df: pd.DataFrame) -> Tuple[str, Optional[alt.Chart], Optional[alt.Chart]]:
-    """Show columns, shape, numeric summary, correlation heatmap, distribution histograms."""
-    try:
-        columns_info = f"Columns: {list(df.columns)}"
-        shape_info = f"Shape: {df.shape[0]} rows x {df.shape[1]} columns"
-
-        with pd.option_context("display.max_colwidth", 200, "display.max_rows", None):
-            describe_info = df.describe(include="all").to_string()
-
-        summary_text = (
-            f"--- Enhanced EDA Summary ---\n"
-            f"{columns_info}\n{shape_info}\n\n"
-            f"Summary Statistics:\n{describe_info}\n"
-        )
-
-        numeric_cols = df.select_dtypes(include="number")
-        corr_chart, distribution_chart = None, None
-
-        # Correlation
-        if numeric_cols.shape[1] >= 2:
-            corr = numeric_cols.corr()
-            corr_melted = corr.reset_index().melt(id_vars="index")
-            corr_melted.columns = ["Feature1", "Feature2", "Correlation"]
-            corr_chart = (
-                alt.Chart(corr_melted)
-                .mark_rect()
-                .encode(
-                    x="Feature1:O",
-                    y="Feature2:O",
-                    color="Correlation:Q",
-                    tooltip=["Feature1", "Feature2", "Correlation"]
-                )
-                .properties(width=400, height=400, title="Correlation Heatmap")
-            )
-
-        # Distribution
-        if numeric_cols.shape[1] >= 1:
-            df_long = numeric_cols.melt(var_name='Column', value_name='Value')
-            distribution_chart = (
-                alt.Chart(df_long)
-                .mark_bar()
-                .encode(
-                    alt.X("Value:Q", bin=alt.Bin(maxbins=30)),
-                    alt.Y('count()'),
-                    alt.Facet('Column:N', columns=2),
-                    tooltip=["Value"]
-                )
-                .properties(
-                    title='Distribution of Numeric Columns',
-                    width=300,
-                    height=200
-                )
-                .interactive()
-            )
-
-        return summary_text, corr_chart, distribution_chart
-
-    except Exception as e:
-        logger.error(f"Enhanced EDA Error: {e}")
-        return f"Enhanced EDA failed: {e}", None, None
+def parse_pdf_file_as_str(file_up: gr.File) -> str:
+    """Read PDF pages with PyPDF2 (local path or in-memory)."""
+    pdf_path = file_up.name
+    if os.path.isfile(pdf_path):
+        with open(pdf_path, "rb") as f:
+            reader = PyPDF2.PdfReader(f)
+            text_content = []
+            for page in reader.pages:
+                text_content.append(page.extract_text() or "")
+            return "\n".join(text_content)
+    else:
+        if not hasattr(file_up, "file"):
+            raise ValueError("Gradio file object has no .file attribute (PDF).")
+        try:
+            pdf_bytes = file_up.file.read()
+            reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
+            text_content = []
+            for page in reader.pages:
+                text_content.append(page.extract_text() or "")
+            return "\n".join(text_content)
+        except Exception as e:
+            raise ValueError(f"PDF parse error: {e}")
 
-##########################################################
-#         PARSING FILES WITH MULTI-ENCODING CSV          #
-##########################################################
+def parse_text_file_as_str(file_up: gr.File) -> str:
+    """Read .txt as UTF-8 from path or in-memory."""
+    path = file_up.name
+    if os.path.isfile(path):
+        with open(path, "rb") as f:
+            return f.read().decode("utf-8", errors="replace")
+    else:
+        if not hasattr(file_up, "file"):
+            raise ValueError("Gradio file object has no .file attribute (TXT).")
+        raw_bytes = file_up.file.read()
+        return raw_bytes.decode("utf-8", errors="replace")
 
 def parse_csv_file_to_df(file_up: gr.File) -> pd.DataFrame:
     """
-    Safely parse a CSV by:
-      1) Checking if the file path on disk exists; if so, read from disk.
-      2) Otherwise, read from .file in memory.
-      3) For each approach, we try multiple encodings:
-         ["utf-8", "utf-8-sig", "latin1", "ISO-8859-1"].
+    Safely parse CSV with multiple encodings.
+      1) Local file path or fallback .file
+      2) Encodings: ["utf-8", "utf-8-sig", "latin1", "ISO-8859-1"]
     """
     path = file_up.name
-    # 1) If the file exists on disk, read from that path
+    # local path
     if os.path.isfile(path):
         for enc in ["utf-8", "utf-8-sig", "latin1", "ISO-8859-1"]:
             try:
-                df = pd.read_csv(path, encoding=enc)
-                return df
+                return pd.read_csv(path, encoding=enc)
             except UnicodeDecodeError:
-                logger.warning(f"CSV parse failed with encoding={enc}. Trying next...")
+                logger.warning(f"CSV parse failed with {enc}, trying next...")
             except Exception as e:
-                logger.warning(f"Unexpected CSV read error with encoding={enc}: {e}")
-        raise ValueError("Could not parse CSV with any tried encodings (disk).")
+                logger.warning(f"Other CSV parse error with {enc}: {e}")
+        raise ValueError("Could not parse CSV from local path with known encodings.")
     else:
-        # 2) Fallback: read from in-memory
         if not hasattr(file_up, "file"):
-            raise ValueError("Gradio file object has no .file attribute. Cannot parse CSV.")
+            raise ValueError("Gradio file object has no .file attribute (CSV).")
         raw_bytes = file_up.file.read()
-
-        # Try multiple encodings on the raw bytes
         for enc in ["utf-8", "utf-8-sig", "latin1", "ISO-8859-1"]:
             try:
-                text_decoded = raw_bytes.decode(enc, errors="replace")
+                txt_decoded = raw_bytes.decode(enc, errors="replace")
                 from io import StringIO
-                df = pd.read_csv(StringIO(text_decoded))
-                return df
+                return pd.read_csv(StringIO(txt_decoded))
             except UnicodeDecodeError:
-                logger.warning(f"In-memory CSV parse failed with encoding={enc}. Trying next...")
+                logger.warning(f"In-memory CSV parse failed with {enc}, trying next...")
             except Exception as e:
-                logger.warning(f"Unexpected in-memory CSV error (enc={enc}): {e}")
-        raise ValueError("Could not parse CSV with any tried encodings (in-memory).")
+                logger.warning(f"In-memory CSV parse error with {enc}: {e}")
+        raise ValueError("Could not parse CSV from memory with known encodings.")
 
 def parse_excel_file_to_df(file_up: gr.File) -> pd.DataFrame:
-    """
-    For .xls or .xlsx:
-      1) If file path exists, read from that path.
-      2) Else read from .file in memory.
-    """
-    import os
+    """Read Excel (.xls/.xlsx) from path or in-memory."""
     excel_path = file_up.name
     if os.path.isfile(excel_path):
         return pd.read_excel(excel_path, engine="openpyxl")
     else:
         if not hasattr(file_up, "file"):
-            raise ValueError("Gradio file object has no .file attribute. Cannot parse Excel.")
+            raise ValueError("Gradio file object has no .file attribute (Excel).")
         try:
            excel_bytes = file_up.file.read()
            return pd.read_excel(io.BytesIO(excel_bytes), engine="openpyxl")
         except Exception as e:
            raise ValueError(f"Excel parse error: {e}")
 
-def parse_pdf_file_as_str(file_up: gr.File) -> str:
-    """
-    For PDFs, read pages with PyPDF2.
-    Similar two-step approach: local path or fallback to memory.
-    """
-    pdf_path = file_up.name
-    if os.path.isfile(pdf_path):
-        with open(pdf_path, "rb") as f:
-            pdf_reader = PyPDF2.PdfReader(f)
-            text_content = []
-            for page in pdf_reader.pages:
-                text_content.append(page.extract_text() or "")
-            return "\n".join(text_content)
-    else:
-        if not hasattr(file_up, "file"):
-            raise ValueError("Gradio file object has no .file attribute. Cannot parse PDF.")
-        try:
-            pdf_bytes = file_up.file.read()
-            reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
-            text_content = []
-            for page in reader.pages:
-                text_content.append(page.extract_text() or "")
-            return "\n".join(text_content)
-        except Exception as e:
-            raise ValueError(f"PDF parse error: {e}")
-
-def parse_text_file_as_str(file_up: gr.File) -> str:
-    """
-    For .txt, do the same path or fallback approach,
-    possibly with multiple encodings if needed.
-    """
-    path = file_up.name
-    if os.path.isfile(path):
-        with open(path, "rb") as f:
-            return f.read().decode("utf-8", errors="replace")
-    else:
-        if not hasattr(file_up, "file"):
-            raise ValueError("Gradio file object has no .file attribute. Cannot parse txt.")
-        raw_bytes = file_up.file.read()
-        return raw_bytes.decode("utf-8", errors="replace")
-
 ##########################################################
 #                    GRADIO APP SETUP                    #
 ##########################################################
 
 with gr.Blocks() as demo:
-    gr.Markdown("# 🩺 Enhanced Clinical Research Assistant with EDA")
+    gr.Markdown("# 🩺 Clinical Research Assistant (No EDA) + BioPortal")
     gr.Markdown("""
     - **Summarize** text (GPT-3.5)
     - **Predict** outcomes (fine-tuned model)
     - **Translate** (English ↔ French)
     - **Named Entity Recognition** (spaCy)
     - **Fetch** from PubMed, Crossref, Europe PMC
+    - **Fetch** from BioPortal (NEW)
     - **Generate** PDF reports
-    - **Enhanced EDA** on CSV/Excel (with fallback encodings)
+    - (EDA Removed)
     """)
 
     with gr.Row():
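The multi-encoding fallback in parse_csv_file_to_df can be exercised on its own. A rough standalone equivalent of the in-memory branch (read_csv_bytes_with_fallback is a hypothetical name, not part of app.py) might look like this; note it decodes strictly, whereas the committed code passes errors="replace", which means UnicodeDecodeError can never actually fire there and the fallback only advances on pandas parse errors:

    import io
    import pandas as pd
    from loguru import logger

    def read_csv_bytes_with_fallback(raw_bytes: bytes) -> pd.DataFrame:
        # Try each encoding until one both decodes and parses cleanly.
        for enc in ["utf-8", "utf-8-sig", "latin1", "ISO-8859-1"]:
            try:
                return pd.read_csv(io.StringIO(raw_bytes.decode(enc)))
            except (UnicodeDecodeError, pd.errors.ParserError):
                logger.warning(f"CSV parse failed with {enc}, trying next...")
        raise ValueError("Could not parse CSV with known encodings.")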
@@ -504,11 +487,11 @@
             "Generate Report",
             "Translate",
             "Perform Named Entity Recognition",
-            "Perform Enhanced EDA",
             "Fetch Clinical Studies",
             "Fetch PubMed Articles (Legacy)",
             "Fetch PubMed by Query",
             "Fetch Crossref by Query",
+            "Fetch BioPortal by Query",  # <-- NEW ACTION
         ],
         label="Select an Action",
     )
@@ -546,24 +529,23 @@
 
         combined_text = txt.strip()
 
-        # If a file is uploaded, parse based on extension
+        # 1) If user uploaded a file, parse basic text from .txt or .pdf
         if file_up is not None:
             file_ext = os.path.splitext(file_up.name)[1].lower()
             try:
                 if file_ext == ".txt":
-                    txt_content = parse_text_file_as_str(file_up)
-                    combined_text += "\n" + txt_content
+                    text_content = parse_text_file_as_str(file_up)
+                    combined_text += "\n" + text_content
                 elif file_ext == ".pdf":
                     pdf_text = parse_pdf_file_as_str(file_up)
                     combined_text += "\n" + pdf_text
-                # For CSV/Excel, we usually parse them *inside* certain actions (EDA, Summarize, etc.)
-                # Because sometimes you want the raw DataFrame, not the text.
+                # CSV/Excel might be parsed in the actions below if needed
             except Exception as e:
                 return f"File parse error: {e}", None, None, None
 
-        # Now handle the action
+        # 2) Action dispatch
         if action == "Summarize":
-            # If CSV or Excel is uploaded, parse into DF and then convert to text
+            # If CSV or Excel is uploaded, parse DataFrame -> text
            if file_up:
                 fx = file_up.name.lower()
                 if fx.endswith(".csv"):
@@ -571,13 +553,13 @@
                         df_csv = parse_csv_file_to_df(file_up)
                         combined_text += "\n" + df_csv.to_csv(index=False)
                     except Exception as e:
-                        return f"CSV parse error for Summarize: {e}", None, None, None
+                        return f"CSV parse error (Summarize): {e}", None, None, None
                 elif fx.endswith((".xls", ".xlsx")):
                     try:
                         df_xl = parse_excel_file_to_df(file_up)
                         combined_text += "\n" + df_xl.to_csv(index=False)
                     except Exception as e:
-                        return f"Excel parse error for Summarize: {e}", None, None, None
+                        return f"Excel parse error (Summarize): {e}", None, None, None
 
             summary = summarize_text(combined_text)
             return summary, None, None, None
@@ -590,13 +572,13 @@
                         df_csv = parse_csv_file_to_df(file_up)
                         combined_text += "\n" + df_csv.to_csv(index=False)
                     except Exception as e:
-                        return f"CSV parse error for Predict: {e}", None, None, None
+                        return f"CSV parse error (Predict): {e}", None, None, None
                 elif fx.endswith((".xls", ".xlsx")):
                     try:
                         df_xl = parse_excel_file_to_df(file_up)
                         combined_text += "\n" + df_xl.to_csv(index=False)
                     except Exception as e:
-                        return f"Excel parse error for Predict: {e}", None, None, None
+                        return f"Excel parse error (Predict): {e}", None, None, None
 
             predictions = predict_outcome(combined_text)
             if isinstance(predictions, dict):
@@ -605,6 +587,7 @@
             return predictions, None, None, None
 
         elif action == "Generate Report":
+            # Merge CSV/Excel if user wants them in the PDF
             if file_up:
                 fx = file_up.name.lower()
                 if fx.endswith(".csv"):
@@ -612,13 +595,13 @@
                         df_csv = parse_csv_file_to_df(file_up)
                         combined_text += "\n" + df_csv.to_csv(index=False)
                     except Exception as e:
-                        return f"CSV parse error for Report: {e}", None, None, None
+                        return f"CSV parse error (Report): {e}", None, None, None
                 elif fx.endswith((".xls", ".xlsx")):
                     try:
                         df_xl = parse_excel_file_to_df(file_up)
                         combined_text += "\n" + df_xl.to_csv(index=False)
                     except Exception as e:
-                        return f"Excel parse error for Report: {e}", None, None, None
+                        return f"Excel parse error (Report): {e}", None, None, None
 
             fp = generate_report(combined_text, report_fn)
             msg = f"Report generated: {fp}" if fp else "Report generation failed."
@@ -632,13 +615,13 @@
                         df_csv = parse_csv_file_to_df(file_up)
                         combined_text += "\n" + df_csv.to_csv(index=False)
                     except Exception as e:
-                        return f"CSV parse error for Translate: {e}", None, None, None
+                        return f"CSV parse error (Translate): {e}", None, None, None
                 elif fx.endswith((".xls", ".xlsx")):
                     try:
                         df_xl = parse_excel_file_to_df(file_up)
                         combined_text += "\n" + df_xl.to_csv(index=False)
                     except Exception as e:
-                        return f"Excel parse error for Translate: {e}", None, None, None
+                        return f"Excel parse error (Translate): {e}", None, None, None
 
             translated = translate_text(combined_text, translation_opt)
             return translated, None, None, None
@@ -651,20 +634,17 @@
                         df_csv = parse_csv_file_to_df(file_up)
                         combined_text += "\n" + df_csv.to_csv(index=False)
                     except Exception as e:
-                        return f"CSV parse error for NER: {e}", None, None, None
+                        return f"CSV parse error (NER): {e}", None, None, None
                 elif fx.endswith((".xls", ".xlsx")):
                     try:
                         df_xl = parse_excel_file_to_df(file_up)
                         combined_text += "\n" + df_xl.to_csv(index=False)
                     except Exception as e:
-                        return f"Excel parse error for NER: {e}", None, None, None
+                        return f"Excel parse error (NER): {e}", None, None, None
 
             ner_result = perform_named_entity_recognition(combined_text)
             return ner_result, None, None, None
 
-        elif action == "Perform Enhanced EDA":
-            return await _action_eda(file_up, txt)
-
         elif action == "Fetch Clinical Studies":
             if nct_id:
                 result = await fetch_articles_by_nct_id(nct_id)
@@ -708,43 +688,23 @@
             )
             return formatted, None, None, None
 
+        elif action == "Fetch BioPortal by Query":
+            bioportal_result = await fetch_bioportal_by_query(query_str)
+            # Typically, the results are in "collection"
+            # See: https://data.bioontology.org/documentation
+            items = bioportal_result.get("collection", [])
+            if not items:
+                return "No BioPortal results found.", None, None, None
+
+            # Format a quick listing
+            formatted = "\n\n".join(
+                f"Label: {item.get('prefLabel')}, ID: {item.get('@id')}"
+                for item in items
+            )
+            return formatted, None, None, None
+
         return "Invalid action.", None, None, None
 
-    async def _action_eda(file_up: Optional[gr.File], raw_text: str) -> Tuple[Optional[str], Optional[Any], Optional[Any], Optional[str]]:
-        """Perform Enhanced EDA on CSV or Excel. If no file, try parsing raw_text as CSV."""
-        if file_up is None and not raw_text.strip():
-            return "No data provided for EDA.", None, None, None
-
-        if file_up:
-            ext = os.path.splitext(file_up.name)[1].lower()
-            if ext == ".csv":
-                try:
-                    df = parse_csv_file_to_df(file_up)
-                    eda_summary, corr_chart, dist_chart = perform_enhanced_eda(df)
-                    return eda_summary, corr_chart, dist_chart, None
-                except Exception as e:
-                    return f"CSV EDA failed: {e}", None, None, None
-            elif ext in [".xls", ".xlsx"]:
-                try:
-                    df = parse_excel_file_to_df(file_up)
-                    eda_summary, corr_chart, dist_chart = perform_enhanced_eda(df)
-                    return eda_summary, corr_chart, dist_chart, None
-                except Exception as e:
-                    return f"Excel EDA failed: {e}", None, None, None
-            else:
-                return "No valid CSV/Excel data for EDA.", None, None, None
-        else:
-            # If no file, maybe user pasted CSV text
-            if "," in raw_text:
-                from io import StringIO
-                try:
-                    df = pd.read_csv(StringIO(raw_text))
-                    eda_summary, corr_chart, dist_chart = perform_enhanced_eda(df)
-                    return eda_summary, corr_chart, dist_chart, None
-                except Exception as e:
-                    return f"Text-based CSV parse error: {e}", None, None, None
-            return "No valid CSV/Excel data found for EDA.", None, None, None
-
     submit_btn.click(
         fn=handle_action,
         inputs=[action, text_input, file_input, translation_option, query_params_input, nct_id_input, report_filename_input, export_format],
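For context on the new action's output: a successful BioPortal /search response is a JSON object whose hits sit under "collection", each carrying fields such as prefLabel and @id (the handler above relies on exactly these two). Schematically:

    bioportal_result = {
        "collection": [
            {"prefLabel": "Melanoma", "@id": "http://purl.bioontology.org/ontology/..."},
        ]
    }
    # The listing rendered by the handler would read:
    #   Label: Melanoma, ID: http://purl.bioontology.org/ontology/...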