mgbam committed
Commit 520f2f0 · verified · 1 Parent(s): 50a3ce2

Update app.py

Files changed (1)
  1. app.py +168 -208
app.py CHANGED
@@ -26,7 +26,9 @@ import spacy
 import spacy.cli
 import PyPDF2
 
-# Ensure spaCy model is downloaded
+# =========================
+# 1) SpaCy Model Download
+# =========================
 try:
     nlp = spacy.load("en_core_web_sm")
 except OSError:
@@ -34,27 +36,46 @@ except OSError:
     spacy.cli.download("en_core_web_sm")
     nlp = spacy.load("en_core_web_sm")
 
-# Logging
+# =========================
+# 2) Logging Setup
+# =========================
 logger.add("error_logs.log", rotation="1 MB", level="ERROR")
 
-# Load environment variables
+# =========================
+# 3) Environment Vars
+# =========================
 load_dotenv()
 HUGGINGFACE_TOKEN = os.getenv("HF_TOKEN")
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+BIOPORTAL_API_KEY = os.getenv("BIOPORTAL_API_KEY")  # <--- NEW for BioPortal
 ENTREZ_EMAIL = os.getenv("ENTREZ_EMAIL")
 
 if not HUGGINGFACE_TOKEN or not OPENAI_API_KEY:
     logger.error("Missing Hugging Face or OpenAI credentials.")
     raise ValueError("Missing credentials for Hugging Face or OpenAI.")
 
-# Hugging Face & OpenAI
+if not BIOPORTAL_API_KEY:
+    logger.warning("No BioPortal API Key found. BioPortal queries may fail.")
+
+# =========================
+# 4) Hugging Face Login
+# =========================
 login(HUGGINGFACE_TOKEN)
+
+# =========================
+# 5) OpenAI Client
+# =========================
 client = OpenAI(api_key=OPENAI_API_KEY)
 
+# =========================
+# 6) Device (CPU/GPU)
+# =========================
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 logger.info(f"Using device: {device}")
 
+# =========================
+# 7) Models Setup
+# =========================
 MODEL_NAME = "mgbam/bert-base-finetuned-mgbam"
 try:
     model = AutoModelForSequenceClassification.from_pretrained(
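For reference, the configuration this hunk reads could come from a .env file shaped like the following (the variable names are taken from the diff; the values are placeholders, and only BIOPORTAL_API_KEY is optional, since the new check merely logs a warning):

    HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxx
    OPENAI_API_KEY=sk-xxxxxxxxxxxxxxxxxxxx
    BIOPORTAL_API_KEY=xxxxxxxxxxxxxxxx
    ENTREZ_EMAIL=you@example.com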
@@ -67,7 +88,6 @@ except Exception as e:
     logger.error(f"Model load error: {e}")
     raise
 
-# Model: Translation
 try:
     translation_model_name = "Helsinki-NLP/opus-mt-en-fr"
     translation_model = MarianMTModel.from_pretrained(
@@ -85,16 +105,21 @@ LANGUAGE_MAP: Dict[str, Tuple[str, str]] = {
     "French to English": ("fr", "en"),
 }
 
-# API endpoints
+# =========================
+# 8) API Endpoints
+# =========================
 PUBMED_SEARCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
 PUBMED_FETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
 EUROPE_PMC_BASE_URL = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"
+BIOPORTAL_API_BASE = "https://data.bioontology.org"
+CROSSREF_API_URL = "https://api.crossref.org/works"
 
 ##########################################################
 #                    HELPER FUNCTIONS                    #
 ##########################################################
 
-def safe_json_parse(text: str) -> Union[Dict, None]:
+def safe_json_parse(text: str) -> Union[Dict[str, Any], None]:
+    """Parse JSON string into Python dictionary safely."""
     try:
         return json.loads(text)
     except json.JSONDecodeError as e:
@@ -102,7 +127,7 @@ def safe_json_parse(text: str) -> Union[Dict, None]:
         return None
 
 def parse_pubmed_xml(xml_data: str) -> List[Dict[str, Any]]:
-    """Parse PubMed XML and return structured articles."""
+    """Parse PubMed XML into structured articles."""
     root = ET.fromstring(xml_data)
     articles = []
     for article in root.findall(".//PubmedArticle"):
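The body of parse_pubmed_xml continues outside this hunk. As orientation only, a minimal ElementTree sketch of this kind of extraction might look as follows; ArticleTitle and AbstractText are standard PubMed XML elements, not necessarily the exact set app.py reads:

    import xml.etree.ElementTree as ET

    def sketch_parse_pubmed_xml(xml_data: str) -> list:
        # Collect one dict per <PubmedArticle> element.
        root = ET.fromstring(xml_data)
        articles = []
        for article in root.findall(".//PubmedArticle"):
            title = article.findtext(".//ArticleTitle", default="")
            abstract = " ".join(
                (node.text or "") for node in article.findall(".//AbstractText")
            )
            articles.append({"title": title, "abstract": abstract})
        return articles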
@@ -134,6 +159,7 @@ def parse_pubmed_xml(xml_data: str) -> List[Dict[str, Any]]:
 ##########################################################
 
 async def fetch_articles_by_nct_id(nct_id: str) -> Dict[str, Any]:
+    """Europe PMC by NCT ID."""
     params = {"query": nct_id, "format": "json"}
     async with httpx.AsyncClient() as client_http:
         try:
@@ -145,6 +171,7 @@ async def fetch_articles_by_nct_id(nct_id: str) -> Dict[str, Any]:
             return {"error": str(e)}
 
 async def fetch_articles_by_query(query_params: str) -> Dict[str, Any]:
+    """Europe PMC by JSON query."""
     parsed_params = safe_json_parse(query_params)
     if not parsed_params or not isinstance(parsed_params, dict):
         return {"error": "Invalid JSON."}
@@ -160,6 +187,7 @@ async def fetch_articles_by_query(query_params: str) -> Dict[str, Any]:
             return {"error": str(e)}
 
 async def fetch_pubmed_by_query(query_params: str) -> Dict[str, Any]:
+    """PubMed by JSON query."""
     parsed_params = safe_json_parse(query_params)
     if not parsed_params or not isinstance(parsed_params, dict):
         return {"error": "Invalid JSON for PubMed."}
@@ -174,31 +202,34 @@ async def fetch_pubmed_by_query(query_params: str) -> Dict[str, Any]:
 
     async with httpx.AsyncClient() as client_http:
         try:
-            search_response = await client_http.get(PUBMED_SEARCH_URL, params=search_params)
-            search_response.raise_for_status()
-            search_data = search_response.json()
+            # 1) search
+            search_resp = await client_http.get(PUBMED_SEARCH_URL, params=search_params)
+            search_resp.raise_for_status()
+            search_data = search_resp.json()
             id_list = search_data.get("esearchresult", {}).get("idlist", [])
             if not id_list:
                 return {"result": ""}
 
+            # 2) fetch
             fetch_params = {
                 "db": "pubmed",
                 "id": ",".join(id_list),
                 "retmode": "xml",
                 "email": ENTREZ_EMAIL,
             }
-            fetch_response = await client_http.get(PUBMED_FETCH_URL, params=fetch_params)
-            fetch_response.raise_for_status()
-            return {"result": fetch_response.text}
+            fetch_resp = await client_http.get(PUBMED_FETCH_URL, params=fetch_params)
+            fetch_resp.raise_for_status()
+            return {"result": fetch_resp.text}
         except Exception as e:
             logger.error(f"Error fetching PubMed articles: {e}")
             return {"error": str(e)}
 
 async def fetch_crossref_by_query(query_params: str) -> Dict[str, Any]:
+    """Crossref by JSON query."""
     parsed_params = safe_json_parse(query_params)
     if not parsed_params or not isinstance(parsed_params, dict):
         return {"error": "Invalid JSON for Crossref."}
-    CROSSREF_API_URL = "https://api.crossref.org/works"
+
     async with httpx.AsyncClient() as client_http:
         try:
             response = await client_http.get(CROSSREF_API_URL, params=parsed_params)
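The renamed search/fetch pair follows the NCBI E-utilities flow: esearch resolves a text query to a list of PMIDs, then efetch retrieves the XML records for those PMIDs. The keys copied into search_params are built above this hunk, so as a hedged illustration only, a query for the "Fetch PubMed by Query" action might be passed as JSON like:

    query_params = '{"term": "glioblastoma immunotherapy", "retmax": "10"}'

term and retmax are standard esearch parameters; whether app.py forwards exactly these names is not visible in the hunk.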
@@ -209,7 +240,41 @@ async def fetch_crossref_by_query(query_params: str) -> Dict[str, Any]:
             return {"error": str(e)}
 
 ##########################################################
-#                     CORE FUNCTIONS                     #
+#                  BIOPORTAL INTEGRATION                 #
+##########################################################
+
+async def fetch_bioportal_by_query(query_params: str) -> Dict[str, Any]:
+    """
+    Fetch from BioPortal using JSON query parameters.
+    Expects something like: {"q": "cancer"}
+    See: https://data.bioontology.org/documentation
+    """
+    if not BIOPORTAL_API_KEY:
+        return {"error": "No BioPortal API Key set. Cannot fetch BioPortal data."}
+
+    parsed_params = safe_json_parse(query_params)
+    if not parsed_params or not isinstance(parsed_params, dict):
+        return {"error": "Invalid JSON for BioPortal."}
+
+    search_term = parsed_params.get("q", "")
+    if not search_term:
+        return {"error": "No 'q' found in JSON. Provide a search term."}
+
+    url = f"{BIOPORTAL_API_BASE}/search"
+    headers = {"Authorization": f"apikey token={BIOPORTAL_API_KEY}"}
+    req_params = {"q": search_term}
+
+    async with httpx.AsyncClient() as client_http:
+        try:
+            resp = await client_http.get(url, params=req_params, headers=headers)
+            resp.raise_for_status()
+            return resp.json()
+        except Exception as e:
+            logger.error(f"Error fetching BioPortal data: {e}")
+            return {"error": str(e)}
+
+##########################################################
+#                       CORE LOGIC                       #
 ##########################################################
 
 def summarize_text(text: str) -> str:
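A minimal usage sketch for the new fetch_bioportal_by_query, assuming the JSON shape its docstring documents ({"q": ...}) and a BIOPORTAL_API_KEY in the environment:

    import asyncio, json

    result = asyncio.run(fetch_bioportal_by_query(json.dumps({"q": "melanoma"})))
    # On success, hits are listed under "collection" (see the handler below);
    # on failure the dict carries an "error" key, so .get() stays safe.
    for item in result.get("collection", [])[:5]:
        print(item.get("prefLabel"), item.get("@id"))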
@@ -307,187 +372,105 @@ def perform_named_entity_recognition(text: str) -> str:
         return "Named Entity Recognition failed."
 
 ##########################################################
-#                 ENHANCED EDA FUNCTIONS                 #
+#           FILE PARSING (TXT, PDF, CSV, EXCEL)          #
 ##########################################################
 
-def perform_enhanced_eda(df: pd.DataFrame) -> Tuple[str, Optional[alt.Chart], Optional[alt.Chart]]:
-    """Show columns, shape, numeric summary, correlation heatmap, distribution histograms."""
-    try:
-        columns_info = f"Columns: {list(df.columns)}"
-        shape_info = f"Shape: {df.shape[0]} rows x {df.shape[1]} columns"
-
-        with pd.option_context("display.max_colwidth", 200, "display.max_rows", None):
-            describe_info = df.describe(include="all").to_string()
-
-        summary_text = (
-            f"--- Enhanced EDA Summary ---\n"
-            f"{columns_info}\n{shape_info}\n\n"
-            f"Summary Statistics:\n{describe_info}\n"
-        )
-
-        numeric_cols = df.select_dtypes(include="number")
-        corr_chart, distribution_chart = None, None
-
-        # Correlation
-        if numeric_cols.shape[1] >= 2:
-            corr = numeric_cols.corr()
-            corr_melted = corr.reset_index().melt(id_vars="index")
-            corr_melted.columns = ["Feature1", "Feature2", "Correlation"]
-            corr_chart = (
-                alt.Chart(corr_melted)
-                .mark_rect()
-                .encode(
-                    x="Feature1:O",
-                    y="Feature2:O",
-                    color="Correlation:Q",
-                    tooltip=["Feature1", "Feature2", "Correlation"]
-                )
-                .properties(width=400, height=400, title="Correlation Heatmap")
-            )
-
-        # Distribution
-        if numeric_cols.shape[1] >= 1:
-            df_long = numeric_cols.melt(var_name='Column', value_name='Value')
-            distribution_chart = (
-                alt.Chart(df_long)
-                .mark_bar()
-                .encode(
-                    alt.X("Value:Q", bin=alt.Bin(maxbins=30)),
-                    alt.Y('count()'),
-                    alt.Facet('Column:N', columns=2),
-                    tooltip=["Value"]
-                )
-                .properties(
-                    title='Distribution of Numeric Columns',
-                    width=300,
-                    height=200
-                )
-                .interactive()
-            )
-
-        return summary_text, corr_chart, distribution_chart
-
-    except Exception as e:
-        logger.error(f"Enhanced EDA Error: {e}")
-        return f"Enhanced EDA failed: {e}", None, None
+def parse_pdf_file_as_str(file_up: gr.File) -> str:
+    """Read PDF pages with PyPDF2 (local path or in-memory)."""
+    pdf_path = file_up.name
+    if os.path.isfile(pdf_path):
+        with open(pdf_path, "rb") as f:
+            reader = PyPDF2.PdfReader(f)
+            text_content = []
+            for page in reader.pages:
+                text_content.append(page.extract_text() or "")
+            return "\n".join(text_content)
+    else:
+        if not hasattr(file_up, "file"):
+            raise ValueError("Gradio file object has no .file attribute (PDF).")
+        try:
+            pdf_bytes = file_up.file.read()
+            reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
+            text_content = []
+            for page in reader.pages:
+                text_content.append(page.extract_text() or "")
+            return "\n".join(text_content)
+        except Exception as e:
+            raise ValueError(f"PDF parse error: {e}")
 
-##########################################################
-#         PARSING FILES WITH MULTI-ENCODING CSV          #
-##########################################################
+def parse_text_file_as_str(file_up: gr.File) -> str:
+    """Read .txt as UTF-8 from path or in-memory."""
+    path = file_up.name
+    if os.path.isfile(path):
+        with open(path, "rb") as f:
+            return f.read().decode("utf-8", errors="replace")
+    else:
+        if not hasattr(file_up, "file"):
+            raise ValueError("Gradio file object has no .file attribute (TXT).")
+        raw_bytes = file_up.file.read()
+        return raw_bytes.decode("utf-8", errors="replace")
 
 def parse_csv_file_to_df(file_up: gr.File) -> pd.DataFrame:
     """
-    Safely parse a CSV by:
-      1) Checking if the file path on disk exists; if so, read from disk.
-      2) Otherwise, read from .file in memory.
-      3) For each approach, we try multiple encodings:
-         ["utf-8", "utf-8-sig", "latin1", "ISO-8859-1"].
+    Safely parse CSV with multiple encodings.
+      1) Local file path or fallback .file
+      2) Encodings: ["utf-8", "utf-8-sig", "latin1", "ISO-8859-1"]
     """
     path = file_up.name
-    # 1) If the file exists on disk, read from that path
+    # local path
     if os.path.isfile(path):
         for enc in ["utf-8", "utf-8-sig", "latin1", "ISO-8859-1"]:
             try:
-                df = pd.read_csv(path, encoding=enc)
-                return df
+                return pd.read_csv(path, encoding=enc)
             except UnicodeDecodeError:
-                logger.warning(f"CSV parse failed with encoding={enc}. Trying next...")
+                logger.warning(f"CSV parse failed with {enc}, trying next...")
             except Exception as e:
-                logger.warning(f"Unexpected CSV read error with encoding={enc}: {e}")
-        raise ValueError("Could not parse CSV with any tried encodings (disk).")
+                logger.warning(f"Other CSV parse error with {enc}: {e}")
+        raise ValueError("Could not parse CSV from local path with known encodings.")
     else:
-        # 2) Fallback: read from in-memory
         if not hasattr(file_up, "file"):
-            raise ValueError("Gradio file object has no .file attribute. Cannot parse CSV.")
+            raise ValueError("Gradio file object has no .file attribute (CSV).")
         raw_bytes = file_up.file.read()
-
-        # Try multiple encodings on the raw bytes
         for enc in ["utf-8", "utf-8-sig", "latin1", "ISO-8859-1"]:
             try:
-                text_decoded = raw_bytes.decode(enc, errors="replace")
+                txt_decoded = raw_bytes.decode(enc, errors="replace")
                 from io import StringIO
-                df = pd.read_csv(StringIO(text_decoded))
-                return df
+                return pd.read_csv(StringIO(txt_decoded))
             except UnicodeDecodeError:
-                logger.warning(f"In-memory CSV parse failed with encoding={enc}. Trying next...")
+                logger.warning(f"In-memory CSV parse failed with {enc}, trying next...")
             except Exception as e:
-                logger.warning(f"Unexpected in-memory CSV error (enc={enc}): {e}")
-        raise ValueError("Could not parse CSV with any tried encodings (in-memory).")
+                logger.warning(f"In-memory CSV parse error with {enc}: {e}")
+        raise ValueError("Could not parse CSV from memory with known encodings.")
 
 def parse_excel_file_to_df(file_up: gr.File) -> pd.DataFrame:
-    """
-    For .xls or .xlsx:
-      1) If file path exists, read from that path.
-      2) Else read from .file in memory.
-    """
-    import os
+    """Read Excel (.xls/.xlsx) from path or in-memory."""
     excel_path = file_up.name
     if os.path.isfile(excel_path):
         return pd.read_excel(excel_path, engine="openpyxl")
     else:
         if not hasattr(file_up, "file"):
-            raise ValueError("Gradio file object has no .file attribute. Cannot parse Excel.")
+            raise ValueError("Gradio file object has no .file attribute (Excel).")
         try:
            excel_bytes = file_up.file.read()
            return pd.read_excel(io.BytesIO(excel_bytes), engine="openpyxl")
         except Exception as e:
            raise ValueError(f"Excel parse error: {e}")
 
-def parse_pdf_file_as_str(file_up: gr.File) -> str:
-    """
-    For PDFs, read pages with PyPDF2.
-    Similar two-step approach: local path or fallback to memory.
-    """
-    pdf_path = file_up.name
-    if os.path.isfile(pdf_path):
-        with open(pdf_path, "rb") as f:
-            pdf_reader = PyPDF2.PdfReader(f)
-            text_content = []
-            for page in pdf_reader.pages:
-                text_content.append(page.extract_text() or "")
-            return "\n".join(text_content)
-    else:
-        if not hasattr(file_up, "file"):
-            raise ValueError("Gradio file object has no .file attribute. Cannot parse PDF.")
-        try:
-            pdf_bytes = file_up.file.read()
-            reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
-            text_content = []
-            for page in reader.pages:
-                text_content.append(page.extract_text() or "")
-            return "\n".join(text_content)
-        except Exception as e:
-            raise ValueError(f"PDF parse error: {e}")
-
-def parse_text_file_as_str(file_up: gr.File) -> str:
-    """
-    For .txt, do the same path or fallback approach,
-    possibly with multiple encodings if needed.
-    """
-    path = file_up.name
-    if os.path.isfile(path):
-        with open(path, "rb") as f:
-            return f.read().decode("utf-8", errors="replace")
-    else:
-        if not hasattr(file_up, "file"):
-            raise ValueError("Gradio file object has no .file attribute. Cannot parse txt.")
-        raw_bytes = file_up.file.read()
-        return raw_bytes.decode("utf-8", errors="replace")
-
 ##########################################################
 #                    GRADIO APP SETUP                    #
 ##########################################################
 
 with gr.Blocks() as demo:
-    gr.Markdown("# 🩺 Enhanced Clinical Research Assistant with EDA")
+    gr.Markdown("# 🩺 Clinical Research Assistant (No EDA) + BioPortal")
     gr.Markdown("""
     - **Summarize** text (GPT-3.5)
     - **Predict** outcomes (fine-tuned model)
     - **Translate** (English ↔ French)
     - **Named Entity Recognition** (spaCy)
     - **Fetch** from PubMed, Crossref, Europe PMC
+    - **Fetch** from BioPortal (NEW)
     - **Generate** PDF reports
-    - **Enhanced EDA** on CSV/Excel (with fallback encodings)
+    - (EDA Removed)
     """)
 
     with gr.Row():
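The multi-encoding fallback in parse_csv_file_to_df can be exercised on its own. A rough standalone equivalent of the in-memory branch (read_csv_bytes_with_fallback is a hypothetical name, not part of app.py) might look like this; note it decodes strictly, whereas the committed code passes errors="replace", which means UnicodeDecodeError can never actually fire there and the fallback only advances on pandas parse errors:

    import io
    import pandas as pd
    from loguru import logger

    def read_csv_bytes_with_fallback(raw_bytes: bytes) -> pd.DataFrame:
        # Try each encoding until one both decodes and parses cleanly.
        for enc in ["utf-8", "utf-8-sig", "latin1", "ISO-8859-1"]:
            try:
                return pd.read_csv(io.StringIO(raw_bytes.decode(enc)))
            except (UnicodeDecodeError, pd.errors.ParserError):
                logger.warning(f"CSV parse failed with {enc}, trying next...")
        raise ValueError("Could not parse CSV with known encodings.")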
@@ -504,11 +487,11 @@
             "Generate Report",
             "Translate",
             "Perform Named Entity Recognition",
-            "Perform Enhanced EDA",
             "Fetch Clinical Studies",
             "Fetch PubMed Articles (Legacy)",
             "Fetch PubMed by Query",
             "Fetch Crossref by Query",
+            "Fetch BioPortal by Query",  # <-- NEW ACTION
         ],
         label="Select an Action",
     )
@@ -546,24 +529,23 @@
 
         combined_text = txt.strip()
 
-        # If a file is uploaded, parse based on extension
+        # 1) If user uploaded a file, parse basic text from .txt or .pdf
         if file_up is not None:
             file_ext = os.path.splitext(file_up.name)[1].lower()
             try:
                 if file_ext == ".txt":
-                    txt_content = parse_text_file_as_str(file_up)
-                    combined_text += "\n" + txt_content
+                    text_content = parse_text_file_as_str(file_up)
+                    combined_text += "\n" + text_content
                 elif file_ext == ".pdf":
                     pdf_text = parse_pdf_file_as_str(file_up)
                     combined_text += "\n" + pdf_text
-                # For CSV/Excel, we usually parse them *inside* certain actions (EDA, Summarize, etc.)
-                # Because sometimes you want the raw DataFrame, not the text.
+                # CSV/Excel might be parsed in the actions below if needed
             except Exception as e:
                 return f"File parse error: {e}", None, None, None
 
-        # Now handle the action
+        # 2) Action dispatch
         if action == "Summarize":
-            # If CSV or Excel is uploaded, parse into DF and then convert to text
+            # If CSV or Excel is uploaded, parse DataFrame -> text
            if file_up:
                 fx = file_up.name.lower()
                 if fx.endswith(".csv"):
@@ -571,13 +553,13 @@
                         df_csv = parse_csv_file_to_df(file_up)
                         combined_text += "\n" + df_csv.to_csv(index=False)
                     except Exception as e:
-                        return f"CSV parse error for Summarize: {e}", None, None, None
+                        return f"CSV parse error (Summarize): {e}", None, None, None
                 elif fx.endswith((".xls", ".xlsx")):
                     try:
                         df_xl = parse_excel_file_to_df(file_up)
                         combined_text += "\n" + df_xl.to_csv(index=False)
                     except Exception as e:
-                        return f"Excel parse error for Summarize: {e}", None, None, None
+                        return f"Excel parse error (Summarize): {e}", None, None, None
 
             summary = summarize_text(combined_text)
             return summary, None, None, None
@@ -590,13 +572,13 @@
                         df_csv = parse_csv_file_to_df(file_up)
                         combined_text += "\n" + df_csv.to_csv(index=False)
                     except Exception as e:
-                        return f"CSV parse error for Predict: {e}", None, None, None
+                        return f"CSV parse error (Predict): {e}", None, None, None
                 elif fx.endswith((".xls", ".xlsx")):
                     try:
                         df_xl = parse_excel_file_to_df(file_up)
                         combined_text += "\n" + df_xl.to_csv(index=False)
                     except Exception as e:
-                        return f"Excel parse error for Predict: {e}", None, None, None
+                        return f"Excel parse error (Predict): {e}", None, None, None
 
             predictions = predict_outcome(combined_text)
             if isinstance(predictions, dict):
@@ -605,6 +587,7 @@
             return predictions, None, None, None
 
         elif action == "Generate Report":
+            # Merge CSV/Excel if user wants them in the PDF
             if file_up:
                 fx = file_up.name.lower()
                 if fx.endswith(".csv"):
@@ -612,13 +595,13 @@
                         df_csv = parse_csv_file_to_df(file_up)
                         combined_text += "\n" + df_csv.to_csv(index=False)
                     except Exception as e:
-                        return f"CSV parse error for Report: {e}", None, None, None
+                        return f"CSV parse error (Report): {e}", None, None, None
                 elif fx.endswith((".xls", ".xlsx")):
                     try:
                         df_xl = parse_excel_file_to_df(file_up)
                         combined_text += "\n" + df_xl.to_csv(index=False)
                     except Exception as e:
-                        return f"Excel parse error for Report: {e}", None, None, None
+                        return f"Excel parse error (Report): {e}", None, None, None
 
             fp = generate_report(combined_text, report_fn)
             msg = f"Report generated: {fp}" if fp else "Report generation failed."
@@ -632,13 +615,13 @@
                         df_csv = parse_csv_file_to_df(file_up)
                         combined_text += "\n" + df_csv.to_csv(index=False)
                     except Exception as e:
-                        return f"CSV parse error for Translate: {e}", None, None, None
+                        return f"CSV parse error (Translate): {e}", None, None, None
                 elif fx.endswith((".xls", ".xlsx")):
                     try:
                         df_xl = parse_excel_file_to_df(file_up)
                         combined_text += "\n" + df_xl.to_csv(index=False)
                     except Exception as e:
-                        return f"Excel parse error for Translate: {e}", None, None, None
+                        return f"Excel parse error (Translate): {e}", None, None, None
 
             translated = translate_text(combined_text, translation_opt)
             return translated, None, None, None
@@ -651,20 +634,17 @@
                         df_csv = parse_csv_file_to_df(file_up)
                         combined_text += "\n" + df_csv.to_csv(index=False)
                     except Exception as e:
-                        return f"CSV parse error for NER: {e}", None, None, None
+                        return f"CSV parse error (NER): {e}", None, None, None
                 elif fx.endswith((".xls", ".xlsx")):
                     try:
                         df_xl = parse_excel_file_to_df(file_up)
                         combined_text += "\n" + df_xl.to_csv(index=False)
                     except Exception as e:
-                        return f"Excel parse error for NER: {e}", None, None, None
+                        return f"Excel parse error (NER): {e}", None, None, None
 
             ner_result = perform_named_entity_recognition(combined_text)
             return ner_result, None, None, None
 
-        elif action == "Perform Enhanced EDA":
-            return await _action_eda(file_up, txt)
-
         elif action == "Fetch Clinical Studies":
             if nct_id:
                 result = await fetch_articles_by_nct_id(nct_id)
@@ -708,43 +688,23 @@
             )
             return formatted, None, None, None
 
+        elif action == "Fetch BioPortal by Query":
+            bioportal_result = await fetch_bioportal_by_query(query_str)
+            # Typically, the results are in "collection"
+            # See: https://data.bioontology.org/documentation
+            items = bioportal_result.get("collection", [])
+            if not items:
+                return "No BioPortal results found.", None, None, None
+
+            # Format a quick listing
+            formatted = "\n\n".join(
+                f"Label: {item.get('prefLabel')}, ID: {item.get('@id')}"
+                for item in items
+            )
+            return formatted, None, None, None
+
         return "Invalid action.", None, None, None
 
-    async def _action_eda(file_up: Optional[gr.File], raw_text: str) -> Tuple[Optional[str], Optional[Any], Optional[Any], Optional[str]]:
-        """Perform Enhanced EDA on CSV or Excel. If no file, try parsing raw_text as CSV."""
-        if file_up is None and not raw_text.strip():
-            return "No data provided for EDA.", None, None, None
-
-        if file_up:
-            ext = os.path.splitext(file_up.name)[1].lower()
-            if ext == ".csv":
-                try:
-                    df = parse_csv_file_to_df(file_up)
-                    eda_summary, corr_chart, dist_chart = perform_enhanced_eda(df)
-                    return eda_summary, corr_chart, dist_chart, None
-                except Exception as e:
-                    return f"CSV EDA failed: {e}", None, None, None
-            elif ext in [".xls", ".xlsx"]:
-                try:
-                    df = parse_excel_file_to_df(file_up)
-                    eda_summary, corr_chart, dist_chart = perform_enhanced_eda(df)
-                    return eda_summary, corr_chart, dist_chart, None
-                except Exception as e:
-                    return f"Excel EDA failed: {e}", None, None, None
-            else:
-                return "No valid CSV/Excel data for EDA.", None, None, None
-        else:
-            # If no file, maybe user pasted CSV text
-            if "," in raw_text:
-                from io import StringIO
-                try:
-                    df = pd.read_csv(StringIO(raw_text))
-                    eda_summary, corr_chart, dist_chart = perform_enhanced_eda(df)
-                    return eda_summary, corr_chart, dist_chart, None
-                except Exception as e:
-                    return f"Text-based CSV parse error: {e}", None, None, None
-            return "No valid CSV/Excel data found for EDA.", None, None, None
-
     submit_btn.click(
         fn=handle_action,
         inputs=[action, text_input, file_input, translation_option, query_params_input, nct_id_input, report_filename_input, export_format],
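For context on the new action's output: a successful BioPortal /search response is a JSON object whose hits sit under "collection", each carrying fields such as prefLabel and @id (the handler above relies on exactly these two). Schematically:

    bioportal_result = {
        "collection": [
            {"prefLabel": "Melanoma", "@id": "http://purl.bioontology.org/ontology/..."},
        ]
    }
    # The listing rendered by the handler would read:
    #   Label: Melanoma, ID: http://purl.bioontology.org/ontology/...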