mgbam committed on
Commit
d3ccae5
·
verified ·
1 Parent(s): 31be05a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +255 -240
app.py CHANGED
@@ -43,27 +43,18 @@ HUGGINGFACE_TOKEN = os.getenv("HF_TOKEN")
43
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
44
  ENTREZ_EMAIL = os.getenv("ENTREZ_EMAIL")
45
 
46
- # Basic checks
47
  if not HUGGINGFACE_TOKEN or not OPENAI_API_KEY:
48
  logger.error("Missing Hugging Face or OpenAI credentials.")
49
  raise ValueError("Missing credentials for Hugging Face or OpenAI.")
50
 
51
- # API endpoints
52
- PUBMED_SEARCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
53
- PUBMED_FETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
54
- EUROPE_PMC_BASE_URL = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"
55
-
56
- # Log in to Hugging Face
57
  login(HUGGINGFACE_TOKEN)
58
-
59
- # Initialize OpenAI
60
  client = OpenAI(api_key=OPENAI_API_KEY)
61
 
62
- # Device setting
63
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
64
  logger.info(f"Using device: {device}")
65
 
66
- # Model settings
67
  MODEL_NAME = "mgbam/bert-base-finetuned-mgbam"
68
  try:
69
  model = AutoModelForSequenceClassification.from_pretrained(
@@ -76,7 +67,7 @@ except Exception as e:
76
  logger.error(f"Model load error: {e}")
77
  raise
78
 
79
- # Translation model settings
80
  try:
81
  translation_model_name = "Helsinki-NLP/opus-mt-en-fr"
82
  translation_model = MarianMTModel.from_pretrained(
@@ -94,12 +85,16 @@ LANGUAGE_MAP: Dict[str, Tuple[str, str]] = {
94
  "French to English": ("fr", "en"),
95
  }
96
 
97
- ###################################################
98
- # UTILS #
99
- ###################################################
 
 
 
 
 
100
 
101
  def safe_json_parse(text: str) -> Union[Dict, None]:
102
- """Safely parse JSON string into a Python dictionary."""
103
  try:
104
  return json.loads(text)
105
  except json.JSONDecodeError as e:
@@ -107,7 +102,7 @@ def safe_json_parse(text: str) -> Union[Dict, None]:
107
  return None
108
 
109
  def parse_pubmed_xml(xml_data: str) -> List[Dict[str, Any]]:
110
- """Parses PubMed XML data and returns a list of structured articles."""
111
  root = ET.fromstring(xml_data)
112
  articles = []
113
  for article in root.findall(".//PubmedArticle"):
@@ -134,9 +129,9 @@ def parse_pubmed_xml(xml_data: str) -> List[Dict[str, Any]]:
134
  })
135
  return articles
136
 
137
- ###################################################
138
- # ASYNC FETCHES #
139
- ###################################################
140
 
141
  async def fetch_articles_by_nct_id(nct_id: str) -> Dict[str, Any]:
142
  params = {"query": nct_id, "format": "json"}
@@ -213,12 +208,11 @@ async def fetch_crossref_by_query(query_params: str) -> Dict[str, Any]:
213
  logger.error(f"Error fetching Crossref data: {e}")
214
  return {"error": str(e)}
215
 
216
- ###################################################
217
- # CORE LOGIC #
218
- ###################################################
219
 
220
  def summarize_text(text: str) -> str:
221
- """Summarize text using OpenAI."""
222
  if not text.strip():
223
  return "No text provided for summarization."
224
  try:
@@ -234,7 +228,6 @@ def summarize_text(text: str) -> str:
234
  return "Summarization failed."
235
 
236
  def predict_outcome(text: str) -> Union[Dict[str, float], str]:
237
- """Predict outcomes (classification) using a fine-tuned model."""
238
  if not text.strip():
239
  return "No text provided for prediction."
240
  try:
@@ -249,7 +242,6 @@ def predict_outcome(text: str) -> Union[Dict[str, float], str]:
249
  return "Prediction failed."
250
 
251
  def generate_report(text: str, filename: str = "clinical_report.pdf") -> Optional[str]:
252
- """Generate a PDF report from the given text."""
253
  try:
254
  if not text.strip():
255
  logger.warning("No text provided for the report.")
@@ -271,7 +263,6 @@ def generate_report(text: str, filename: str = "clinical_report.pdf") -> Optiona
271
  return None
272
 
273
  def visualize_predictions(predictions: Dict[str, float]) -> Optional[alt.Chart]:
274
- """Visualize model prediction probabilities using Altair."""
275
  try:
276
  data = pd.DataFrame(list(predictions.items()), columns=["Label", "Probability"])
277
  chart = (
@@ -290,7 +281,6 @@ def visualize_predictions(predictions: Dict[str, float]) -> Optional[alt.Chart]:
290
  return None
291
 
292
  def translate_text(text: str, translation_option: str) -> str:
293
- """Translate text between English and French."""
294
  if not text.strip():
295
  return "No text provided for translation."
296
  try:
@@ -304,7 +294,6 @@ def translate_text(text: str, translation_option: str) -> str:
304
  return "Translation failed."
305
 
306
  def perform_named_entity_recognition(text: str) -> str:
307
- """Perform Named Entity Recognition (NER) using spaCy."""
308
  if not text.strip():
309
  return "No text provided for NER."
310
  try:
@@ -317,19 +306,15 @@ def perform_named_entity_recognition(text: str) -> str:
317
  logger.error(f"NER Error: {e}")
318
  return "Named Entity Recognition failed."
319
 
320
- ###################################################
321
- # ENHANCED EDA #
322
- ###################################################
323
 
324
  def perform_enhanced_eda(df: pd.DataFrame) -> Tuple[str, Optional[alt.Chart], Optional[alt.Chart]]:
325
- """
326
- Show columns, shape, numeric summary, correlation heatmap, and distribution histograms.
327
- Returns (text_summary, correlation_chart, distribution_chart).
328
- """
329
  try:
330
  columns_info = f"Columns: {list(df.columns)}"
331
  shape_info = f"Shape: {df.shape[0]} rows x {df.shape[1]} columns"
332
-
333
  with pd.option_context("display.max_colwidth", 200, "display.max_rows", None):
334
  describe_info = df.describe(include="all").to_string()
335
 
@@ -340,7 +325,9 @@ def perform_enhanced_eda(df: pd.DataFrame) -> Tuple[str, Optional[alt.Chart], Op
340
  )
341
 
342
  numeric_cols = df.select_dtypes(include="number")
343
- corr_chart = None
 
 
344
  if numeric_cols.shape[1] >= 2:
345
  corr = numeric_cols.corr()
346
  corr_melted = corr.reset_index().melt(id_vars="index")
@@ -357,7 +344,7 @@ def perform_enhanced_eda(df: pd.DataFrame) -> Tuple[str, Optional[alt.Chart], Op
357
  .properties(width=400, height=400, title="Correlation Heatmap")
358
  )
359
 
360
- distribution_chart = None
361
  if numeric_cols.shape[1] >= 1:
362
  df_long = numeric_cols.melt(var_name='Column', value_name='Value')
363
  distribution_chart = (
@@ -383,83 +370,108 @@ def perform_enhanced_eda(df: pd.DataFrame) -> Tuple[str, Optional[alt.Chart], Op
383
  logger.error(f"Enhanced EDA Error: {e}")
384
  return f"Enhanced EDA failed: {e}", None, None
385
 
386
- ###################################################
387
- # FILE PARSING #
388
- ###################################################
389
 
390
- def parse_text_file(uploaded_file: gr.File) -> str:
391
- """Reads a .txt file as UTF-8 text."""
392
- return uploaded_file.read().decode("utf-8")
 
 
 
 
 
393
 
394
- def parse_csv_file(uploaded_file: gr.File) -> pd.DataFrame:
395
  """
396
- Reads CSV content with possible BOM issues
397
- by trying 'utf-8' and 'utf-8-sig'.
 
 
398
  """
399
- content = uploaded_file.read().decode("utf-8", errors="replace")
400
- # We can attempt to parse with multiple encodings if needed:
401
- # For simplicity, let's just do a fallback approach:
402
- try:
403
- from io import StringIO
404
- df = pd.read_csv(StringIO(content))
405
- return df
406
- except Exception as e:
407
- raise ValueError(f"CSV parse error: {e}")
408
 
409
- def parse_excel_file(uploaded_file: gr.File) -> pd.DataFrame:
410
  """
411
- Parse an Excel file into a pandas DataFrame.
412
- 1) If the path exists, read directly from path.
413
- 2) Else read from uploaded_file.file (in-memory) in binary mode.
414
  """
415
- import pandas as pd
416
  import os
417
-
418
- excel_path = uploaded_file.name
419
- # Try local path first
420
  if os.path.isfile(excel_path):
421
  return pd.read_excel(excel_path, engine="openpyxl")
 
 
 
 
 
 
422
 
423
- # Fall back to reading raw bytes from uploaded_file.file
424
- try:
425
- excel_bytes = uploaded_file.file.read()
426
- return pd.read_excel(io.BytesIO(excel_bytes), engine="openpyxl")
427
- except Exception as e:
428
- raise ValueError(f"Excel parse error: {e}")
429
-
430
- def parse_pdf_file(uploaded_file: gr.File) -> str:
431
- """Reads a PDF file with PyPDF2, extracting text from each page."""
432
- try:
433
- pdf_reader = PyPDF2.PdfReader(uploaded_file)
434
- text_content = []
435
- for page in pdf_reader.pages:
436
- text_content.append(page.extract_text())
437
- return "\n".join(text_content)
438
- except Exception as e:
439
- logger.error(f"PDF parse error: {e}")
440
- return f"Error reading PDF file: {e}"
 
 
 
 
 
 
 
441
 
442
- ###################################################
443
- # GRADIO INTERFACE #
444
- ###################################################
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
445
 
446
  with gr.Blocks() as demo:
447
- gr.Markdown("# Advanced Clinical Research Assistant with Enhanced EDA")
448
  gr.Markdown("""
449
- Welcome to the **Enhanced** AI-Powered Clinical Assistant!
450
- - **Summarize** large blocks of clinical text.
451
- - **Predict** outcomes with a fine-tuned model.
452
- - **Translate** text (English ↔ French).
453
- - **Perform Named Entity Recognition** (spaCy).
454
- - **Fetch** from PubMed, Crossref, Europe PMC.
455
- - **Generate** professional PDF reports.
456
- - **Perform Enhanced EDA** on CSV/Excel data (correlation heatmaps + distribution plots).
457
  """)
458
-
459
- # Inputs
460
  with gr.Row():
461
- text_input = gr.Textbox(label="Input Text", lines=5, placeholder="Enter clinical text or query...")
462
- # We'll rely on .name and .file for the path and file handle
463
  file_input = gr.File(
464
  label="Upload File (txt/csv/xls/xlsx/pdf)",
465
  file_types=[".txt", ".csv", ".xls", ".xlsx", ".pdf"]
@@ -485,127 +497,167 @@ Welcome to the **Enhanced** AI-Powered Clinical Assistant!
485
  label="Translation Option",
486
  value="English to French"
487
  )
488
- query_params_input = gr.Textbox(
489
- label="Query Parameters (JSON Format)",
490
- placeholder='{"term": "cancer", "retmax": "5"}'
491
- )
492
- nct_id_input = gr.Textbox(label="NCT ID for Article Search")
493
- report_filename_input = gr.Textbox(
494
- label="Report Filename",
495
- placeholder="clinical_report.pdf",
496
- value="clinical_report.pdf"
497
- )
498
- export_format = gr.Dropdown(["None", "CSV", "JSON"], label="Export Format")
499
 
500
- # Outputs
501
- output_text = gr.Textbox(label="Output", lines=10)
502
  with gr.Row():
503
- output_chart = gr.Plot(label="Visualization 1")
504
- output_chart2 = gr.Plot(label="Visualization 2")
505
  output_file = gr.File(label="Generated File")
506
 
507
- submit_button = gr.Button("Submit")
508
-
509
  ################################################################
510
- # MAIN HANDLER FUNCTION #
511
  ################################################################
512
-
513
  async def handle_action(
514
  action: str,
515
- text: str,
516
  file_up: gr.File,
517
  translation_opt: str,
518
- query_params: str,
519
  nct_id: str,
520
- report_filename: str,
521
- export_format: str
522
  ) -> Tuple[Optional[str], Optional[Any], Optional[Any], Optional[str]]:
523
 
524
- # 1) Start with user-provided text
525
- combined_text = text.strip()
526
 
527
- # 2) If user uploaded a file, parse it based on extension
528
  if file_up is not None:
529
  file_ext = os.path.splitext(file_up.name)[1].lower()
530
 
531
- if file_ext == ".txt":
532
- file_text = parse_text_file(file_up)
533
- combined_text = (combined_text + "\n" + file_text).strip()
534
 
535
- elif file_ext == ".csv":
536
- # If user chose EDA, we'll parse into DataFrame below
537
- # If we just want to combine text for Summarize, etc., do so:
538
- pass
539
-
540
- elif file_ext in [".xls", ".xlsx"]:
541
- # We'll handle Excel parsing in the EDA step if needed
542
- pass
543
 
544
  elif file_ext == ".pdf":
545
- file_text = parse_pdf_file(file_up)
546
- combined_text = (combined_text + "\n" + file_text).strip()
547
-
548
- ### ACTIONS ###
549
- if action == "Summarize":
550
- if file_up and file_up.name.endswith(".csv"):
551
- # Merge CSV text into combined_text
552
- # in case user wants summarization of the CSV's raw text
553
  try:
554
- df_csv = parse_csv_file(file_up)
555
- # Turn CSV into text
556
- csv_as_text = df_csv.to_csv(index=False)
557
- combined_text = (combined_text + "\n" + csv_as_text).strip()
558
  except Exception as e:
559
- return f"CSV parse error for Summarize: {e}", None, None, None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
560
 
561
- # Summarize the combined text
562
- return summarize_text(combined_text), None, None, None
563
 
564
  elif action == "Predict Outcome":
565
- return _action_predict_outcome(combined_text, file_up)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
566
 
567
  elif action == "Generate Report":
568
- # Add CSV content if needed
569
- if file_up and file_up.name.endswith(".csv"):
570
- try:
571
- df_csv = parse_csv_file(file_up)
572
- combined_text += "\n" + df_csv.to_csv(index=False)
573
- except Exception as e:
574
- logger.error(f"Error reading CSV for report: {e}")
575
- file_path = generate_report(combined_text, filename=report_filename)
576
- msg = f"Report generated: {file_path}" if file_path else "Report generation failed."
577
- return msg, None, None, file_path
 
 
 
 
 
 
 
 
 
578
 
579
  elif action == "Translate":
580
- # Optionally read CSV or PDF text?
581
- if file_up and file_up.name.endswith(".csv"):
582
- try:
583
- df_csv = parse_csv_file(file_up)
584
- combined_text += "\n" + df_csv.to_csv(index=False)
585
- except Exception as e:
586
- return f"CSV parse error for Translate: {e}", None, None, None
 
 
 
 
 
 
 
 
587
  translated = translate_text(combined_text, translation_opt)
588
  return translated, None, None, None
589
 
590
  elif action == "Perform Named Entity Recognition":
591
- # Merge CSV as text if user wants NER on CSV
592
- if file_up and file_up.name.endswith(".csv"):
593
- try:
594
- df_csv = parse_csv_file(file_up)
595
- combined_text += "\n" + df_csv.to_csv(index=False)
596
- except Exception as e:
597
- return f"CSV parse error for NER: {e}", None, None, None
 
 
 
 
 
 
 
 
598
  ner_result = perform_named_entity_recognition(combined_text)
599
  return ner_result, None, None, None
600
 
601
  elif action == "Perform Enhanced EDA":
602
- return await _action_eda(combined_text, file_up, text)
603
 
604
  elif action == "Fetch Clinical Studies":
605
  if nct_id:
606
  result = await fetch_articles_by_nct_id(nct_id)
607
- elif query_params:
608
- result = await fetch_articles_by_query(query_params)
609
  else:
610
  return "Provide either an NCT ID or valid query parameters.", None, None, None
611
 
@@ -620,7 +672,7 @@ Welcome to the **Enhanced** AI-Powered Clinical Assistant!
620
  return formatted_results, None, None, None
621
 
622
  elif action in ["Fetch PubMed Articles (Legacy)", "Fetch PubMed by Query"]:
623
- pubmed_result = await fetch_pubmed_by_query(query_params)
624
  xml_data = pubmed_result.get("result")
625
  if xml_data:
626
  articles = parse_pubmed_xml(xml_data)
@@ -634,7 +686,7 @@ Welcome to the **Enhanced** AI-Powered Clinical Assistant!
634
  return "No articles found or error fetching data.", None, None, None
635
 
636
  elif action == "Fetch Crossref by Query":
637
- crossref_result = await fetch_crossref_by_query(query_params)
638
  items = crossref_result.get("message", {}).get("items", [])
639
  if not items:
640
  return "No results found.", None, None, None
@@ -645,86 +697,49 @@ Welcome to the **Enhanced** AI-Powered Clinical Assistant!
645
  return formatted, None, None, None
646
 
647
  return "Invalid action.", None, None, None
648
-
649
- def _action_predict_outcome(combined_text: str, file_up: gr.File) -> Tuple[Optional[str], Optional[Any], Optional[Any], Optional[str]]:
650
- # If CSV is uploaded, we can merge it into text or do separate logic
651
- if file_up and file_up.name.endswith(".csv"):
652
- try:
653
- df_csv = parse_csv_file(file_up)
654
- # Optionally, merge CSV content into the text to be classified
655
- combined_text_local = combined_text + "\n" + df_csv.to_csv(index=False)
656
- except Exception as e:
657
- return f"CSV parse error for Predict Outcome: {e}", None, None, None
658
- else:
659
- combined_text_local = combined_text
660
-
661
- predictions = predict_outcome(combined_text_local)
662
- if isinstance(predictions, dict):
663
- chart = visualize_predictions(predictions)
664
- return json.dumps(predictions, indent=2), chart, None, None
665
- return predictions, None, None, None
666
 
667
- async def _action_eda(combined_text: str, file_up: Optional[gr.File], raw_text: str) -> Tuple[Optional[str], Optional[Any], Optional[Any], Optional[str]]:
668
  """
669
- Perform Enhanced EDA on a CSV or Excel file if uploaded.
670
- If .csv is present, parse as CSV; if .xls/.xlsx is present, parse as Excel.
671
  """
672
- # Make sure we either have a file or some data in the text
673
- if not file_up and not raw_text.strip():
674
  return "No data provided for EDA.", None, None, None
675
 
676
- if file_up:
677
- file_ext = os.path.splitext(file_up.name)[1].lower()
678
- if file_ext == ".csv":
 
679
  try:
680
- df_csv = parse_csv_file(file_up)
681
- eda_summary, corr_chart, dist_chart = perform_enhanced_eda(df_csv)
682
  return eda_summary, corr_chart, dist_chart, None
683
  except Exception as e:
684
  return f"CSV EDA failed: {e}", None, None, None
685
-
686
- elif file_ext in [".xls", ".xlsx"]:
687
  try:
688
- df_excel = parse_excel_file(file_up)
689
- eda_summary, corr_chart, dist_chart = perform_enhanced_eda(df_excel)
690
  return eda_summary, corr_chart, dist_chart, None
691
  except Exception as e:
692
  return f"Excel EDA failed: {e}", None, None, None
693
-
694
  else:
695
- # EDA not supported for PDF or .txt in this example
696
- return "No valid CSV/Excel data found for EDA.", None, None, None
697
  else:
698
- # If no file, maybe the user pasted CSV into the text box
699
  if "," in raw_text:
700
- # Attempt to parse text as CSV
701
  try:
702
- from io import StringIO
703
- df_csv = pd.read_csv(StringIO(raw_text))
704
- eda_summary, corr_chart, dist_chart = perform_enhanced_eda(df_csv)
705
  return eda_summary, corr_chart, dist_chart, None
706
  except Exception as e:
707
- return f"EDA parse error for pasted CSV: {e}", None, None, None
708
  return "No valid CSV/Excel data found for EDA.", None, None, None
709
 
710
- submit_button.click(
711
- handle_action,
712
- inputs=[
713
- action,
714
- text_input,
715
- file_input,
716
- translation_option,
717
- query_params_input,
718
- nct_id_input,
719
- report_filename_input,
720
- export_format,
721
- ],
722
- outputs=[
723
- output_text,
724
- output_chart,
725
- output_chart2,
726
- output_file,
727
- ],
728
  )
729
 
730
  demo.launch(server_name="0.0.0.0", server_port=7860, share=True)
 
43
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
44
  ENTREZ_EMAIL = os.getenv("ENTREZ_EMAIL")
45
 
 
46
  if not HUGGINGFACE_TOKEN or not OPENAI_API_KEY:
47
  logger.error("Missing Hugging Face or OpenAI credentials.")
48
  raise ValueError("Missing credentials for Hugging Face or OpenAI.")
49
 
50
+ # Hugging Face & OpenAI
 
 
 
 
 
51
  login(HUGGINGFACE_TOKEN)
 
 
52
  client = OpenAI(api_key=OPENAI_API_KEY)
53
 
 
54
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
55
  logger.info(f"Using device: {device}")
56
 
57
+ # Model: Classification
58
  MODEL_NAME = "mgbam/bert-base-finetuned-mgbam"
59
  try:
60
  model = AutoModelForSequenceClassification.from_pretrained(
 
67
  logger.error(f"Model load error: {e}")
68
  raise
69
 
70
+ # Model: Translation
71
  try:
72
  translation_model_name = "Helsinki-NLP/opus-mt-en-fr"
73
  translation_model = MarianMTModel.from_pretrained(
 
85
  "French to English": ("fr", "en"),
86
  }
87
 
88
+ # API endpoints
89
+ PUBMED_SEARCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
90
+ PUBMED_FETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
91
+ EUROPE_PMC_BASE_URL = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"
92
+
93
+ ##########################################################
94
+ # HELPER FUNCTIONS #
95
+ ##########################################################
96
 
97
  def safe_json_parse(text: str) -> Union[Dict, None]:
 
98
  try:
99
  return json.loads(text)
100
  except json.JSONDecodeError as e:
 
102
  return None
103
 
104
  def parse_pubmed_xml(xml_data: str) -> List[Dict[str, Any]]:
105
+ """Parse PubMed XML and return structured articles."""
106
  root = ET.fromstring(xml_data)
107
  articles = []
108
  for article in root.findall(".//PubmedArticle"):
 
129
  })
130
  return articles
131
 
132
+ ##########################################################
133
+ # ASYNC FETCH FUNCTIONS #
134
+ ##########################################################
135
 
136
  async def fetch_articles_by_nct_id(nct_id: str) -> Dict[str, Any]:
137
  params = {"query": nct_id, "format": "json"}
 
208
  logger.error(f"Error fetching Crossref data: {e}")
209
  return {"error": str(e)}
210
 
211
+ ##########################################################
212
+ # CORE FUNCTIONS #
213
+ ##########################################################
214
 
215
  def summarize_text(text: str) -> str:
 
216
  if not text.strip():
217
  return "No text provided for summarization."
218
  try:
 
228
  return "Summarization failed."
229
 
230
  def predict_outcome(text: str) -> Union[Dict[str, float], str]:
 
231
  if not text.strip():
232
  return "No text provided for prediction."
233
  try:
 
242
  return "Prediction failed."
243
 
244
  def generate_report(text: str, filename: str = "clinical_report.pdf") -> Optional[str]:
 
245
  try:
246
  if not text.strip():
247
  logger.warning("No text provided for the report.")
 
263
  return None
264
 
265
  def visualize_predictions(predictions: Dict[str, float]) -> Optional[alt.Chart]:
 
266
  try:
267
  data = pd.DataFrame(list(predictions.items()), columns=["Label", "Probability"])
268
  chart = (
 
281
  return None
282
 
283
  def translate_text(text: str, translation_option: str) -> str:
 
284
  if not text.strip():
285
  return "No text provided for translation."
286
  try:
 
294
  return "Translation failed."
295
 
296
  def perform_named_entity_recognition(text: str) -> str:
 
297
  if not text.strip():
298
  return "No text provided for NER."
299
  try:
 
306
  logger.error(f"NER Error: {e}")
307
  return "Named Entity Recognition failed."
308
 
309
+ ##########################################################
310
+ # ENHANCED EDA FUNCTIONS #
311
+ ##########################################################
312
 
313
  def perform_enhanced_eda(df: pd.DataFrame) -> Tuple[str, Optional[alt.Chart], Optional[alt.Chart]]:
314
+ """Show columns, shape, numeric summary, correlation heatmap, distribution histograms."""
 
 
 
315
  try:
316
  columns_info = f"Columns: {list(df.columns)}"
317
  shape_info = f"Shape: {df.shape[0]} rows x {df.shape[1]} columns"
 
318
  with pd.option_context("display.max_colwidth", 200, "display.max_rows", None):
319
  describe_info = df.describe(include="all").to_string()
320
 
 
325
  )
326
 
327
  numeric_cols = df.select_dtypes(include="number")
328
+ corr_chart, distribution_chart = None, None
329
+
330
+ # Correlation
331
  if numeric_cols.shape[1] >= 2:
332
  corr = numeric_cols.corr()
333
  corr_melted = corr.reset_index().melt(id_vars="index")
 
344
  .properties(width=400, height=400, title="Correlation Heatmap")
345
  )
346
 
347
+ # Distribution
348
  if numeric_cols.shape[1] >= 1:
349
  df_long = numeric_cols.melt(var_name='Column', value_name='Value')
350
  distribution_chart = (
 
370
  logger.error(f"Enhanced EDA Error: {e}")
371
  return f"Enhanced EDA failed: {e}", None, None
372
 
373
+ ##########################################################
374
+ # PARSING FILES WITHOUT .read() ERRORS #
375
+ ##########################################################
376
 
377
def parse_text_file_as_str(file_up: gr.File) -> str:
    """Return the textual contents of an uploaded .txt file.

    Delegates to the generic reader, which prefers the on-disk upload
    path and falls back to the in-memory file handle (older Gradio
    versions don't always persist uploads to disk — TODO confirm).
    """
    return _read_file_contents(file_up)
385
 
386
def parse_csv_file_to_df(file_up: gr.File) -> pd.DataFrame:
    """Parse an uploaded CSV file into a pandas DataFrame.

    Reads the upload via `_read_file_contents` (on-disk path first, then
    the in-memory handle) and decodes with 'utf-8-sig' so a leading
    UTF-8 BOM — common in CSVs exported from Excel — is stripped instead
    of being embedded in the first column name. 'utf-8-sig' decodes
    plain UTF-8 input unchanged, so this is backward compatible.

    Raises:
        Whatever pandas raises on malformed CSV (e.g. ParserError).
    """
    from io import StringIO

    raw_text = _read_file_contents(file_up, encoding="utf-8-sig")
    return pd.read_csv(StringIO(raw_text))
 
 
 
 
398
 
399
def parse_excel_file_to_df(file_up: gr.File) -> pd.DataFrame:
    """Parse an uploaded .xls/.xlsx file into a pandas DataFrame.

    Tries the on-disk path (`file_up.name`) first; if that path does not
    exist (the upload may live only in memory), falls back to reading
    raw bytes from `file_up.file`.

    Raises:
        ValueError: if the in-memory fallback fails to parse.
    """
    # `os` and `io` are already imported at module level (used elsewhere
    # in this file), so the redundant function-local import is dropped.
    excel_path = file_up.name
    if os.path.isfile(excel_path):
        return pd.read_excel(excel_path, engine="openpyxl")
    try:
        raw_bytes = file_up.file.read()
        return pd.read_excel(io.BytesIO(raw_bytes), engine="openpyxl")
    except Exception as e:
        # Chain the cause so the original parse failure stays visible.
        raise ValueError(f"Excel parse error: {e}") from e
415
 
416
def parse_pdf_file_as_str(file_up: gr.File) -> str:
    """Extract text from an uploaded PDF, one page per line group.

    Prefers the on-disk path (`file_up.name`); falls back to reading raw
    bytes from `file_up.file` when the path is not a real file.

    Raises:
        ValueError: if the in-memory fallback cannot be parsed as a PDF.
    """
    def _pages_text(reader) -> str:
        # extract_text() can return None (e.g. image-only pages); map to "".
        return "\n".join(page.extract_text() or "" for page in reader.pages)

    pdf_path = file_up.name
    if os.path.isfile(pdf_path):
        with open(pdf_path, "rb") as f:
            return _pages_text(PyPDF2.PdfReader(f))
    try:
        pdf_bytes = file_up.file.read()
        return _pages_text(PyPDF2.PdfReader(io.BytesIO(pdf_bytes)))
    except Exception as e:
        # Chain the cause so the original parse failure stays visible.
        raise ValueError(f"PDF parse error: {e}") from e
441
 
442
def _read_file_contents(file_up: gr.File, encoding="utf-8") -> str:
    """Read an uploaded .txt/.csv file and decode it to text.

    Reads raw bytes from the on-disk path when it exists, otherwise from
    the in-memory file handle, then decodes with errors="replace" so
    undecodable byte sequences never raise.
    """
    import os

    src = file_up.name
    if not os.path.isfile(src):
        # No file on disk: read straight from the upload's handle.
        raw = file_up.file.read()
    else:
        with open(src, "rb") as fh:
            raw = fh.read()
    return raw.decode(encoding, errors="replace")
456
+
457
+ ##########################################################
458
+ # GRADIO APP SETUP #
459
+ ##########################################################
460
 
461
  with gr.Blocks() as demo:
462
+ gr.Markdown("# 🩺 Enhanced Clinical Research Assistant with EDA")
463
  gr.Markdown("""
464
+ - **Summarize** text (GPT-3.5)
465
+ - **Predict** outcomes (fine-tuned model)
466
+ - **Translate** (English French)
467
+ - **Named Entity Recognition** (spaCy)
468
+ - **Fetch** from PubMed, Crossref, Europe PMC
469
+ - **Generate** PDF reports
470
+ - **Enhanced EDA** on CSV/Excel (correlation, distributions)
 
471
  """)
472
+
 
473
  with gr.Row():
474
+ text_input = gr.Textbox(label="Input Text", lines=5)
 
475
  file_input = gr.File(
476
  label="Upload File (txt/csv/xls/xlsx/pdf)",
477
  file_types=[".txt", ".csv", ".xls", ".xlsx", ".pdf"]
 
497
  label="Translation Option",
498
  value="English to French"
499
  )
500
+ query_params_input = gr.Textbox(label="Query Params (JSON)", placeholder='{"term": "cancer"}')
501
+ nct_id_input = gr.Textbox(label="NCT ID")
502
+ report_filename_input = gr.Textbox(label="Report Filename", value="clinical_report.pdf")
503
+ export_format = gr.Dropdown(choices=["None", "CSV", "JSON"], label="Export Format")
 
 
 
 
 
 
 
504
 
505
+ output_text = gr.Textbox(label="Output", lines=8)
 
506
  with gr.Row():
507
+ output_chart = gr.Plot(label="Chart 1")
508
+ output_chart2 = gr.Plot(label="Chart 2")
509
  output_file = gr.File(label="Generated File")
510
 
511
+ submit_btn = gr.Button("Submit")
512
+
513
  ################################################################
514
+ # MAIN ACTION HANDLER #
515
  ################################################################
 
516
  async def handle_action(
517
  action: str,
518
+ txt: str,
519
  file_up: gr.File,
520
  translation_opt: str,
521
+ query_str: str,
522
  nct_id: str,
523
+ report_fn: str,
524
+ exp_fmt: str
525
  ) -> Tuple[Optional[str], Optional[Any], Optional[Any], Optional[str]]:
526
 
527
+ # Start with user text
528
+ combined_text = txt.strip()
529
 
 
530
  if file_up is not None:
531
  file_ext = os.path.splitext(file_up.name)[1].lower()
532
 
533
+ # For Summaries, NER, etc. we'll just append the file text to 'combined_text'
534
+ # For EDA, we'll parse into a DataFrame
535
+ # Let's do minimal logic here, then handle in each action block.
536
 
537
+ if file_ext == ".txt":
538
+ file_text = _read_file_contents(file_up)
539
+ combined_text += "\n" + file_text
 
 
 
 
 
540
 
541
  elif file_ext == ".pdf":
 
 
 
 
 
 
 
 
542
  try:
543
+ pdf_text = parse_pdf_file_as_str(file_up)
544
+ combined_text += "\n" + pdf_text
 
 
545
  except Exception as e:
546
+ return f"PDF parse error: {e}", None, None, None
547
+
548
+ # Now handle each action:
549
+ if action == "Summarize":
550
+ # If user uploaded CSV or Excel, optionally parse it into text
551
+ if file_up:
552
+ fx = file_up.name.lower()
553
+ if fx.endswith(".csv"):
554
+ try:
555
+ df_csv = parse_csv_file_to_df(file_up)
556
+ csv_as_text = df_csv.to_csv(index=False)
557
+ combined_text += "\n" + csv_as_text
558
+ except Exception as e:
559
+ return f"CSV parse error for Summarize: {e}", None, None, None
560
+ elif fx.endswith((".xls", ".xlsx")):
561
+ try:
562
+ df_xl = parse_excel_file_to_df(file_up)
563
+ excel_as_text = df_xl.to_csv(index=False)
564
+ combined_text += "\n" + excel_as_text
565
+ except Exception as e:
566
+ return f"Excel parse error for Summarize: {e}", None, None, None
567
 
568
+ summary = summarize_text(combined_text)
569
+ return summary, None, None, None
570
 
571
  elif action == "Predict Outcome":
572
+ # Optionally parse CSV/Excel into text
573
+ if file_up:
574
+ fx = file_up.name.lower()
575
+ if fx.endswith(".csv"):
576
+ try:
577
+ df_csv = parse_csv_file_to_df(file_up)
578
+ combined_text += "\n" + df_csv.to_csv(index=False)
579
+ except Exception as e:
580
+ return f"CSV parse error: {e}", None, None, None
581
+ elif fx.endswith((".xls", ".xlsx")):
582
+ try:
583
+ df_xl = parse_excel_file_to_df(file_up)
584
+ combined_text += "\n" + df_xl.to_csv(index=False)
585
+ except Exception as e:
586
+ return f"Excel parse error: {e}", None, None, None
587
+
588
+ predictions = predict_outcome(combined_text)
589
+ if isinstance(predictions, dict):
590
+ chart = visualize_predictions(predictions)
591
+ return json.dumps(predictions, indent=2), chart, None, None
592
+ return predictions, None, None, None
593
 
594
  elif action == "Generate Report":
595
+ # Merge CSV/Excel if needed
596
+ if file_up:
597
+ fx = file_up.name.lower()
598
+ if fx.endswith(".csv"):
599
+ try:
600
+ df_csv = parse_csv_file_to_df(file_up)
601
+ combined_text += "\n" + df_csv.to_csv(index=False)
602
+ except Exception as e:
603
+ return f"CSV parse error for Report: {e}", None, None, None
604
+ elif fx.endswith((".xls", ".xlsx")):
605
+ try:
606
+ df_xl = parse_excel_file_to_df(file_up)
607
+ combined_text += "\n" + df_xl.to_csv(index=False)
608
+ except Exception as e:
609
+ return f"Excel parse error for Report: {e}", None, None, None
610
+
611
+ fp = generate_report(combined_text, report_fn)
612
+ msg = f"Report generated: {fp}" if fp else "Report generation failed."
613
+ return msg, None, None, fp
614
 
615
  elif action == "Translate":
616
+ if file_up:
617
+ fx = file_up.name.lower()
618
+ if fx.endswith(".csv"):
619
+ try:
620
+ df_csv = parse_csv_file_to_df(file_up)
621
+ combined_text += "\n" + df_csv.to_csv(index=False)
622
+ except Exception as e:
623
+ return f"CSV parse error for Translate: {e}", None, None, None
624
+ elif fx.endswith((".xls", ".xlsx")):
625
+ try:
626
+ df_xl = parse_excel_file_to_df(file_up)
627
+ combined_text += "\n" + df_xl.to_csv(index=False)
628
+ except Exception as e:
629
+ return f"Excel parse error for Translate: {e}", None, None, None
630
+
631
  translated = translate_text(combined_text, translation_opt)
632
  return translated, None, None, None
633
 
634
  elif action == "Perform Named Entity Recognition":
635
+ if file_up:
636
+ fx = file_up.name.lower()
637
+ if fx.endswith(".csv"):
638
+ try:
639
+ df_csv = parse_csv_file_to_df(file_up)
640
+ combined_text += "\n" + df_csv.to_csv(index=False)
641
+ except Exception as e:
642
+ return f"CSV parse error for NER: {e}", None, None, None
643
+ elif fx.endswith((".xls", ".xlsx")):
644
+ try:
645
+ df_xl = parse_excel_file_to_df(file_up)
646
+ combined_text += "\n" + df_xl.to_csv(index=False)
647
+ except Exception as e:
648
+ return f"Excel parse error for NER: {e}", None, None, None
649
+
650
  ner_result = perform_named_entity_recognition(combined_text)
651
  return ner_result, None, None, None
652
 
653
  elif action == "Perform Enhanced EDA":
654
+ return await _action_eda(file_up, txt)
655
 
656
  elif action == "Fetch Clinical Studies":
657
  if nct_id:
658
  result = await fetch_articles_by_nct_id(nct_id)
659
+ elif query_str:
660
+ result = await fetch_articles_by_query(query_str)
661
  else:
662
  return "Provide either an NCT ID or valid query parameters.", None, None, None
663
 
 
672
  return formatted_results, None, None, None
673
 
674
  elif action in ["Fetch PubMed Articles (Legacy)", "Fetch PubMed by Query"]:
675
+ pubmed_result = await fetch_pubmed_by_query(query_str)
676
  xml_data = pubmed_result.get("result")
677
  if xml_data:
678
  articles = parse_pubmed_xml(xml_data)
 
686
  return "No articles found or error fetching data.", None, None, None
687
 
688
  elif action == "Fetch Crossref by Query":
689
+ crossref_result = await fetch_crossref_by_query(query_str)
690
  items = crossref_result.get("message", {}).get("items", [])
691
  if not items:
692
  return "No results found.", None, None, None
 
697
  return formatted, None, None, None
698
 
699
  return "Invalid action.", None, None, None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
700
 
701
async def _action_eda(
    file_up: Optional[gr.File],
    raw_text: str,
) -> Tuple[Optional[str], Optional[Any], Optional[Any], Optional[str]]:
    """
    Perform Enhanced EDA on an uploaded CSV/Excel file, or on pasted text.

    Resolution order:
      1. If a file is uploaded, dispatch on its extension (.csv / .xls / .xlsx).
      2. Otherwise, if the raw text contains a comma, treat it as inline CSV.
      3. Otherwise report that no usable data was provided.

    Args:
        file_up: Optional Gradio file upload; only CSV/Excel are supported.
        raw_text: Free text from the input box, used as CSV when no file given.

    Returns:
        A 4-tuple matching the Gradio outputs:
        (summary_or_error_text, correlation_chart, distribution_chart, file_path).
        The chart and file slots are None on every error path.
    """
    if file_up is None and not raw_text.strip():
        return "No data provided for EDA.", None, None, None

    def _run_eda(df: pd.DataFrame) -> Tuple[Optional[str], Optional[Any], Optional[Any], Optional[str]]:
        # Shared success path for all three data sources: summary + two charts,
        # no downloadable file.
        eda_summary, corr_chart, dist_chart = perform_enhanced_eda(df)
        return eda_summary, corr_chart, dist_chart, None

    if file_up is not None:
        ext = os.path.splitext(file_up.name)[1].lower()
        if ext == ".csv":
            try:
                return _run_eda(parse_csv_file_to_df(file_up))
            except Exception as e:
                return f"CSV EDA failed: {e}", None, None, None
        elif ext in (".xls", ".xlsx"):
            try:
                return _run_eda(parse_excel_file_to_df(file_up))
            except Exception as e:
                return f"Excel EDA failed: {e}", None, None, None
        # Unsupported extension.
        return "No valid CSV/Excel data for EDA.", None, None, None

    # No file: heuristically treat pasted text containing a comma as CSV.
    if "," in raw_text:
        from io import StringIO
        try:
            return _run_eda(pd.read_csv(StringIO(raw_text)))
        except Exception as e:
            return f"Text-based CSV parse error: {e}", None, None, None
    return "No valid CSV/Excel data found for EDA.", None, None, None
738
 
739
# Wire the submit button: handle_action dispatches on the selected action and
# always returns a 4-tuple (text, chart, chart2, file) matching the outputs,
# with unused slots set to None.
submit_btn.click(
    fn=handle_action,
    inputs=[action, text_input, file_input, translation_option, query_params_input, nct_id_input, report_filename_input, export_format],
    outputs=[output_text, output_chart, output_chart2, output_file],
)

# Bind to all interfaces so the app is reachable when run inside a container;
# share=True additionally requests a public Gradio share link.
demo.launch(server_name="0.0.0.0", server_port=7860, share=True)