mgbam committed on
Commit
50a3ce2
·
verified ·
1 Parent(s): d3ccae5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +73 -64
app.py CHANGED
@@ -315,6 +315,7 @@ def perform_enhanced_eda(df: pd.DataFrame) -> Tuple[str, Optional[alt.Chart], Op
315
  try:
316
  columns_info = f"Columns: {list(df.columns)}"
317
  shape_info = f"Shape: {df.shape[0]} rows x {df.shape[1]} columns"
 
318
  with pd.option_context("display.max_colwidth", 200, "display.max_rows", None):
319
  describe_info = df.describe(include="all").to_string()
320
 
@@ -371,30 +372,47 @@ def perform_enhanced_eda(df: pd.DataFrame) -> Tuple[str, Optional[alt.Chart], Op
371
  return f"Enhanced EDA failed: {e}", None, None
372
 
373
  ##########################################################
374
- # PARSING FILES WITHOUT .read() ERRORS #
375
  ##########################################################
376
 
377
- def parse_text_file_as_str(file_up: gr.File) -> str:
378
- """
379
- For .txt or .pdf, read them manually.
380
- (We'll do PDF in a separate function.)
381
- """
382
- # If user has older Gradio that doesn't store .file or .read()
383
- # let's do the same approach as CSV:
384
- return _read_file_contents(file_up)
385
-
386
  def parse_csv_file_to_df(file_up: gr.File) -> pd.DataFrame:
387
  """
388
- Safely parse a CSV with fallback approach:
389
- 1) If file path exists, read from disk.
390
- 2) Else read from uploaded_file.file in memory.
391
- Then parse with pandas.
 
392
  """
393
- raw_text = _read_file_contents(file_up)
394
- # Parse with pandas
395
- from io import StringIO
396
- df = pd.read_csv(StringIO(raw_text))
397
- return df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
398
 
399
  def parse_excel_file_to_df(file_up: gr.File) -> pd.DataFrame:
400
  """
@@ -407,19 +425,20 @@ def parse_excel_file_to_df(file_up: gr.File) -> pd.DataFrame:
407
  if os.path.isfile(excel_path):
408
  return pd.read_excel(excel_path, engine="openpyxl")
409
  else:
 
 
410
  try:
411
- raw_bytes = file_up.file.read() # fallback approach
412
- return pd.read_excel(io.BytesIO(raw_bytes), engine="openpyxl")
413
  except Exception as e:
414
  raise ValueError(f"Excel parse error: {e}")
415
 
416
  def parse_pdf_file_as_str(file_up: gr.File) -> str:
417
  """
418
- For PDFs, read pages with PyPDF2.
 
419
  """
420
- import os
421
  pdf_path = file_up.name
422
- # If the path is real
423
  if os.path.isfile(pdf_path):
424
  with open(pdf_path, "rb") as f:
425
  pdf_reader = PyPDF2.PdfReader(f)
@@ -428,7 +447,8 @@ def parse_pdf_file_as_str(file_up: gr.File) -> str:
428
  text_content.append(page.extract_text() or "")
429
  return "\n".join(text_content)
430
  else:
431
- # Fallback read from memory
 
432
  try:
433
  pdf_bytes = file_up.file.read()
434
  reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
@@ -439,20 +459,20 @@ def parse_pdf_file_as_str(file_up: gr.File) -> str:
439
  except Exception as e:
440
  raise ValueError(f"PDF parse error: {e}")
441
 
442
- def _read_file_contents(file_up: gr.File, encoding="utf-8") -> str:
443
  """
444
- Generic fallback approach for .txt or .csv:
445
- 1) If file path is real, read from disk.
446
- 2) Else read from file_up.file in memory.
447
  """
448
- import os
449
  path = file_up.name
450
  if os.path.isfile(path):
451
  with open(path, "rb") as f:
452
- return f.read().decode(encoding, errors="replace")
453
  else:
454
- # fallback
455
- return file_up.file.read().decode(encoding, errors="replace")
 
 
456
 
457
  ##########################################################
458
  # GRADIO APP SETUP #
@@ -467,7 +487,7 @@ with gr.Blocks() as demo:
467
  - **Named Entity Recognition** (spaCy)
468
  - **Fetch** from PubMed, Crossref, Europe PMC
469
  - **Generate** PDF reports
470
- - **Enhanced EDA** on CSV/Excel (correlation, distributions)
471
  """)
472
 
473
  with gr.Row():
@@ -524,44 +544,38 @@ with gr.Blocks() as demo:
524
  exp_fmt: str
525
  ) -> Tuple[Optional[str], Optional[Any], Optional[Any], Optional[str]]:
526
 
527
- # Start with user text
528
  combined_text = txt.strip()
529
-
 
530
  if file_up is not None:
531
  file_ext = os.path.splitext(file_up.name)[1].lower()
532
-
533
- # For Summaries, NER, etc. we'll just append the file text to 'combined_text'
534
- # For EDA, we'll parse into a DataFrame
535
- # Let's do minimal logic here, then handle in each action block.
536
-
537
- if file_ext == ".txt":
538
- file_text = _read_file_contents(file_up)
539
- combined_text += "\n" + file_text
540
-
541
- elif file_ext == ".pdf":
542
- try:
543
  pdf_text = parse_pdf_file_as_str(file_up)
544
  combined_text += "\n" + pdf_text
545
- except Exception as e:
546
- return f"PDF parse error: {e}", None, None, None
 
 
547
 
548
- # Now handle each action:
549
  if action == "Summarize":
550
- # If user uploaded CSV or Excel, optionally parse it into text
551
  if file_up:
552
  fx = file_up.name.lower()
553
  if fx.endswith(".csv"):
554
  try:
555
  df_csv = parse_csv_file_to_df(file_up)
556
- csv_as_text = df_csv.to_csv(index=False)
557
- combined_text += "\n" + csv_as_text
558
  except Exception as e:
559
  return f"CSV parse error for Summarize: {e}", None, None, None
560
  elif fx.endswith((".xls", ".xlsx")):
561
  try:
562
  df_xl = parse_excel_file_to_df(file_up)
563
- excel_as_text = df_xl.to_csv(index=False)
564
- combined_text += "\n" + excel_as_text
565
  except Exception as e:
566
  return f"Excel parse error for Summarize: {e}", None, None, None
567
 
@@ -569,7 +583,6 @@ with gr.Blocks() as demo:
569
  return summary, None, None, None
570
 
571
  elif action == "Predict Outcome":
572
- # Optionally parse CSV/Excel into text
573
  if file_up:
574
  fx = file_up.name.lower()
575
  if fx.endswith(".csv"):
@@ -577,13 +590,13 @@ with gr.Blocks() as demo:
577
  df_csv = parse_csv_file_to_df(file_up)
578
  combined_text += "\n" + df_csv.to_csv(index=False)
579
  except Exception as e:
580
- return f"CSV parse error: {e}", None, None, None
581
  elif fx.endswith((".xls", ".xlsx")):
582
  try:
583
  df_xl = parse_excel_file_to_df(file_up)
584
  combined_text += "\n" + df_xl.to_csv(index=False)
585
  except Exception as e:
586
- return f"Excel parse error: {e}", None, None, None
587
 
588
  predictions = predict_outcome(combined_text)
589
  if isinstance(predictions, dict):
@@ -592,7 +605,6 @@ with gr.Blocks() as demo:
592
  return predictions, None, None, None
593
 
594
  elif action == "Generate Report":
595
- # Merge CSV/Excel if needed
596
  if file_up:
597
  fx = file_up.name.lower()
598
  if fx.endswith(".csv"):
@@ -699,14 +711,11 @@ with gr.Blocks() as demo:
699
  return "Invalid action.", None, None, None
700
 
701
  async def _action_eda(file_up: Optional[gr.File], raw_text: str) -> Tuple[Optional[str], Optional[Any], Optional[Any], Optional[str]]:
702
- """
703
- Perform Enhanced EDA on CSV or Excel. If no file, try parsing raw_text as CSV.
704
- """
705
  if file_up is None and not raw_text.strip():
706
  return "No data provided for EDA.", None, None, None
707
 
708
- # If a file is present
709
- if file_up is not None:
710
  ext = os.path.splitext(file_up.name)[1].lower()
711
  if ext == ".csv":
712
  try:
 
315
  try:
316
  columns_info = f"Columns: {list(df.columns)}"
317
  shape_info = f"Shape: {df.shape[0]} rows x {df.shape[1]} columns"
318
+
319
  with pd.option_context("display.max_colwidth", 200, "display.max_rows", None):
320
  describe_info = df.describe(include="all").to_string()
321
 
 
372
  return f"Enhanced EDA failed: {e}", None, None
373
 
374
  ##########################################################
375
+ # PARSING FILES WITH MULTI-ENCODING CSV #
376
  ##########################################################
377
 
 
 
 
 
 
 
 
 
 
378
  def parse_csv_file_to_df(file_up: gr.File) -> pd.DataFrame:
379
  """
380
+ Safely parse a CSV by:
381
+ 1) Checking if the file path on disk exists; if so, read from disk.
382
+ 2) Otherwise, read from .file in memory.
383
+ 3) For each approach, we try multiple encodings:
384
+ ["utf-8", "utf-8-sig", "latin1", "ISO-8859-1"].
385
  """
386
+ path = file_up.name
387
+ # 1) If the file exists on disk, read from that path
388
+ if os.path.isfile(path):
389
+ for enc in ["utf-8", "utf-8-sig", "latin1", "ISO-8859-1"]:
390
+ try:
391
+ df = pd.read_csv(path, encoding=enc)
392
+ return df
393
+ except UnicodeDecodeError:
394
+ logger.warning(f"CSV parse failed with encoding={enc}. Trying next...")
395
+ except Exception as e:
396
+ logger.warning(f"Unexpected CSV read error with encoding={enc}: {e}")
397
+ raise ValueError("Could not parse CSV with any tried encodings (disk).")
398
+ else:
399
+ # 2) Fallback: read from in-memory
400
+ if not hasattr(file_up, "file"):
401
+ raise ValueError("Gradio file object has no .file attribute. Cannot parse CSV.")
402
+ raw_bytes = file_up.file.read()
403
+
404
+ # Try multiple encodings on the raw bytes
405
+ for enc in ["utf-8", "utf-8-sig", "latin1", "ISO-8859-1"]:
406
+ try:
407
+ text_decoded = raw_bytes.decode(enc, errors="replace")
408
+ from io import StringIO
409
+ df = pd.read_csv(StringIO(text_decoded))
410
+ return df
411
+ except UnicodeDecodeError:
412
+ logger.warning(f"In-memory CSV parse failed with encoding={enc}. Trying next...")
413
+ except Exception as e:
414
+ logger.warning(f"Unexpected in-memory CSV error (enc={enc}): {e}")
415
+ raise ValueError("Could not parse CSV with any tried encodings (in-memory).")
416
 
417
  def parse_excel_file_to_df(file_up: gr.File) -> pd.DataFrame:
418
  """
 
425
  if os.path.isfile(excel_path):
426
  return pd.read_excel(excel_path, engine="openpyxl")
427
  else:
428
+ if not hasattr(file_up, "file"):
429
+ raise ValueError("Gradio file object has no .file attribute. Cannot parse Excel.")
430
  try:
431
+ excel_bytes = file_up.file.read()
432
+ return pd.read_excel(io.BytesIO(excel_bytes), engine="openpyxl")
433
  except Exception as e:
434
  raise ValueError(f"Excel parse error: {e}")
435
 
436
  def parse_pdf_file_as_str(file_up: gr.File) -> str:
437
  """
438
+ For PDFs, read pages with PyPDF2.
439
+ Similar two-step approach: local path or fallback to memory.
440
  """
 
441
  pdf_path = file_up.name
 
442
  if os.path.isfile(pdf_path):
443
  with open(pdf_path, "rb") as f:
444
  pdf_reader = PyPDF2.PdfReader(f)
 
447
  text_content.append(page.extract_text() or "")
448
  return "\n".join(text_content)
449
  else:
450
+ if not hasattr(file_up, "file"):
451
+ raise ValueError("Gradio file object has no .file attribute. Cannot parse PDF.")
452
  try:
453
  pdf_bytes = file_up.file.read()
454
  reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
 
459
  except Exception as e:
460
  raise ValueError(f"PDF parse error: {e}")
461
 
462
def parse_text_file_as_str(file_up: gr.File) -> str:
    """
    Return the content of an uploaded .txt file as a string.

    The bytes are read from disk when ``file_up.name`` is an existing path,
    otherwise from the in-memory ``file_up.file`` handle, and decoded as
    UTF-8. Undecodable bytes are replaced with U+FFFD rather than raising.

    Args:
        file_up: Uploaded file object; must expose ``.name`` and, for the
            in-memory fallback, ``.file``.

    Returns:
        The decoded text, without a leading byte-order mark.

    Raises:
        ValueError: If the path does not exist and the object has no
            ``.file`` attribute.
    """
    path = file_up.name
    if os.path.isfile(path):
        with open(path, "rb") as fh:
            raw = fh.read()
    else:
        if not hasattr(file_up, "file"):
            raise ValueError("Gradio file object has no .file attribute. Cannot parse txt.")
        raw = file_up.file.read()

    if isinstance(raw, str):
        # A text-mode handle already returns decoded text; only drop a BOM.
        return raw.lstrip("\ufeff")

    # "utf-8-sig" decodes plain UTF-8 as well, but also strips a leading BOM
    # so it does not leak into the text passed on to downstream processing.
    return raw.decode("utf-8-sig", errors="replace")
476
 
477
  ##########################################################
478
  # GRADIO APP SETUP #
 
487
  - **Named Entity Recognition** (spaCy)
488
  - **Fetch** from PubMed, Crossref, Europe PMC
489
  - **Generate** PDF reports
490
+ - **Enhanced EDA** on CSV/Excel (with fallback encodings)
491
  """)
492
 
493
  with gr.Row():
 
544
  exp_fmt: str
545
  ) -> Tuple[Optional[str], Optional[Any], Optional[Any], Optional[str]]:
546
 
 
547
  combined_text = txt.strip()
548
+
549
+ # If a file is uploaded, parse based on extension
550
  if file_up is not None:
551
  file_ext = os.path.splitext(file_up.name)[1].lower()
552
+ try:
553
+ if file_ext == ".txt":
554
+ txt_content = parse_text_file_as_str(file_up)
555
+ combined_text += "\n" + txt_content
556
+ elif file_ext == ".pdf":
 
 
 
 
 
 
557
  pdf_text = parse_pdf_file_as_str(file_up)
558
  combined_text += "\n" + pdf_text
559
+ # For CSV/Excel, we usually parse them *inside* certain actions (EDA, Summarize, etc.)
560
+ # Because sometimes you want the raw DataFrame, not the text.
561
+ except Exception as e:
562
+ return f"File parse error: {e}", None, None, None
563
 
564
+ # Now handle the action
565
  if action == "Summarize":
566
+ # If CSV or Excel is uploaded, parse into DF and then convert to text
567
  if file_up:
568
  fx = file_up.name.lower()
569
  if fx.endswith(".csv"):
570
  try:
571
  df_csv = parse_csv_file_to_df(file_up)
572
+ combined_text += "\n" + df_csv.to_csv(index=False)
 
573
  except Exception as e:
574
  return f"CSV parse error for Summarize: {e}", None, None, None
575
  elif fx.endswith((".xls", ".xlsx")):
576
  try:
577
  df_xl = parse_excel_file_to_df(file_up)
578
+ combined_text += "\n" + df_xl.to_csv(index=False)
 
579
  except Exception as e:
580
  return f"Excel parse error for Summarize: {e}", None, None, None
581
 
 
583
  return summary, None, None, None
584
 
585
  elif action == "Predict Outcome":
 
586
  if file_up:
587
  fx = file_up.name.lower()
588
  if fx.endswith(".csv"):
 
590
  df_csv = parse_csv_file_to_df(file_up)
591
  combined_text += "\n" + df_csv.to_csv(index=False)
592
  except Exception as e:
593
+ return f"CSV parse error for Predict: {e}", None, None, None
594
  elif fx.endswith((".xls", ".xlsx")):
595
  try:
596
  df_xl = parse_excel_file_to_df(file_up)
597
  combined_text += "\n" + df_xl.to_csv(index=False)
598
  except Exception as e:
599
+ return f"Excel parse error for Predict: {e}", None, None, None
600
 
601
  predictions = predict_outcome(combined_text)
602
  if isinstance(predictions, dict):
 
605
  return predictions, None, None, None
606
 
607
  elif action == "Generate Report":
 
608
  if file_up:
609
  fx = file_up.name.lower()
610
  if fx.endswith(".csv"):
 
711
  return "Invalid action.", None, None, None
712
 
713
  async def _action_eda(file_up: Optional[gr.File], raw_text: str) -> Tuple[Optional[str], Optional[Any], Optional[Any], Optional[str]]:
714
+ """Perform Enhanced EDA on CSV or Excel. If no file, try parsing raw_text as CSV."""
 
 
715
  if file_up is None and not raw_text.strip():
716
  return "No data provided for EDA.", None, None, None
717
 
718
+ if file_up:
 
719
  ext = os.path.splitext(file_up.name)[1].lower()
720
  if ext == ".csv":
721
  try: