Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -315,6 +315,7 @@ def perform_enhanced_eda(df: pd.DataFrame) -> Tuple[str, Optional[alt.Chart], Op
|
|
315 |
try:
|
316 |
columns_info = f"Columns: {list(df.columns)}"
|
317 |
shape_info = f"Shape: {df.shape[0]} rows x {df.shape[1]} columns"
|
|
|
318 |
with pd.option_context("display.max_colwidth", 200, "display.max_rows", None):
|
319 |
describe_info = df.describe(include="all").to_string()
|
320 |
|
@@ -371,30 +372,47 @@ def perform_enhanced_eda(df: pd.DataFrame) -> Tuple[str, Optional[alt.Chart], Op
|
|
371 |
return f"Enhanced EDA failed: {e}", None, None
|
372 |
|
373 |
##########################################################
|
374 |
-
# PARSING FILES
|
375 |
##########################################################
|
376 |
|
377 |
-
def parse_text_file_as_str(file_up: gr.File) -> str:
|
378 |
-
"""
|
379 |
-
For .txt or .pdf, read them manually.
|
380 |
-
(We'll do PDF in a separate function.)
|
381 |
-
"""
|
382 |
-
# If user has older Gradio that doesn't store .file or .read()
|
383 |
-
# let's do the same approach as CSV:
|
384 |
-
return _read_file_contents(file_up)
|
385 |
-
|
386 |
def parse_csv_file_to_df(file_up: gr.File) -> pd.DataFrame:
|
387 |
"""
|
388 |
-
Safely parse a CSV
|
389 |
-
1)
|
390 |
-
2)
|
391 |
-
|
|
|
392 |
"""
|
393 |
-
|
394 |
-
#
|
395 |
-
|
396 |
-
|
397 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
398 |
|
399 |
def parse_excel_file_to_df(file_up: gr.File) -> pd.DataFrame:
|
400 |
"""
|
@@ -407,19 +425,20 @@ def parse_excel_file_to_df(file_up: gr.File) -> pd.DataFrame:
|
|
407 |
if os.path.isfile(excel_path):
|
408 |
return pd.read_excel(excel_path, engine="openpyxl")
|
409 |
else:
|
|
|
|
|
410 |
try:
|
411 |
-
|
412 |
-
return pd.read_excel(io.BytesIO(
|
413 |
except Exception as e:
|
414 |
raise ValueError(f"Excel parse error: {e}")
|
415 |
|
416 |
def parse_pdf_file_as_str(file_up: gr.File) -> str:
|
417 |
"""
|
418 |
-
For PDFs, read pages with PyPDF2.
|
|
|
419 |
"""
|
420 |
-
import os
|
421 |
pdf_path = file_up.name
|
422 |
-
# If the path is real
|
423 |
if os.path.isfile(pdf_path):
|
424 |
with open(pdf_path, "rb") as f:
|
425 |
pdf_reader = PyPDF2.PdfReader(f)
|
@@ -428,7 +447,8 @@ def parse_pdf_file_as_str(file_up: gr.File) -> str:
|
|
428 |
text_content.append(page.extract_text() or "")
|
429 |
return "\n".join(text_content)
|
430 |
else:
|
431 |
-
|
|
|
432 |
try:
|
433 |
pdf_bytes = file_up.file.read()
|
434 |
reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
|
@@ -439,20 +459,20 @@ def parse_pdf_file_as_str(file_up: gr.File) -> str:
|
|
439 |
except Exception as e:
|
440 |
raise ValueError(f"PDF parse error: {e}")
|
441 |
|
442 |
-
def
|
443 |
"""
|
444 |
-
|
445 |
-
|
446 |
-
2) Else read from file_up.file in memory.
|
447 |
"""
|
448 |
-
import os
|
449 |
path = file_up.name
|
450 |
if os.path.isfile(path):
|
451 |
with open(path, "rb") as f:
|
452 |
-
return f.read().decode(
|
453 |
else:
|
454 |
-
|
455 |
-
|
|
|
|
|
456 |
|
457 |
##########################################################
|
458 |
# GRADIO APP SETUP #
|
@@ -467,7 +487,7 @@ with gr.Blocks() as demo:
|
|
467 |
- **Named Entity Recognition** (spaCy)
|
468 |
- **Fetch** from PubMed, Crossref, Europe PMC
|
469 |
- **Generate** PDF reports
|
470 |
-
- **Enhanced EDA** on CSV/Excel (
|
471 |
""")
|
472 |
|
473 |
with gr.Row():
|
@@ -524,44 +544,38 @@ with gr.Blocks() as demo:
|
|
524 |
exp_fmt: str
|
525 |
) -> Tuple[Optional[str], Optional[Any], Optional[Any], Optional[str]]:
|
526 |
|
527 |
-
# Start with user text
|
528 |
combined_text = txt.strip()
|
529 |
-
|
|
|
530 |
if file_up is not None:
|
531 |
file_ext = os.path.splitext(file_up.name)[1].lower()
|
532 |
-
|
533 |
-
|
534 |
-
|
535 |
-
|
536 |
-
|
537 |
-
if file_ext == ".txt":
|
538 |
-
file_text = _read_file_contents(file_up)
|
539 |
-
combined_text += "\n" + file_text
|
540 |
-
|
541 |
-
elif file_ext == ".pdf":
|
542 |
-
try:
|
543 |
pdf_text = parse_pdf_file_as_str(file_up)
|
544 |
combined_text += "\n" + pdf_text
|
545 |
-
|
546 |
-
|
|
|
|
|
547 |
|
548 |
-
# Now handle
|
549 |
if action == "Summarize":
|
550 |
-
# If
|
551 |
if file_up:
|
552 |
fx = file_up.name.lower()
|
553 |
if fx.endswith(".csv"):
|
554 |
try:
|
555 |
df_csv = parse_csv_file_to_df(file_up)
|
556 |
-
|
557 |
-
combined_text += "\n" + csv_as_text
|
558 |
except Exception as e:
|
559 |
return f"CSV parse error for Summarize: {e}", None, None, None
|
560 |
elif fx.endswith((".xls", ".xlsx")):
|
561 |
try:
|
562 |
df_xl = parse_excel_file_to_df(file_up)
|
563 |
-
|
564 |
-
combined_text += "\n" + excel_as_text
|
565 |
except Exception as e:
|
566 |
return f"Excel parse error for Summarize: {e}", None, None, None
|
567 |
|
@@ -569,7 +583,6 @@ with gr.Blocks() as demo:
|
|
569 |
return summary, None, None, None
|
570 |
|
571 |
elif action == "Predict Outcome":
|
572 |
-
# Optionally parse CSV/Excel into text
|
573 |
if file_up:
|
574 |
fx = file_up.name.lower()
|
575 |
if fx.endswith(".csv"):
|
@@ -577,13 +590,13 @@ with gr.Blocks() as demo:
|
|
577 |
df_csv = parse_csv_file_to_df(file_up)
|
578 |
combined_text += "\n" + df_csv.to_csv(index=False)
|
579 |
except Exception as e:
|
580 |
-
return f"CSV parse error: {e}", None, None, None
|
581 |
elif fx.endswith((".xls", ".xlsx")):
|
582 |
try:
|
583 |
df_xl = parse_excel_file_to_df(file_up)
|
584 |
combined_text += "\n" + df_xl.to_csv(index=False)
|
585 |
except Exception as e:
|
586 |
-
return f"Excel parse error: {e}", None, None, None
|
587 |
|
588 |
predictions = predict_outcome(combined_text)
|
589 |
if isinstance(predictions, dict):
|
@@ -592,7 +605,6 @@ with gr.Blocks() as demo:
|
|
592 |
return predictions, None, None, None
|
593 |
|
594 |
elif action == "Generate Report":
|
595 |
-
# Merge CSV/Excel if needed
|
596 |
if file_up:
|
597 |
fx = file_up.name.lower()
|
598 |
if fx.endswith(".csv"):
|
@@ -699,14 +711,11 @@ with gr.Blocks() as demo:
|
|
699 |
return "Invalid action.", None, None, None
|
700 |
|
701 |
async def _action_eda(file_up: Optional[gr.File], raw_text: str) -> Tuple[Optional[str], Optional[Any], Optional[Any], Optional[str]]:
|
702 |
-
"""
|
703 |
-
Perform Enhanced EDA on CSV or Excel. If no file, try parsing raw_text as CSV.
|
704 |
-
"""
|
705 |
if file_up is None and not raw_text.strip():
|
706 |
return "No data provided for EDA.", None, None, None
|
707 |
|
708 |
-
|
709 |
-
if file_up is not None:
|
710 |
ext = os.path.splitext(file_up.name)[1].lower()
|
711 |
if ext == ".csv":
|
712 |
try:
|
|
|
315 |
try:
|
316 |
columns_info = f"Columns: {list(df.columns)}"
|
317 |
shape_info = f"Shape: {df.shape[0]} rows x {df.shape[1]} columns"
|
318 |
+
|
319 |
with pd.option_context("display.max_colwidth", 200, "display.max_rows", None):
|
320 |
describe_info = df.describe(include="all").to_string()
|
321 |
|
|
|
372 |
return f"Enhanced EDA failed: {e}", None, None
|
373 |
|
374 |
##########################################################
|
375 |
+
# PARSING FILES WITH MULTI-ENCODING CSV #
|
376 |
##########################################################
|
377 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
378 |
def parse_csv_file_to_df(file_up: gr.File) -> pd.DataFrame:
    """
    Parse an uploaded CSV into a DataFrame, falling back across encodings.

    The data is taken from ``file_up.name`` when that path exists on disk,
    otherwise from the in-memory ``file_up.file`` handle. Decoding is strict,
    so an encoding that does not fit the bytes is rejected and the next one
    is tried instead of silently producing replacement characters.

    Args:
        file_up: Uploaded file object exposing ``.name`` (a path string) and,
            for the in-memory fallback, ``.file`` (a readable binary handle).

    Returns:
        The parsed DataFrame.

    Raises:
        ValueError: If the object has no usable source, if the content is not
            valid CSV, or if no encoding could decode it.
    """
    from io import BytesIO

    # "utf-8-sig" decodes UTF-8 with or without a BOM, so plain "utf-8" adds
    # nothing. "latin1" maps every byte value and therefore must come last;
    # "ISO-8859-1" is only an alias of it.
    encodings = ("utf-8-sig", "latin1")

    path = file_up.name
    if os.path.isfile(path):
        origin = "disk"
        label = "CSV"

        def make_source():
            return path
    else:
        if not hasattr(file_up, "file"):
            raise ValueError("Gradio file object has no .file attribute. Cannot parse CSV.")
        origin = "in-memory"
        label = "In-memory CSV"
        handle = file_up.file
        rewind = getattr(handle, "seek", None)
        if callable(rewind):
            try:
                # An earlier consumer may have left the cursor at EOF.
                rewind(0)
            except (OSError, ValueError):
                # Non-seekable stream: read whatever is still available.
                pass
        raw_bytes = handle.read()

        def make_source():
            # A fresh buffer per attempt so every encoding starts at byte 0.
            return BytesIO(raw_bytes)

    for enc in encodings:
        try:
            return pd.read_csv(make_source(), encoding=enc)
        except UnicodeDecodeError:
            logger.warning(f"{label} parse failed with encoding={enc}. Trying next...")
        except Exception as e:
            # Not an encoding problem (malformed or empty CSV): retrying with
            # another codec cannot help, so surface the real cause.
            raise ValueError(f"CSV parse error ({origin}): {e}") from e

    # Only reachable if the encoding list is edited to drop the catch-all.
    raise ValueError(f"Could not parse CSV with any tried encodings ({origin}).")
|
416 |
|
417 |
def parse_excel_file_to_df(file_up: gr.File) -> pd.DataFrame:
|
418 |
"""
|
|
|
425 |
if os.path.isfile(excel_path):
|
426 |
return pd.read_excel(excel_path, engine="openpyxl")
|
427 |
else:
|
428 |
+
if not hasattr(file_up, "file"):
|
429 |
+
raise ValueError("Gradio file object has no .file attribute. Cannot parse Excel.")
|
430 |
try:
|
431 |
+
excel_bytes = file_up.file.read()
|
432 |
+
return pd.read_excel(io.BytesIO(excel_bytes), engine="openpyxl")
|
433 |
except Exception as e:
|
434 |
raise ValueError(f"Excel parse error: {e}")
|
435 |
|
436 |
def parse_pdf_file_as_str(file_up: gr.File) -> str:
|
437 |
"""
|
438 |
+
For PDFs, read pages with PyPDF2.
|
439 |
+
Similar two-step approach: local path or fallback to memory.
|
440 |
"""
|
|
|
441 |
pdf_path = file_up.name
|
|
|
442 |
if os.path.isfile(pdf_path):
|
443 |
with open(pdf_path, "rb") as f:
|
444 |
pdf_reader = PyPDF2.PdfReader(f)
|
|
|
447 |
text_content.append(page.extract_text() or "")
|
448 |
return "\n".join(text_content)
|
449 |
else:
|
450 |
+
if not hasattr(file_up, "file"):
|
451 |
+
raise ValueError("Gradio file object has no .file attribute. Cannot parse PDF.")
|
452 |
try:
|
453 |
pdf_bytes = file_up.file.read()
|
454 |
reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
|
|
|
459 |
except Exception as e:
|
460 |
raise ValueError(f"PDF parse error: {e}")
|
461 |
|
462 |
+
def parse_text_file_as_str(file_up: gr.File) -> str:
    """
    Read an uploaded .txt file and return its contents as text.

    The bytes are taken from ``file_up.name`` when that path exists on disk,
    otherwise from the in-memory ``file_up.file`` handle. They are decoded as
    UTF-8; a leading byte-order mark is dropped and undecodable bytes become
    U+FFFD instead of raising.

    Args:
        file_up: Uploaded file object exposing ``.name`` (a path string) and,
            for the in-memory fallback, ``.file`` (a readable binary handle).

    Returns:
        The decoded text.

    Raises:
        ValueError: If the path does not exist and there is no ``.file``.
    """
    path = file_up.name
    if os.path.isfile(path):
        with open(path, "rb") as f:
            raw_bytes = f.read()
    else:
        if not hasattr(file_up, "file"):
            raise ValueError("Gradio file object has no .file attribute. Cannot parse txt.")
        handle = file_up.file
        rewind = getattr(handle, "seek", None)
        if callable(rewind):
            try:
                # An earlier consumer may have left the cursor at EOF.
                rewind(0)
            except (OSError, ValueError):
                # Non-seekable stream: read whatever is still available.
                pass
        raw_bytes = handle.read()

    # "utf-8-sig" is plain UTF-8 that also strips a leading BOM, which would
    # otherwise end up as a stray U+FEFF at the start of the text.
    return raw_bytes.decode("utf-8-sig", errors="replace")
|
476 |
|
477 |
##########################################################
|
478 |
# GRADIO APP SETUP #
|
|
|
487 |
- **Named Entity Recognition** (spaCy)
|
488 |
- **Fetch** from PubMed, Crossref, Europe PMC
|
489 |
- **Generate** PDF reports
|
490 |
+
- **Enhanced EDA** on CSV/Excel (with fallback encodings)
|
491 |
""")
|
492 |
|
493 |
with gr.Row():
|
|
|
544 |
exp_fmt: str
|
545 |
) -> Tuple[Optional[str], Optional[Any], Optional[Any], Optional[str]]:
|
546 |
|
|
|
547 |
combined_text = txt.strip()
|
548 |
+
|
549 |
+
# If a file is uploaded, parse based on extension
|
550 |
if file_up is not None:
|
551 |
file_ext = os.path.splitext(file_up.name)[1].lower()
|
552 |
+
try:
|
553 |
+
if file_ext == ".txt":
|
554 |
+
txt_content = parse_text_file_as_str(file_up)
|
555 |
+
combined_text += "\n" + txt_content
|
556 |
+
elif file_ext == ".pdf":
|
|
|
|
|
|
|
|
|
|
|
|
|
557 |
pdf_text = parse_pdf_file_as_str(file_up)
|
558 |
combined_text += "\n" + pdf_text
|
559 |
+
# For CSV/Excel, we usually parse them *inside* certain actions (EDA, Summarize, etc.)
|
560 |
+
# Because sometimes you want the raw DataFrame, not the text.
|
561 |
+
except Exception as e:
|
562 |
+
return f"File parse error: {e}", None, None, None
|
563 |
|
564 |
+
# Now handle the action
|
565 |
if action == "Summarize":
|
566 |
+
# If CSV or Excel is uploaded, parse into DF and then convert to text
|
567 |
if file_up:
|
568 |
fx = file_up.name.lower()
|
569 |
if fx.endswith(".csv"):
|
570 |
try:
|
571 |
df_csv = parse_csv_file_to_df(file_up)
|
572 |
+
combined_text += "\n" + df_csv.to_csv(index=False)
|
|
|
573 |
except Exception as e:
|
574 |
return f"CSV parse error for Summarize: {e}", None, None, None
|
575 |
elif fx.endswith((".xls", ".xlsx")):
|
576 |
try:
|
577 |
df_xl = parse_excel_file_to_df(file_up)
|
578 |
+
combined_text += "\n" + df_xl.to_csv(index=False)
|
|
|
579 |
except Exception as e:
|
580 |
return f"Excel parse error for Summarize: {e}", None, None, None
|
581 |
|
|
|
583 |
return summary, None, None, None
|
584 |
|
585 |
elif action == "Predict Outcome":
|
|
|
586 |
if file_up:
|
587 |
fx = file_up.name.lower()
|
588 |
if fx.endswith(".csv"):
|
|
|
590 |
df_csv = parse_csv_file_to_df(file_up)
|
591 |
combined_text += "\n" + df_csv.to_csv(index=False)
|
592 |
except Exception as e:
|
593 |
+
return f"CSV parse error for Predict: {e}", None, None, None
|
594 |
elif fx.endswith((".xls", ".xlsx")):
|
595 |
try:
|
596 |
df_xl = parse_excel_file_to_df(file_up)
|
597 |
combined_text += "\n" + df_xl.to_csv(index=False)
|
598 |
except Exception as e:
|
599 |
+
return f"Excel parse error for Predict: {e}", None, None, None
|
600 |
|
601 |
predictions = predict_outcome(combined_text)
|
602 |
if isinstance(predictions, dict):
|
|
|
605 |
return predictions, None, None, None
|
606 |
|
607 |
elif action == "Generate Report":
|
|
|
608 |
if file_up:
|
609 |
fx = file_up.name.lower()
|
610 |
if fx.endswith(".csv"):
|
|
|
711 |
return "Invalid action.", None, None, None
|
712 |
|
713 |
async def _action_eda(file_up: Optional[gr.File], raw_text: str) -> Tuple[Optional[str], Optional[Any], Optional[Any], Optional[str]]:
|
714 |
+
"""Perform Enhanced EDA on CSV or Excel. If no file, try parsing raw_text as CSV."""
|
|
|
|
|
715 |
if file_up is None and not raw_text.strip():
|
716 |
return "No data provided for EDA.", None, None, None
|
717 |
|
718 |
+
if file_up:
|
|
|
719 |
ext = os.path.splitext(file_up.name)[1].lower()
|
720 |
if ext == ".csv":
|
721 |
try:
|