Spaces:

mgbam
/

CraAssitant

Runtime error

App Files Files Community

mgbam committed on Jan 20

Commit

50a3ce2

verified ·

1 Parent(s): d3ccae5

Update app.py

Browse files

Files changed (1) hide show

app.py +73 -64

app.py CHANGED Viewed

@@ -315,6 +315,7 @@ def perform_enhanced_eda(df: pd.DataFrame) -> Tuple[str, Optional[alt.Chart], Op
     try:
         columns_info = f"Columns: {list(df.columns)}"
         shape_info = f"Shape: {df.shape[0]} rows x {df.shape[1]} columns"
         with pd.option_context("display.max_colwidth", 200, "display.max_rows", None):
             describe_info = df.describe(include="all").to_string()
@@ -371,30 +372,47 @@ def perform_enhanced_eda(df: pd.DataFrame) -> Tuple[str, Optional[alt.Chart], Op
         return f"Enhanced EDA failed: {e}", None, None
 ##########################################################
-#         PARSING FILES WITHOUT .read() ERRORS           #
 ##########################################################
-def parse_text_file_as_str(file_up: gr.File) -> str:
-    """
-    For .txt or .pdf, read them manually.
-    (We'll do PDF in a separate function.)
-    """
-    # If user has older Gradio that doesn't store .file or .read()
-    # let's do the same approach as CSV:
-    return _read_file_contents(file_up)
 def parse_csv_file_to_df(file_up: gr.File) -> pd.DataFrame:
     """
-    Safely parse a CSV with fallback approach:
-      1) If file path exists, read from disk.
-      2) Else read from uploaded_file.file in memory.
-    Then parse with pandas.
     """
-    raw_text = _read_file_contents(file_up)
-    # Parse with pandas
-    from io import StringIO
-    df = pd.read_csv(StringIO(raw_text))
-    return df
 def parse_excel_file_to_df(file_up: gr.File) -> pd.DataFrame:
     """
@@ -407,19 +425,20 @@ def parse_excel_file_to_df(file_up: gr.File) -> pd.DataFrame:
     if os.path.isfile(excel_path):
         return pd.read_excel(excel_path, engine="openpyxl")
     else:
         try:
-            raw_bytes = file_up.file.read()  # fallback approach
-            return pd.read_excel(io.BytesIO(raw_bytes), engine="openpyxl")
         except Exception as e:
             raise ValueError(f"Excel parse error: {e}")
 def parse_pdf_file_as_str(file_up: gr.File) -> str:
     """
-    For PDFs, read pages with PyPDF2.
     """
-    import os
     pdf_path = file_up.name
-    # If the path is real
     if os.path.isfile(pdf_path):
         with open(pdf_path, "rb") as f:
             pdf_reader = PyPDF2.PdfReader(f)
@@ -428,7 +447,8 @@ def parse_pdf_file_as_str(file_up: gr.File) -> str:
                 text_content.append(page.extract_text() or "")
             return "\n".join(text_content)
     else:
-        # Fallback read from memory
         try:
             pdf_bytes = file_up.file.read()
             reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
@@ -439,20 +459,20 @@ def parse_pdf_file_as_str(file_up: gr.File) -> str:
         except Exception as e:
             raise ValueError(f"PDF parse error: {e}")
-def _read_file_contents(file_up: gr.File, encoding="utf-8") -> str:
     """
-    Generic fallback approach for .txt or .csv:
-      1) If file path is real, read from disk.
-      2) Else read from file_up.file in memory.
     """
-    import os
     path = file_up.name
     if os.path.isfile(path):
         with open(path, "rb") as f:
-            return f.read().decode(encoding, errors="replace")
     else:
-        # fallback
-        return file_up.file.read().decode(encoding, errors="replace")
 ##########################################################
 #                   GRADIO APP SETUP                     #
@@ -467,7 +487,7 @@ with gr.Blocks() as demo:
 - **Named Entity Recognition** (spaCy)
 - **Fetch** from PubMed, Crossref, Europe PMC
 - **Generate** PDF reports
-- **Enhanced EDA** on CSV/Excel (correlation, distributions)
 """)
     with gr.Row():
@@ -524,44 +544,38 @@ with gr.Blocks() as demo:
         exp_fmt: str
     ) -> Tuple[Optional[str], Optional[Any], Optional[Any], Optional[str]]:
-        # Start with user text
         combined_text = txt.strip()
         if file_up is not None:
             file_ext = os.path.splitext(file_up.name)[1].lower()
-            # For Summaries, NER, etc. we'll just append the file text to 'combined_text'
-            # For EDA, we'll parse into a DataFrame
-            # Let's do minimal logic here, then handle in each action block.
-            if file_ext == ".txt":
-                file_text = _read_file_contents(file_up)
-                combined_text += "\n" + file_text
-            elif file_ext == ".pdf":
-                try:
                     pdf_text = parse_pdf_file_as_str(file_up)
                     combined_text += "\n" + pdf_text
-                except Exception as e:
-                    return f"PDF parse error: {e}", None, None, None
-        # Now handle each action:
         if action == "Summarize":
-            # If user uploaded CSV or Excel, optionally parse it into text
             if file_up:
                 fx = file_up.name.lower()
                 if fx.endswith(".csv"):
                     try:
                         df_csv = parse_csv_file_to_df(file_up)
-                        csv_as_text = df_csv.to_csv(index=False)
-                        combined_text += "\n" + csv_as_text
                     except Exception as e:
                         return f"CSV parse error for Summarize: {e}", None, None, None
                 elif fx.endswith((".xls", ".xlsx")):
                     try:
                         df_xl = parse_excel_file_to_df(file_up)
-                        excel_as_text = df_xl.to_csv(index=False)
-                        combined_text += "\n" + excel_as_text
                     except Exception as e:
                         return f"Excel parse error for Summarize: {e}", None, None, None
@@ -569,7 +583,6 @@ with gr.Blocks() as demo:
             return summary, None, None, None
         elif action == "Predict Outcome":
-            # Optionally parse CSV/Excel into text
             if file_up:
                 fx = file_up.name.lower()
                 if fx.endswith(".csv"):
@@ -577,13 +590,13 @@ with gr.Blocks() as demo:
                         df_csv = parse_csv_file_to_df(file_up)
                         combined_text += "\n" + df_csv.to_csv(index=False)
                     except Exception as e:
-                        return f"CSV parse error: {e}", None, None, None
                 elif fx.endswith((".xls", ".xlsx")):
                     try:
                         df_xl = parse_excel_file_to_df(file_up)
                         combined_text += "\n" + df_xl.to_csv(index=False)
                     except Exception as e:
-                        return f"Excel parse error: {e}", None, None, None
             predictions = predict_outcome(combined_text)
             if isinstance(predictions, dict):
@@ -592,7 +605,6 @@ with gr.Blocks() as demo:
             return predictions, None, None, None
         elif action == "Generate Report":
-            # Merge CSV/Excel if needed
             if file_up:
                 fx = file_up.name.lower()
                 if fx.endswith(".csv"):
@@ -699,14 +711,11 @@ with gr.Blocks() as demo:
         return "Invalid action.", None, None, None
     async def _action_eda(file_up: Optional[gr.File], raw_text: str) -> Tuple[Optional[str], Optional[Any], Optional[Any], Optional[str]]:
-        """
-        Perform Enhanced EDA on CSV or Excel. If no file, try parsing raw_text as CSV.
-        """
         if file_up is None and not raw_text.strip():
             return "No data provided for EDA.", None, None, None
-        # If a file is present
-        if file_up is not None:
             ext = os.path.splitext(file_up.name)[1].lower()
             if ext == ".csv":
                 try:

     try:
         columns_info = f"Columns: {list(df.columns)}"
         shape_info = f"Shape: {df.shape[0]} rows x {df.shape[1]} columns"
         with pd.option_context("display.max_colwidth", 200, "display.max_rows", None):
             describe_info = df.describe(include="all").to_string()
         return f"Enhanced EDA failed: {e}", None, None
 ##########################################################
+#         PARSING FILES WITH MULTI-ENCODING CSV          #
 ##########################################################
 def parse_csv_file_to_df(file_up: gr.File) -> pd.DataFrame:
     """
     Parse an uploaded CSV into a DataFrame, trying several encodings in turn.

     Data source:
       1) If ``file_up.name`` is an existing path on disk, read from that path.
       2) Otherwise, read the raw bytes from ``file_up.file`` in memory.

     Every candidate encoding is applied strictly (no replacement characters),
     so a wrong guess fails and the next candidate is actually tried.

     Raises:
         ValueError: if the file is not on disk and ``file_up`` has no
             ``.file`` attribute, or if no candidate encoding yields a
             parseable CSV. In the latter case the last underlying error is
             attached as ``__cause__``.
     """
     from io import StringIO

     # Order matters: "utf-8-sig" also decodes plain UTF-8 and drops a leading
     # BOM; "latin1" accepts every byte sequence, so it must come last.
     # "ISO-8859-1" is an alias of "latin1" and would only repeat that attempt.
     encodings = ("utf-8-sig", "latin1")
     last_error = None

     path = file_up.name
     # 1) If the file exists on disk, let pandas read it from that path.
     if os.path.isfile(path):
         for enc in encodings:
             try:
                 return pd.read_csv(path, encoding=enc)
             except UnicodeDecodeError as e:
                 last_error = e
                 logger.warning(f"CSV parse failed with encoding={enc}. Trying next...")
             except Exception as e:
                 last_error = e
                 logger.warning(f"Unexpected CSV read error with encoding={enc}: {e}")
         raise ValueError("Could not parse CSV with any tried encodings (disk).") from last_error

     # 2) Fallback: read from the in-memory handle.
     if not hasattr(file_up, "file"):
         raise ValueError("Gradio file object has no .file attribute. Cannot parse CSV.")
     raw_bytes = file_up.file.read()
     for enc in encodings:
         try:
             # Decode strictly: with errors="replace" the first encoding would
             # always "succeed" and silently corrupt non-UTF-8 text.
             text_decoded = raw_bytes.decode(enc)
             return pd.read_csv(StringIO(text_decoded))
         except UnicodeDecodeError as e:
             last_error = e
             logger.warning(f"In-memory CSV parse failed with encoding={enc}. Trying next...")
         except Exception as e:
             last_error = e
             logger.warning(f"Unexpected in-memory CSV error (enc={enc}): {e}")
     raise ValueError("Could not parse CSV with any tried encodings (in-memory).") from last_error
 def parse_excel_file_to_df(file_up: gr.File) -> pd.DataFrame:
     """
     if os.path.isfile(excel_path):
         return pd.read_excel(excel_path, engine="openpyxl")
     else:
+        if not hasattr(file_up, "file"):
+            raise ValueError("Gradio file object has no .file attribute. Cannot parse Excel.")
         try:
+            excel_bytes = file_up.file.read()
+            return pd.read_excel(io.BytesIO(excel_bytes), engine="openpyxl")
         except Exception as e:
             raise ValueError(f"Excel parse error: {e}")
 def parse_pdf_file_as_str(file_up: gr.File) -> str:
     """
+    For PDFs, read pages with PyPDF2.
+    Similar two-step approach: local path or fallback to memory.
     """
     pdf_path = file_up.name
     if os.path.isfile(pdf_path):
         with open(pdf_path, "rb") as f:
             pdf_reader = PyPDF2.PdfReader(f)
                 text_content.append(page.extract_text() or "")
             return "\n".join(text_content)
     else:
+        if not hasattr(file_up, "file"):
+            raise ValueError("Gradio file object has no .file attribute. Cannot parse PDF.")
         try:
             pdf_bytes = file_up.file.read()
             reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
         except Exception as e:
             raise ValueError(f"PDF parse error: {e}")
+def parse_text_file_as_str(file_up: gr.File) -> str:
     """
+    For .txt, do the same path or fallback approach,
+    possibly with multiple encodings if needed.
     """
     path = file_up.name
     if os.path.isfile(path):
         with open(path, "rb") as f:
+            return f.read().decode("utf-8", errors="replace")
     else:
+        if not hasattr(file_up, "file"):
+            raise ValueError("Gradio file object has no .file attribute. Cannot parse txt.")
+        raw_bytes = file_up.file.read()
+        return raw_bytes.decode("utf-8", errors="replace")
 ##########################################################
 #                   GRADIO APP SETUP                     #
 - **Named Entity Recognition** (spaCy)
 - **Fetch** from PubMed, Crossref, Europe PMC
 - **Generate** PDF reports
+- **Enhanced EDA** on CSV/Excel (with fallback encodings)
 """)
     with gr.Row():
         exp_fmt: str
     ) -> Tuple[Optional[str], Optional[Any], Optional[Any], Optional[str]]:
         combined_text = txt.strip()
+        # If a file is uploaded, parse based on extension
         if file_up is not None:
             file_ext = os.path.splitext(file_up.name)[1].lower()
+            try:
+                if file_ext == ".txt":
+                    txt_content = parse_text_file_as_str(file_up)
+                    combined_text += "\n" + txt_content
+                elif file_ext == ".pdf":
                     pdf_text = parse_pdf_file_as_str(file_up)
                     combined_text += "\n" + pdf_text
+                # For CSV/Excel, we usually parse them *inside* certain actions (EDA, Summarize, etc.)
+                # Because sometimes you want the raw DataFrame, not the text.
+            except Exception as e:
+                return f"File parse error: {e}", None, None, None
+        # Now handle the action
         if action == "Summarize":
+            # If CSV or Excel is uploaded, parse into DF and then convert to text
             if file_up:
                 fx = file_up.name.lower()
                 if fx.endswith(".csv"):
                     try:
                         df_csv = parse_csv_file_to_df(file_up)
+                        combined_text += "\n" + df_csv.to_csv(index=False)
                     except Exception as e:
                         return f"CSV parse error for Summarize: {e}", None, None, None
                 elif fx.endswith((".xls", ".xlsx")):
                     try:
                         df_xl = parse_excel_file_to_df(file_up)
+                        combined_text += "\n" + df_xl.to_csv(index=False)
                     except Exception as e:
                         return f"Excel parse error for Summarize: {e}", None, None, None
             return summary, None, None, None
         elif action == "Predict Outcome":
             if file_up:
                 fx = file_up.name.lower()
                 if fx.endswith(".csv"):
                         df_csv = parse_csv_file_to_df(file_up)
                         combined_text += "\n" + df_csv.to_csv(index=False)
                     except Exception as e:
+                        return f"CSV parse error for Predict: {e}", None, None, None
                 elif fx.endswith((".xls", ".xlsx")):
                     try:
                         df_xl = parse_excel_file_to_df(file_up)
                         combined_text += "\n" + df_xl.to_csv(index=False)
                     except Exception as e:
+                        return f"Excel parse error for Predict: {e}", None, None, None
             predictions = predict_outcome(combined_text)
             if isinstance(predictions, dict):
             return predictions, None, None, None
         elif action == "Generate Report":
             if file_up:
                 fx = file_up.name.lower()
                 if fx.endswith(".csv"):
         return "Invalid action.", None, None, None
     async def _action_eda(file_up: Optional[gr.File], raw_text: str) -> Tuple[Optional[str], Optional[Any], Optional[Any], Optional[str]]:
+        """Perform Enhanced EDA on CSV or Excel. If no file, try parsing raw_text as CSV."""
         if file_up is None and not raw_text.strip():
             return "No data provided for EDA.", None, None, None
+        if file_up:
             ext = os.path.splitext(file_up.name)[1].lower()
             if ext == ".csv":
                 try: