Spaces:

mgbam
/

BizIntel_AI

Sleeping

App Files Files Community

mgbam committed about 1 month ago

Commit

e1d8bc9

verified ·

1 Parent(s): 8a0173b

Update tools/csv_parser.py

Browse files

Files changed (1) hide show

tools/csv_parser.py +65 -4

tools/csv_parser.py CHANGED Viewed

@@ -1,6 +1,67 @@
 import pandas as pd
-def parse_csv_tool(file_path: str) -> str:
-    df = pd.read_csv(file_path)
-    return f"Schema: {list(df.columns)}\n\nStats:\n{df.describe().to_string()}"

 import pandas as pd
+from typing import Union
+import os
def parse_csv_tool(file: Union[str, bytes]) -> str:
    """Load a CSV or Excel source and return a Markdown profile of it.

    The report lists the row/column counts, in-memory size, per-column
    dtypes, columns containing missing values, and the output of
    ``DataFrame.describe()``. The whole dataset is loaded into memory;
    no sampling is performed.

    Args:
        file: A filesystem path, the raw file contents as bytes, or a
            file-like object. The format is chosen from the path (or the
            object's ``name`` attribute) extension; raw bytes are sniffed
            for the Excel container signatures. Anything that is not
            recognised as ``.xls``/``.xlsx`` is parsed as CSV.

    Returns:
        The Markdown report, or a single line starting with
        ``"❌ Failed to load data"`` when the source cannot be read.
        Loading errors are reported through the return value rather than
        raised.
    """
    # Local import keeps this function self-contained without touching the
    # module's import block.
    import io

    if isinstance(file, (bytes, bytearray)):
        # Raw bytes carry no filename, so detect Excel containers by their
        # leading signature: .xlsx is a ZIP archive, .xls is an OLE2 file.
        payload = bytes(file)
        if payload.startswith(b"PK\x03\x04"):
            ext = ".xlsx"
        elif payload.startswith(b"\xd0\xcf\x11\xe0"):
            ext = ".xls"
        else:
            ext = ".csv"
        # pandas readers need a buffer; bare bytes are not parsed as content.
        source = io.BytesIO(payload)
    else:
        source = file
        if isinstance(file, (str, os.PathLike)):
            name = file
        else:
            name = getattr(file, "name", None)
        # File objects may have no name, or a non-path one (e.g. an integer
        # file descriptor); treat those as CSV.
        if isinstance(name, (str, os.PathLike)):
            ext = os.path.splitext(os.fspath(name))[1].lower()
        else:
            ext = ".csv"

    # This is the tool boundary: any loading failure is converted into a
    # readable message instead of propagating to the caller.
    try:
        if ext in (".xls", ".xlsx"):
            # openpyxl cannot read legacy .xls workbooks, so only force it
            # for .xlsx and let pandas choose the engine otherwise.
            engine = "openpyxl" if ext == ".xlsx" else None
            df = pd.read_excel(source, engine=engine)
        else:
            df = pd.read_csv(source)
    except Exception as e:
        return f"❌ Failed to load data ({ext}): {e}"

    n_rows, n_cols = df.shape

    # Schema & dtypes
    schema_md = "\n".join(
        f"- **{col}**: {dtype}" for col, dtype in df.dtypes.items()
    )

    # Missing values: only columns that actually contain gaps are listed.
    missing = df.isna().sum()
    missing_pct = (missing / n_rows * 100).round(1)
    missing_lines = [
        f"- **{col}**: {count} ({pct}%)"
        for (col, count), pct in zip(missing.items(), missing_pct)
        if count > 0
    ]
    missing_md = "\n".join(missing_lines) or "None"

    # Descriptive statistics. describe() raises on a frame without columns,
    # so that case is reported as "None".
    if n_cols:
        desc = df.describe().T.round(2)
        try:
            desc_md = desc.to_markdown()
        except ImportError:
            # to_markdown() needs the optional `tabulate` package; fall back
            # to a fenced plain-text table so the report is still produced.
            desc_md = f"```\n{desc.to_string()}\n```"
    else:
        desc_md = "None"

    # Memory usage (deep=True also counts the contents of object columns).
    mem_mb = df.memory_usage(deep=True).sum() / (1024 ** 2)

    # Assemble report
    report_lines = [
        "# 📊 Dataset Overview",
        f"- **Rows:** {n_rows}",
        f"- **Columns:** {n_cols}",
        f"- **Memory Usage:** {mem_mb:.2f} MB",
        "## 🗂 Schema & Data Types",
        schema_md,
        "## 🛠 Missing Values",
        missing_md,
        "## 📈 Descriptive Statistics",
        desc_md,
    ]
    return "\n".join(report_lines).strip()