import io
import os
from typing import Union

import pandas as pd

# Leading bytes that identify spreadsheet containers when raw content is passed.
_XLSX_MAGIC = b"PK\x03\x04"  # ZIP container used by .xlsx / .xlsm
_XLS_MAGIC = b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"  # OLE2 container used by .xls


def _guess_extension(file) -> str:
    """Return the lower-cased file extension for *file*, defaulting to ".csv".

    Raw bytes are classified by their magic number; anything else is
    classified by its path (or by its ``name`` attribute for file objects).
    """
    if isinstance(file, (bytes, bytearray)):
        head = bytes(file[:8])
        if head.startswith(_XLSX_MAGIC):
            return ".xlsx"
        if head.startswith(_XLS_MAGIC):
            return ".xls"
        return ".csv"
    try:
        name = getattr(file, "name", file)
        return os.path.splitext(os.fsdecode(name))[1].lower()
    except (TypeError, AttributeError, ValueError):
        # Nameless buffers (or non-path names such as file descriptors)
        # are treated as CSV, matching the historical default.
        return ".csv"


def parse_csv_tool(file: Union[str, bytes]) -> str:
    """Parse a CSV or Excel file and return a Markdown schema/statistics report.

    Args:
        file: A filesystem path, the raw file content as bytes, or a file-like
            object (its ``name`` attribute, when present, selects the format).

    Returns:
        A Markdown report with the dataset dimensions, memory usage, column
        dtypes, missing-value counts and descriptive statistics. If the data
        cannot be loaded, a single-line message starting with "❌" is returned
        instead; loading errors are never raised to the caller.
    """
    ext = _guess_extension(file)

    # pandas cannot read a bare bytes object, so expose it as a binary buffer.
    source = io.BytesIO(file) if isinstance(file, (bytes, bytearray)) else file

    # Load DataFrame
    try:
        if ext in (".xls", ".xlsx", ".xlsm"):
            # openpyxl cannot read legacy .xls workbooks; let pandas pick the
            # engine for those and only force openpyxl for the XML formats.
            engine = None if ext == ".xls" else "openpyxl"
            df = pd.read_excel(source, engine=engine)
        else:
            df = pd.read_csv(source)
    except Exception as e:  # boundary: report any loader failure as text
        return f"❌ Failed to load data ({ext}): {e}"

    # Basic dimensions
    n_rows, n_cols = df.shape

    # Schema & dtypes
    schema_md = "\n".join(
        f"- **{col}**: {dtype}" for col, dtype in df.dtypes.items()
    )

    # Missing values. Iterate positionally so duplicated column names (which
    # make label lookups return a Series) do not break the comparison.
    missing = df.isna().sum()
    missing_pct = (missing / max(n_rows, 1) * 100).round(1)
    missing_lines = [
        f"- **{col}**: {count} ({pct}%)"
        for (col, count), pct in zip(missing.items(), missing_pct)
        if count > 0
    ]
    missing_md = "\n".join(missing_lines) or "None"

    # Descriptive stats (numeric columns when present, per DataFrame.describe)
    try:
        desc = df.describe().T.round(2)
    except ValueError:
        # describe() refuses frames without columns (e.g. an empty sheet).
        desc_md = "None"
    else:
        try:
            desc_md = desc.to_markdown()
        except ImportError:
            # to_markdown() needs the optional 'tabulate' package; fall back
            # to a fenced plain-text table so the report is still produced.
            desc_md = f"```\n{desc.to_string()}\n```"

    # Memory usage (deep=True also counts the contents of object columns)
    mem_mb = df.memory_usage(deep=True).sum() / (1024 ** 2)

    # Assemble report
    report = "\n".join(
        [
            "# 📊 Dataset Overview",
            f"- **Rows:** {n_rows}",
            f"- **Columns:** {n_cols}",
            f"- **Memory Usage:** {mem_mb:.2f} MB",
            "",
            "## 🗂 Schema & Data Types",
            schema_md,
            "",
            "## 🛠 Missing Values",
            missing_md,
            "",
            "## 📈 Descriptive Statistics",
            desc_md,
        ]
    ).strip()
    return report