Spaces:
Sleeping
Sleeping
import io
import os
from typing import Union

import pandas as pd
def _sniff_extension(payload: bytes) -> str:
    """Guess a file extension for in-memory *payload* from its leading magic bytes."""
    if payload.startswith(b"PK\x03\x04"):
        # .xlsx workbooks are ZIP containers.
        return ".xlsx"
    if payload.startswith(b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"):
        # Legacy .xls workbooks are OLE2 compound documents.
        return ".xls"
    return ".csv"


def _resolve_source(file):
    """Return ``(source, ext)``: an object pandas can read and a lowercase extension."""
    if isinstance(file, (bytes, bytearray)):
        # Raw bytes are treated as file *content*; pandas cannot read them
        # directly, so expose them through an in-memory binary stream.
        payload = bytes(file)
        return io.BytesIO(payload), _sniff_extension(payload)
    try:
        # File-like objects (e.g. upload wrappers) expose the path as `.name`;
        # plain strings are used as the path itself.
        filename = getattr(file, 'name', file)
        ext = os.path.splitext(filename)[1].lower()
    except (TypeError, AttributeError):
        # No usable name (e.g. an anonymous buffer): assume CSV.
        return file, ".csv"
    if isinstance(ext, bytes):
        ext = os.fsdecode(ext)
    return file, ext


def _frame_to_markdown(frame) -> str:
    """Render *frame* as a Markdown table, or as a fenced text table without tabulate."""
    try:
        return frame.to_markdown()
    except ImportError:
        # `to_markdown` needs the optional `tabulate` package.
        return f"```\n{frame.to_string()}\n```"


def parse_csv_tool(file: Union[str, bytes]) -> str:
    """
    Parse a CSV or Excel file and return a schema and statistics report in Markdown.

    Args:
        file: A path string, a file-like object (its ``name`` attribute, if any,
            is used to pick the parser), or the raw bytes of the file content.
            For raw bytes the format is detected from the leading magic bytes.

    Returns:
        A Markdown report with row/column counts, memory usage, per-column
        dtypes, per-column missing-value counts and ``DataFrame.describe()``
        statistics. If the data cannot be loaded, a single-line failure
        message containing the detected extension and the error text is
        returned instead; loading errors are never raised to the caller.
    """
    source, ext = _resolve_source(file)

    # Load DataFrame
    try:
        if ext == '.xlsx':
            df = pd.read_excel(source, engine='openpyxl')
        elif ext == '.xls':
            # openpyxl cannot read legacy workbooks; let pandas pick the engine.
            df = pd.read_excel(source)
        else:
            df = pd.read_csv(source)
    except Exception as e:
        # Tool boundary: any load failure is reported to the user as text.
        return f"β Failed to load data ({ext}): {e}"

    # Basic dimensions
    n_rows, n_cols = df.shape

    # Schema & dtypes
    schema_md = "\n".join(
        f"- **{col}**: {dtype}" for col, dtype in df.dtypes.items()
    )

    # Missing values: keep only columns that actually have gaps, which also
    # guarantees n_rows > 0 whenever a percentage is computed.
    missing = df.isna().sum()
    missing = missing[missing > 0]
    missing_pct = (missing / n_rows * 100).round(1)
    missing_md = "\n".join(
        f"- **{col}**: {count} ({pct}%)"
        for (col, count), pct in zip(missing.items(), missing_pct)
    ) or "None"

    # Descriptive stats; describe() raises on a frame without columns.
    if n_cols == 0:
        desc_md = "None"
    else:
        desc_md = _frame_to_markdown(df.describe().T.round(2))

    # Memory usage
    mem_mb = df.memory_usage(deep=True).sum() / (1024 ** 2)

    # Assemble report
    report_lines = [
        "# π Dataset Overview",
        f"- **Rows:** {n_rows}",
        f"- **Columns:** {n_cols}",
        f"- **Memory Usage:** {mem_mb:.2f} MB",
        "## π Schema & Data Types",
        schema_md,
        "## π Missing Values",
        missing_md,
        "## π Descriptive Statistics",
        desc_md,
    ]
    return "\n".join(report_lines).strip()