Spaces:
Sleeping
Sleeping
File size: 1,743 Bytes
010071f e1d8bc9 010071f e1d8bc9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 |
import io
import os
from typing import Union

import pandas as pd
def _detect_extension(file: object) -> str:
    """Return the lower-cased extension to use for *file* (".csv" if unknown).

    Raw ``bytes``/``bytearray`` input carries no file name, so the format is
    sniffed from the leading signature bytes instead.
    """
    if isinstance(file, (bytes, bytearray)):
        head = bytes(file[:4])
        if head == b"PK\x03\x04":  # ZIP container, the packaging used by .xlsx
            return ".xlsx"
        if head == b"\xd0\xcf\x11\xe0":  # OLE2 compound file, used by legacy .xls
            return ".xls"
        return ".csv"
    try:
        # File-like objects (e.g. uploads) expose the path via ``.name``;
        # plain paths are used as-is.
        name = getattr(file, "name", file)
        ext = os.path.splitext(name)[1]
    except Exception:
        # Detection is advisory only: anything without a usable name
        # (e.g. an in-memory buffer) is treated as CSV.
        return ".csv"
    if isinstance(ext, bytes):
        # A bytes path yields a bytes extension, which would never match the
        # str extensions compared against later.
        ext = os.fsdecode(ext)
    return ext.lower()


def _describe_markdown(df: pd.DataFrame) -> str:
    """Render ``df.describe()`` (transposed, rounded to 2 decimals) as text."""
    if df.shape[1] == 0:
        # describe() raises ValueError on a frame without columns.
        return "None"
    desc = df.describe().T.round(2)
    try:
        return desc.to_markdown()
    except ImportError:
        # to_markdown() needs the optional ``tabulate`` package; fall back to
        # a fixed-width table inside a code fence so the report still renders.
        return f"```\n{desc.to_string()}\n```"


def parse_csv_tool(file: Union[str, bytes]) -> str:
    """Build a Markdown schema/statistics report for a CSV or Excel file.

    Args:
        file: A filesystem path, a file-like object (its ``name`` attribute,
            when present, is used to detect the format), or the raw file
            content as ``bytes``.

    Returns:
        A Markdown report with row/column counts, memory usage, per-column
        dtypes, missing-value counts and descriptive statistics. If the data
        cannot be loaded, a one-line failure message is returned instead of
        raising.
    """
    ext = _detect_extension(file)

    # pandas readers do not accept raw bytes as CSV content; wrap it in a buffer.
    source = io.BytesIO(file) if isinstance(file, (bytes, bytearray)) else file

    # NOTE(review): the 'β' / 'π' glyphs in the strings below look like
    # mis-decoded emoji; they are kept verbatim - confirm against the original.
    try:
        if ext == ".xlsx":
            df = pd.read_excel(source, engine="openpyxl")
        elif ext == ".xls":
            # openpyxl cannot read the legacy .xls format; let pandas choose
            # the appropriate engine.
            df = pd.read_excel(source)
        else:
            df = pd.read_csv(source)
    except Exception as e:
        # Boundary of the tool: report the failure as text rather than raise.
        return f"β Failed to load data ({ext}): {e}"

    n_rows, n_cols = df.shape

    schema_md = "\n".join(
        f"- **{col}**: {dtype}" for col, dtype in df.dtypes.items()
    )

    missing = df.isna().sum()
    missing_pct = (missing / n_rows * 100).round(1)
    missing_lines = [
        f"- **{col}**: {count} ({pct}%)"
        for (col, count), pct in zip(missing.items(), missing_pct)
        if count > 0
    ]
    missing_md = "\n".join(missing_lines) or "None"

    desc_md = _describe_markdown(df)

    # deep=True also counts the payload of object (e.g. string) columns.
    mem_mb = df.memory_usage(deep=True).sum() / (1024 ** 2)

    report_lines = [
        "# π Dataset Overview",
        f"- **Rows:** {n_rows}",
        f"- **Columns:** {n_cols}",
        f"- **Memory Usage:** {mem_mb:.2f} MB",
        "## π Schema & Data Types",
        schema_md,
        "## π Missing Values",
        missing_md,
        "## π Descriptive Statistics",
        desc_md,
    ]
    return "\n".join(report_lines).strip()