File size: 1,743 Bytes
010071f
e1d8bc9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
010071f
e1d8bc9
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import pandas as pd
from typing import Union
import os

def parse_csv_tool(file: Union[str, bytes]) -> str:
    """
    Parse a CSV or Excel file and return a schema and statistics report in Markdown.

    The whole file is loaded into memory with pandas; no sampling is performed.

    Args:
        file: One of
            - a filesystem path (``str``); the extension selects the parser,
            - raw file content (``bytes``/``bytearray``); the format is detected
              from the leading signature bytes (ZIP -> .xlsx, OLE2 -> .xls,
              anything else is treated as CSV),
            - a file-like object; its ``name`` attribute, when present, is used
              to determine the extension, otherwise CSV is assumed.

    Returns:
        A Markdown report with row/column counts, memory usage, per-column
        dtypes, missing-value counts and ``DataFrame.describe()`` statistics.
        If the data cannot be loaded, a single-line message starting with
        "❌ Failed to load data" is returned instead of raising.
    """
    import io  # local import: only needed to wrap raw bytes in a buffer

    # Determine extension and build a source object pandas can read
    if isinstance(file, (bytes, bytearray)):
        raw = bytes(file)
        # Raw content carries no filename, so sniff the container signature.
        if raw.startswith(b"PK\x03\x04"):
            ext = ".xlsx"  # .xlsx is a ZIP archive
        elif raw.startswith(b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"):
            ext = ".xls"   # legacy .xls is an OLE2 compound document
        else:
            ext = ".csv"
        source = io.BytesIO(raw)
    else:
        source = file
        try:
            filename = getattr(file, 'name', file)
            if isinstance(filename, bytes):
                # Files opened with a bytes path expose a bytes ``name``.
                filename = os.fsdecode(filename)
            ext = os.path.splitext(filename)[1].lower()
        except Exception:
            # Unnamed buffers and other non-path objects: assume CSV.
            ext = ".csv"

    # Load DataFrame
    try:
        if ext == '.xlsx':
            df = pd.read_excel(source, engine='openpyxl')
        elif ext == '.xls':
            # openpyxl cannot read legacy .xls; let pandas choose the engine.
            df = pd.read_excel(source)
        else:
            df = pd.read_csv(source)
    except Exception as e:
        return f"❌ Failed to load data ({ext}): {e}"

    # Basic dimensions
    n_rows, n_cols = df.shape

    # Schema & dtypes
    schema_lines = [f"- **{col}**: {dtype}" for col, dtype in df.dtypes.items()]
    schema_md = "\n".join(schema_lines)

    # Missing values (iterated positionally so duplicate column labels are safe)
    missing = df.isna().sum()
    missing_pct = (missing / n_rows * 100).round(1)
    missing_lines = [
        f"- **{col}**: {count} ({pct}%)"
        for col, count, pct in zip(df.columns, missing.to_numpy(), missing_pct.to_numpy())
        if count > 0
    ]
    missing_md = "\n".join(missing_lines) or "None"

    # Descriptive stats (numeric columns when present, otherwise pandas'
    # default summary of the remaining columns)
    if n_cols == 0:
        # describe() raises ValueError on a frame without columns.
        desc_md = "None"
    else:
        desc = df.describe().T.round(2)
        try:
            desc_md = desc.to_markdown()
        except ImportError:
            # to_markdown() needs the optional `tabulate` package; fall back
            # to a fenced plain-text table so the report is still produced.
            desc_md = "```\n" + desc.to_string() + "\n```"

    # Memory usage
    mem_mb = df.memory_usage(deep=True).sum() / (1024 ** 2)

    # Assemble report
    report = f"""
# 📊 Dataset Overview

- **Rows:** {n_rows}
- **Columns:** {n_cols}
- **Memory Usage:** {mem_mb:.2f} MB

## 🗂 Schema & Data Types
{schema_md}

## 🛠 Missing Values
{missing_md}

## 📈 Descriptive Statistics
{desc_md}
""".strip()

    return report