File size: 4,373 Bytes
b64c41b
 
7453b19
 
 
 
 
 
 
 
b64c41b
 
 
e1d8bc9
b64c41b
 
7453b19
b64c41b
 
 
7453b19
 
 
 
 
 
 
b64c41b
 
7453b19
 
 
 
 
 
 
 
 
b64c41b
7453b19
 
 
 
 
b64c41b
7453b19
e1d8bc9
7453b19
 
 
 
 
e1d8bc9
7453b19
e1d8bc9
b64c41b
 
7453b19
 
 
b64c41b
 
e1d8bc9
7453b19
b64c41b
 
e1d8bc9
7453b19
 
e1d8bc9
7453b19
b64c41b
7453b19
b64c41b
e1d8bc9
7453b19
 
b64c41b
7453b19
 
 
 
 
 
 
 
e1d8bc9
7453b19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e1d8bc9
7453b19
b64c41b
 
e1d8bc9
b64c41b
 
7453b19
 
b64c41b
e1d8bc9
7453b19
e1d8bc9
 
b64c41b
e1d8bc9
 
7453b19
e1d8bc9
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# tools/csv_parser.py
# ------------------------------------------------------------
# Reads a CSV / Excel file (sampling ultra-large CSVs), then
# returns a Markdown report:
#   ▸ dimensions         ▸ schema & dtypes
#   ▸ missing-value map  ▸ numeric describe()
#   ▸ memory footprint
# If the optional dependency **tabulate** is unavailable,
# it falls back to a plain-text table wrapped in Markdown
# code fences, so no ImportError ever reaches the UI.

from __future__ import annotations

import os
from typing import Union

import numpy as np
import pandas as pd


# ╭──────────────────────────────────────────────────────────╮
# │  Helper: efficient reader with sampling for huge CSVs    │
# ╰──────────────────────────────────────────────────────────╯
def _safe_read(path: Union[str, bytes], sample_rows: int = 1_000_000) -> pd.DataFrame:
    """Load CSV / Excel.  If CSV has >โ€ฏsample_rows, read a uniform sample."""
    is_str = isinstance(path, str)
    ext = os.path.splitext(path)[1].lower() if is_str else ".csv"

    if ext in (".xls", ".xlsx"):
        return pd.read_excel(path, engine="openpyxl")

    # --- CSV branch --------------------------------------------------------
    if is_str:
        # fast line count (memoryโ€‘map); falls back to full read for nonโ€‘files
        with open(path, "rb") as fh:
            n_total = sum(1 for _ in fh)
    else:
        n_total = None

    if n_total and n_total > sample_rows:
        # sample without reading entire file
        rng = np.random.default_rng(seed=42)
        skip = sorted(rng.choice(range(1, n_total), n_total - sample_rows, replace=False))
        return pd.read_csv(path, skiprows=skip)

    return pd.read_csv(path)


# ╭──────────────────────────────────────────────────────────╮
# │               Main public helper                         │
# ╰──────────────────────────────────────────────────────────╯
def parse_csv_tool(path: Union[str, bytes]) -> str:
    """Return a Markdown report that Streamlit can render.

    Sections:
    - Dimensions and memory usage
    - Schema & dtypes
    - Missing-value counts (+%)
    - Numeric describe()

    Args:
        path: Passed unchanged to ``_safe_read``.

    Returns:
        The Markdown report, or a single "Failed to load data" line carrying
        the exception text when the file cannot be loaded.
    """
    # UI boundary: any loader failure becomes a message instead of a traceback.
    try:
        df = _safe_read(path)
    except Exception as exc:
        return f"❌ Failed to load data: {exc}"

    rows, cols = df.shape
    mem_mb = df.memory_usage(deep=True).sum() / 1024**2

    # -- Schema -------------------------------------------------------------
    schema_md = "\n".join(
        f"- **{col}** – `{dtype}`" for col, dtype in df.dtypes.items()
    )

    # -- Missing map --------------------------------------------------------
    miss_ct = df.isna().sum()
    miss_pct = (miss_ct / len(df) * 100).round(1)
    # Iterate positionally rather than by label so duplicate column labels
    # cannot turn a lookup into a Series.  \u00a0 / \u202f are the
    # non-breaking spaces that keep "count (pct %)" on one line.
    missing_md = (
        "\n".join(
            f"- **{col}**: {count}\u00a0({pct}\u202f%)"
            for col, count, pct in zip(df.columns, miss_ct, miss_pct)
            if count > 0
        )
        or "None"
    )

    # -- Numeric describe() -------------------------------------------------
    numeric_df = df.select_dtypes("number")
    if numeric_df.empty:
        desc_md = "_No numeric columns_"
    else:
        # Compute the statistics once; both renderings share them.
        stats = numeric_df.describe().T.round(2)
        try:
            # requires the optional 'tabulate' package
            desc_md = stats.to_markdown()
        except ImportError:
            # graceful fallback without the extra dependency
            desc_md = "```text\n" + stats.to_string() + "\n```"

    # -- Assemble markdown --------------------------------------------------
    return f"""
# 📊\u00a0Dataset Overview

| metric | value |
| ------ | ----- |
| Rows   | {rows:,} |
| Columns| {cols} |
| Memory | {mem_mb:.2f}\u00a0MB |

## 🗂\u00a0Schema & Dtypes
{schema_md}

## 🛠\u00a0Missing Values
{missing_md}

## 📈\u00a0Descriptive Statistics\u00a0(numeric)
{desc_md}
""".strip()