Spaces:

mgbam
/

BizIntel_AI

Sleeping

File size: 4,373 Bytes

b64c41b
 
7453b19
 
 
 
 
 
 
 
b64c41b
 
 
e1d8bc9
b64c41b
 
7453b19
b64c41b
 
 
7453b19
 
 
 
 
 
 
b64c41b
 
7453b19
 
 
 
 
 
 
 
 
b64c41b
7453b19
 
 
 
 
b64c41b
7453b19
e1d8bc9
7453b19
 
 
 
 
e1d8bc9
7453b19
e1d8bc9
b64c41b
 
7453b19
 
 
b64c41b
 
e1d8bc9
7453b19
b64c41b
 
e1d8bc9
7453b19
 
e1d8bc9
7453b19
b64c41b
7453b19
b64c41b
e1d8bc9
7453b19
 
b64c41b
7453b19
 
 
 
 
 
 
 
e1d8bc9
7453b19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e1d8bc9
7453b19
b64c41b
 
e1d8bc9
b64c41b
 
7453b19
 
b64c41b
e1d8bc9
7453b19
e1d8bc9
 
b64c41b
e1d8bc9
 
7453b19
e1d8bc9

# tools/csv_parser.py
# ------------------------------------------------------------
# Reads a CSV / Excel file (sampling ultra‑large CSVs), then
# returns a Markdown report:
#   ▸ dimensions         ▸ schema & dtypes
#   ▸ missing‑value map  ▸ numeric describe()
#   ▸ memory footprint
# If the optional dependency **tabulate** is unavailable,
# it falls back to a plain‑text table wrapped in Markdown
# code fences, so no ImportError ever reaches the UI.

from __future__ import annotations

import os
from typing import Union

import numpy as np
import pandas as pd


# ╭──────────────────────────────────────────────────────────╮
# │  Helper: efficient reader with sampling for huge CSVs    │
# ╰──────────────────────────────────────────────────────────╯
def _safe_read(path: Union[str, bytes], sample_rows: int = 1_000_000) -> pd.DataFrame:
    """Load CSV / Excel.  If CSV has > sample_rows, read a uniform sample."""
    is_str = isinstance(path, str)
    ext = os.path.splitext(path)[1].lower() if is_str else ".csv"

    if ext in (".xls", ".xlsx"):
        return pd.read_excel(path, engine="openpyxl")

    # --- CSV branch --------------------------------------------------------
    if is_str:
        # fast line count (memory‑map); falls back to full read for non‑files
        with open(path, "rb") as fh:
            n_total = sum(1 for _ in fh)
    else:
        n_total = None

    if n_total and n_total > sample_rows:
        # sample without reading entire file
        rng = np.random.default_rng(seed=42)
        skip = sorted(rng.choice(range(1, n_total), n_total - sample_rows, replace=False))
        return pd.read_csv(path, skiprows=skip)

    return pd.read_csv(path)


# ╭──────────────────────────────────────────────────────────╮
# │               Main public helper                         │
# ╰──────────────────────────────────────────────────────────╯
def parse_csv_tool(path: Union[str, bytes]) -> str:
    """
    Build a Markdown report describing the dataset at *path*.

    The file is loaded through ``_safe_read``.  Any exception raised while
    loading is turned into a one-line error string instead of propagating.

    The report contains, in order:
      1. an overview table: row count, column count, deep memory usage (MB)
      2. one bullet per column with its dtype
      3. one bullet per column that has missing values (count and percent),
         or the word "None" when no column has any
      4. the transposed ``describe()`` of the numeric columns, rounded to two
         decimals; rendered via ``to_markdown()`` when that works, otherwise
         as plain text inside a fenced code block
    """
    try:
        frame = _safe_read(path)
    except Exception as exc:
        return f"❌ Failed to load data: {exc}"

    n_rows, n_cols = frame.shape
    mem_mb = frame.memory_usage(deep=True).sum() / 1024**2

    # Schema: one bullet per column.
    schema_lines = []
    for name, kind in frame.dtypes.items():
        schema_lines.append(f"- **{name}** – `{kind}`")
    schema_md = "\n".join(schema_lines)

    # Missing values: only columns that actually have gaps are listed.
    na_counts = frame.isna().sum()
    na_share = (na_counts / len(frame) * 100).round(1)
    gap_lines = [
        f"- **{name}**: {na_counts[name]} ({na_share[name]} %)"
        for name in frame.columns
        if na_counts[name] > 0
    ]
    missing_md = "\n".join(gap_lines) if gap_lines else "None"

    # Descriptive statistics for numeric columns.
    numbers = frame.select_dtypes("number")
    if not numbers.empty:
        stats = numbers.describe().T.round(2)
        try:
            # to_markdown() needs the optional 'tabulate' package
            desc_md = stats.to_markdown()
        except ImportError:
            # no tabulate: fall back to a fenced plain-text table
            desc_md = "```text\n" + stats.to_string() + "\n```"
    else:
        desc_md = "_No numeric columns_"

    # Assemble the report line by line.
    report_lines = [
        "# 📊 Dataset Overview",
        "",
        "| metric | value |",
        "| ------ | ----- |",
        f"| Rows   | {n_rows:,} |",
        f"| Columns| {n_cols} |",
        f"| Memory | {mem_mb:.2f} MB |",
        "",
        "## 🗂 Schema & Dtypes",
        schema_md,
        "",
        "## 🛠 Missing Values",
        missing_md,
        "",
        "## 📈 Descriptive Statistics (numeric)",
        desc_md,
    ]
    return "\n".join(report_lines).strip()