# tools/csv_parser.py
# ------------------------------------------------------------
# Reads a CSV / Excel file (sampling ultra-large CSVs), then
# returns a Markdown report:
#   ▸ dimensions           ▸ schema & dtypes
#   ▸ missing-value map    ▸ numeric describe()
#   ▸ memory footprint
# If the optional dependency **tabulate** is unavailable,
# it falls back to a plain-text table wrapped in Markdown
# code fences, so no ImportError ever reaches the UI.
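#
# Typical wiring from a Streamlit page (a minimal sketch, not part of this
# module; the import path and uploader label below are assumptions):
#
#   import streamlit as st
#   from tools.csv_parser import parse_csv_tool
#
#   uploaded = st.file_uploader("Upload a CSV file", type="csv")
#   if uploaded is not None:
#       st.markdown(parse_csv_tool(uploaded))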
from __future__ import annotations
import os
from typing import Union
import numpy as np
import pandas as pd
# ╭──────────────────────────────────────────────────────╮
# │ Helper: efficient reader with sampling for huge CSVs │
# ╰──────────────────────────────────────────────────────╯
def _safe_read(path: Union[str, bytes], sample_rows: int = 1_000_000) -> pd.DataFrame:
    """Load CSV / Excel. If the CSV has > sample_rows lines, read a uniform sample."""
    is_str = isinstance(path, str)
    ext = os.path.splitext(path)[1].lower() if is_str else ".csv"

    if ext in (".xls", ".xlsx"):
        return pd.read_excel(path, engine="openpyxl")

    # --- CSV branch --------------------------------------------------------
    if is_str:
        # fast line count by streaming the raw bytes; file-like inputs
        # (non-str) skip the count and are read in full below
        with open(path, "rb") as fh:
            n_total = sum(1 for _ in fh)
    else:
        n_total = None

    if n_total and n_total > sample_rows:
        # sample rows to *skip* so the whole file never has to fit in memory;
        # row 0 (the header) is never skipped
        rng = np.random.default_rng(seed=42)
        skip = sorted(rng.choice(range(1, n_total), n_total - sample_rows, replace=False))
        return pd.read_csv(path, skiprows=skip)

    return pd.read_csv(path)
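# Worked example of the sampling branch (hypothetical figures):
#
#   >>> n_total, sample_rows = 5_000_001, 1_000_000   # header + 5 M data lines
#   >>> len(range(1, n_total))                        # candidate data-row indices
#   5000000
#   >>> n_total - sample_rows                         # indices placed in `skip`
#   4000001
#
# pandas streams past the skipped rows, so only ~1 M rows are materialised and
# the header (row 0) is always kept.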
# ╭──────────────────────────────────────────────────────╮
# │ Main public helper                                   │
# ╰──────────────────────────────────────────────────────╯
def parse_csv_tool(path: Union[str, bytes]) -> str:
    """
    Return a Markdown report that Streamlit can render.

    Sections:
      • Dimensions
      • Schema & dtypes
      • Missing-value counts (+ %)
      • Numeric describe()
      • Memory usage
    """
    try:
        df = _safe_read(path)
    except Exception as exc:
        return f"❌ Failed to load data: {exc}"
    rows, cols = df.shape
    mem_mb = df.memory_usage(deep=True).sum() / 1024**2

    # ── Schema --------------------------------------------------------------
    schema_md = "\n".join(
        f"- **{col}** – `{dtype}`" for col, dtype in df.dtypes.items()
    )

    # ── Missing map ---------------------------------------------------------
    miss_ct = df.isna().sum()
    miss_pct = (miss_ct / len(df) * 100).round(1)
    missing_md = (
        "\n".join(
            f"- **{c}**: {miss_ct[c]} ({miss_pct[c]} %)"
            for c in df.columns
            if miss_ct[c] > 0
        )
        or "None"
    )
    # ── Numeric describe() ---------------------------------------------------
    numeric_df = df.select_dtypes("number")
    if numeric_df.empty:
        desc_md = "_No numeric columns_"
    else:
        try:
            # requires the optional 'tabulate' package
            desc_md = numeric_df.describe().T.round(2).to_markdown()
        except ImportError:
            # graceful fallback without the extra dependency
            desc_md = (
                "```text\n"
                + numeric_df.describe().T.round(2).to_string()
                + "\n```"
            )
    # ── Assemble markdown ----------------------------------------------------
    return f"""
# 📊 Dataset Overview

| metric  | value |
| ------- | ----- |
| Rows    | {rows:,} |
| Columns | {cols} |
| Memory  | {mem_mb:.2f} MB |

## 🗂 Schema & Dtypes
{schema_md}

## 🛠 Missing Values
{missing_md}

## 📈 Descriptive Statistics (numeric)
{desc_md}
""".strip()