Spaces:
Sleeping
Sleeping
# tools/csv_parser.py
# ------------------------------------------------------------
# Reads CSV / Excel, samples very large CSV files, and returns a
# Markdown-formatted "quick-scan" report: dimensions, schema,
# missing-value profile, numeric describe(), and memory footprint.
from __future__ import annotations | |
import os | |
from typing import Union | |
import pandas as pd | |
def _safe_read(path_or_buf: Union[str, bytes], sample_rows: int = 1_000_000) -> pd.DataFrame:
    """Read a CSV or Excel source into a DataFrame, sampling very large CSV files.

    Args:
        path_or_buf: A filesystem path (``str``), raw CSV content (``bytes``),
            or any other object ``pandas.read_csv`` accepts (e.g. a file-like
            object). Only ``str`` paths are inspected for an Excel extension
            and for row-count based sampling.
        sample_rows: Maximum number of data rows to load from a CSV path.
            When the file holds more data rows than this, a uniform random
            subset of exactly ``sample_rows`` rows is read instead.

    Returns:
        The loaded (possibly sampled) DataFrame. For Excel files only the
        first sheet is read.

    Raises:
        Whatever ``open``, ``pandas.read_csv`` or ``pandas.read_excel`` raise
        (missing file, empty file, parse errors, missing Excel engine, ...).
    """
    # Function-scope imports keep this block self-contained without touching
    # the module's import header.
    import io

    import numpy as np

    # Raw bytes are CSV *content*; pandas needs a file-like object for that.
    if isinstance(path_or_buf, (bytes, bytearray)):
        return pd.read_csv(io.BytesIO(path_or_buf))

    # Other non-path inputs (file-like objects): extension and size are
    # unknown, so read them as CSV in full.
    if not isinstance(path_or_buf, str):
        return pd.read_csv(path_or_buf)

    ext = os.path.splitext(path_or_buf)[1].lower()
    if ext in (".xls", ".xlsx"):
        # Excel: read the first sheet. openpyxl only understands .xlsx;
        # for legacy .xls let pandas pick its default engine.
        engine = "openpyxl" if ext == ".xlsx" else None
        return pd.read_excel(path_or_buf, engine=engine)

    # CSV family. Count physical lines with one streaming pass; this reads
    # the whole file but keeps memory flat.
    # NOTE(review): quoted fields containing embedded newlines inflate this
    # count, in which case the sample may come out smaller than requested.
    with open(path_or_buf, "rb") as fh:
        n_lines = sum(1 for _ in fh)

    n_data = n_lines - 1  # line 0 is the header
    if n_data <= sample_rows:
        return pd.read_csv(path_or_buf)

    # Pick which data lines to drop so that exactly `sample_rows` remain.
    # Drawing from the integer population avoids materialising a full
    # permutation; "+ 1" shifts indices past the header line.
    rng = np.random.default_rng()
    n_skip = n_data - sample_rows
    skip = np.sort(rng.choice(n_data, size=n_skip, replace=False, shuffle=False) + 1)
    return pd.read_csv(path_or_buf, skiprows=skip.tolist())
def parse_csv_tool(file: Union[str, bytes]) -> str:
    """
    Build a **Markdown** quick-scan report for a tabular data file.

    The input is loaded through ``_safe_read``. If loading raises, the
    exception text is returned inside a one-line failure message instead
    of propagating to the caller.

    Report contents, in order:
      • Overview table – row count, column count, deep memory usage (MB)
      • Schema – one bullet per column with its dtype
      • Missing values – count and percentage, only for columns with gaps
      • Descriptive statistics – transposed ``describe()``, rounded to 2 dp
    """
    try:
        frame = _safe_read(file)
    except Exception as exc:
        return f"❌ Failed to load data: {exc}"

    row_count, col_count = frame.shape

    # ---------- schema ----------
    schema_lines = [
        f"- **{col}** – `{dtype}`"
        for col, dtype in frame.dtypes.items()
    ]
    schema_md = "\n".join(schema_lines)

    # ---------- missing ----------
    null_counts = frame.isna().sum()
    null_share = (null_counts / len(frame) * 100).round(1)
    missing_lines = []
    for c in frame.columns:
        if null_counts[c] > 0:
            missing_lines.append(f"- **{c}**: {null_counts[c]} ({null_share[c]} %)")
    # Fully populated frames get a literal "None" instead of an empty section.
    missing_md = "\n".join(missing_lines) if missing_lines else "None"

    # ---------- descriptive stats (numeric only) ----------
    has_numeric = frame.select_dtypes("number").shape[1] > 0
    desc_md = (
        frame.describe().T.round(2).to_markdown()
        if has_numeric
        else "_No numeric columns_"
    )

    # ---------- memory ----------
    mem_mb = frame.memory_usage(deep=True).sum() / 1024**2

    # ---------- assemble ----------
    # Sections are joined line by line with no blank separators.
    report_lines = [
        "# 📊 Dataset Overview",
        "| metric | value |",
        "| ------ | ----- |",
        f"| Rows | {row_count:,} |",
        f"| Columns| {col_count} |",
        f"| Memory | {mem_mb:.2f} MB |",
        "## 🗂 Schema",
        schema_md,
        "## 🛠 Missing Values",
        missing_md,
        "## 📈 Descriptive Statistics (numeric)",
        desc_md,
    ]
    return "\n".join(report_lines).strip()