Spaces:
Sleeping
Sleeping
# tools/csv_parser.py
# ------------------------------------------------------------
# Reads a CSV / Excel file (sampling ultra-large CSVs), then
# returns a Markdown report:
#   ▸ dimensions          ▸ schema & dtypes
#   ▸ missing-value map   ▸ numeric describe()
#   ▸ memory footprint
# If the optional dependency **tabulate** is unavailable,
# it falls back to a plain-text table wrapped in Markdown
# code fences, so no ImportError ever reaches the UI.
from __future__ import annotations

import io
import os
from typing import Union

import numpy as np
import pandas as pd
# ╭──────────────────────────────────────────────────────────╮
# │ Helper: efficient reader with sampling for huge CSVs     │
# ╰──────────────────────────────────────────────────────────╯
def _safe_read(path: Union[str, bytes], sample_rows: int = 1_000_000) -> pd.DataFrame:
    """Load a CSV / Excel file, down-sampling CSV files larger than ``sample_rows``.

    Args:
        path: A filesystem path (``str``) or the raw CSV content as ``bytes``.
            Any other object (e.g. an already-open file-like object) is
            handed to ``pd.read_csv`` unchanged.
        sample_rows: Maximum number of data rows loaded from a CSV file given
            by path. Files with more data rows are reduced to a reproducible
            (seed 42) random sample of exactly this many rows.

    Returns:
        The loaded ``DataFrame``. Non-path input and Excel workbooks are
        always read in full.

    Note:
        Rows are counted as physical lines, so the count is only approximate
        for CSV files whose quoted fields contain embedded newlines.
    """
    if not isinstance(path, str):
        # Raw bytes are the CSV content itself; pandas needs a file-like
        # object rather than a bytes value, so wrap them in a buffer.
        if isinstance(path, (bytes, bytearray)):
            return pd.read_csv(io.BytesIO(path))
        return pd.read_csv(path)

    ext = os.path.splitext(path)[1].lower()
    if ext in (".xls", ".xlsx"):
        # openpyxl only reads the newer .xlsx format; for legacy .xls let
        # pandas select a suitable engine instead of forcing openpyxl.
        engine = "openpyxl" if ext == ".xlsx" else None
        return pd.read_excel(path, engine=engine)

    # --- CSV branch --------------------------------------------------------
    # Count physical lines in one streaming pass without parsing the file.
    with open(path, "rb") as fh:
        n_lines = sum(1 for _ in fh)
    n_data = n_lines - 1  # line 0 is the header, not a data row

    if n_data <= sample_rows:
        return pd.read_csv(path)

    # Choose which data lines (1 .. n_lines - 1) to drop so that exactly
    # ``sample_rows`` data rows remain; the header line is never skipped.
    rng = np.random.default_rng(seed=42)
    skip = rng.choice(
        np.arange(1, n_lines), size=n_data - sample_rows, replace=False
    )
    return pd.read_csv(path, skiprows=skip.tolist())
# ╭──────────────────────────────────────────────────────────╮
# │ Main public helper                                       │
# ╰──────────────────────────────────────────────────────────╯
def parse_csv_tool(path: Union[str, bytes]) -> str:
    """Return a Markdown report that Streamlit can render.

    The data is loaded through ``_safe_read``. Any exception raised while
    loading is caught and reported as a one-line error string, so this
    function always returns text instead of raising on unreadable input.

    Sections:
        * Dimensions (rows, columns, deep memory usage in MB)
        * Schema & dtypes
        * Missing-value counts (+ %), listing only columns with gaps
        * Numeric ``describe()``, rendered with ``to_markdown()`` or, when
          that raises ``ImportError``, as a fenced plain-text table

    Args:
        path: Passed unchanged to ``_safe_read``.

    Returns:
        The Markdown report, or ``"❌ Failed to load data: <error>"``.
    """
    # NOTE(review): the heading emoji, the error-prefix symbol and the dash in
    # the schema list were reconstructed from a mis-encoded copy of this file;
    # confirm them against the intended UI text. No-break spaces that were
    # invisible in the literals are written as explicit escapes below.
    try:
        df = _safe_read(path)
    except Exception as exc:  # UI boundary: surface the failure as text
        return f"❌ Failed to load data: {exc}"

    rows, cols = df.shape
    mem_mb = df.memory_usage(deep=True).sum() / 1024**2

    # ── Schema -------------------------------------------------------------
    schema_md = "\n".join(
        f"- **{col}** — `{dtype}`" for col, dtype in df.dtypes.items()
    )

    # ── Missing map --------------------------------------------------------
    miss_ct = df.isna().sum()
    miss_pct = (miss_ct / len(df) * 100).round(1)
    # Iterate positionally so duplicate column labels cannot turn a scalar
    # lookup into a Series.
    missing_md = (
        "\n".join(
            f"- **{col}**: {count}\u00a0({pct}\u202f%)"
            for col, count, pct in zip(df.columns, miss_ct, miss_pct)
            if count > 0
        )
        or "None"
    )

    # ── Numeric describe() -------------------------------------------------
    numeric_df = df.select_dtypes("number")
    # Test the column count, not ``.empty``: a frame with numeric columns but
    # zero rows is also "empty" and would be mislabelled.
    if numeric_df.shape[1] == 0:
        desc_md = "_No numeric columns_"
    else:
        stats = numeric_df.describe().T.round(2)
        try:
            # requires the optional 'tabulate' package
            desc_md = stats.to_markdown()
        except ImportError:
            # graceful fallback without the extra dependency
            desc_md = "```text\n" + stats.to_string() + "\n```"

    # ── Assemble markdown --------------------------------------------------
    # Blank lines separate the blocks so each heading, table and list is
    # parsed as its own Markdown element.
    report_lines = [
        "# 📊\u00a0Dataset Overview",
        "",
        "| metric | value |",
        "| ------ | ----- |",
        f"| Rows | {rows:,} |",
        f"| Columns| {cols} |",
        f"| Memory | {mem_mb:.2f}\u00a0MB |",
        "",
        "## 🗂\u00a0Schema & Dtypes",
        "",
        schema_md,
        "",
        "## 🛠\u00a0Missing Values",
        "",
        missing_md,
        "",
        "## 📈\u00a0Descriptive Statistics\u00a0(numeric)",
        "",
        desc_md,
    ]
    return "\n".join(report_lines).strip()