Spaces:
Sleeping
Sleeping
File size: 4,373 Bytes
b64c41b 7453b19 b64c41b e1d8bc9 b64c41b 7453b19 b64c41b 7453b19 b64c41b 7453b19 b64c41b 7453b19 b64c41b 7453b19 e1d8bc9 7453b19 e1d8bc9 7453b19 e1d8bc9 b64c41b 7453b19 b64c41b e1d8bc9 7453b19 b64c41b e1d8bc9 7453b19 e1d8bc9 7453b19 b64c41b 7453b19 b64c41b e1d8bc9 7453b19 b64c41b 7453b19 e1d8bc9 7453b19 e1d8bc9 7453b19 b64c41b e1d8bc9 b64c41b 7453b19 b64c41b e1d8bc9 7453b19 e1d8bc9 b64c41b e1d8bc9 7453b19 e1d8bc9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 |
# tools/csv_parser.py
# ------------------------------------------------------------
# Reads a CSV / Excel file (sampling ultra-large CSVs), then
# returns a Markdown report:
#   ▸ dimensions          ▸ schema & dtypes
#   ▸ missing-value map   ▸ numeric describe()
#   ▸ memory footprint
# If the optional dependency **tabulate** is unavailable,
# it falls back to a plain-text table wrapped in Markdown
# code fences, so no ImportError ever reaches the UI.
from __future__ import annotations
import os
from typing import Union
import numpy as np
import pandas as pd
# ╭──────────────────────────────────────────────────────╮
# │ Helper: efficient reader with sampling for huge CSVs │
# ╰──────────────────────────────────────────────────────╯
def _safe_read(path: Union[str, bytes], sample_rows: int = 1_000_000) -> pd.DataFrame:
    """Load a CSV or Excel file, down-sampling CSVs that exceed ``sample_rows``.

    Args:
        path: Location of the data. A ``str`` (or ``os.PathLike``) is
            inspected for its extension: ``.xls`` / ``.xlsx`` go to
            ``pandas.read_excel``, everything else to ``pandas.read_csv``.
            Any other object is passed to ``pandas.read_csv`` unchanged and
            is never sampled.
        sample_rows: Maximum number of data rows (header excluded) to load
            from a local CSV file. Larger files are reduced to exactly this
            many rows, chosen uniformly with a fixed seed so repeated calls
            return the same sample.

    Returns:
        The loaded, possibly sampled, DataFrame.

    Raises:
        ValueError: If ``sample_rows`` is negative.
        Exception: Whatever ``pandas.read_csv`` / ``pandas.read_excel`` raise
            for unreadable input (missing file, bad format, ...).
    """
    if sample_rows < 0:
        raise ValueError("sample_rows must be non-negative")

    # Normalise pathlib-style paths so extension detection also works for them.
    if isinstance(path, os.PathLike):
        path = os.fspath(path)

    is_str = isinstance(path, str)
    ext = os.path.splitext(path)[1].lower() if is_str else ".csv"

    if ext in (".xls", ".xlsx"):
        # openpyxl cannot read the legacy .xls format, so only force it for
        # .xlsx and let pandas choose the engine for .xls.
        engine = "openpyxl" if ext == ".xlsx" else None
        return pd.read_excel(path, engine=engine)

    # --- CSV branch --------------------------------------------------------
    n_data = None
    if is_str and os.path.isfile(path):
        # Stream the file once to count physical lines without loading it.
        # Quoted fields containing newlines make this an over-estimate.
        with open(path, "rb") as fh:
            n_lines = sum(1 for _ in fh)
        n_data = max(n_lines - 1, 0)  # the first line is the header
    # Anything that is not a local file (URL, buffer, ...) is read in full.

    if n_data is not None and n_data > sample_rows:
        # Pick the data lines to drop (1-based; line 0 is the header) so that
        # exactly `sample_rows` rows are parsed.
        rng = np.random.default_rng(seed=42)
        n_skip = n_data - sample_rows
        skip = (rng.choice(n_data, size=n_skip, replace=False) + 1).tolist()
        return pd.read_csv(path, skiprows=skip)

    return pd.read_csv(path)
# ╭──────────────────────────────────────────────────────╮
# │ Main public helper                                   │
# ╰──────────────────────────────────────────────────────╯
def parse_csv_tool(path: Union[str, bytes]) -> str:
    """Return a Markdown overview of a CSV / Excel dataset for Streamlit.

    Sections, in order:
      * Dimensions and memory usage
      * Schema & dtypes
      * Missing-value counts (+ %)
      * Numeric ``describe()``

    All figures describe the frame returned by ``_safe_read``, which may be
    a sample of a very large CSV rather than the whole file.

    Args:
        path: File location, forwarded unchanged to ``_safe_read``.

    Returns:
        The Markdown report, or a one-line failure message when the data
        cannot be loaded. Loading errors are never raised to the caller.
    """
    try:
        df = _safe_read(path)
    except Exception as exc:  # UI boundary: report the failure as text
        return f"❌ Failed to load data: {exc}"

    rows, cols = df.shape
    mem_mb = df.memory_usage(deep=True).sum() / 1024**2

    # -- Schema -------------------------------------------------------------
    schema_md = "\n".join(
        f"- **{col}** → `{dtype}`" for col, dtype in df.dtypes.items()
    )

    # -- Missing map --------------------------------------------------------
    miss_ct = df.isna().sum()
    miss_pct = (miss_ct / len(df) * 100).round(1)
    # Walk the columns positionally: a label lookup such as miss_ct[name]
    # returns a Series when column names repeat, which breaks the `> 0` test.
    missing_md = (
        "\n".join(
            f"- **{col}**: {count}\u00a0({pct}\u202f%)"
            for col, count, pct in zip(
                df.columns, miss_ct.tolist(), miss_pct.tolist()
            )
            if count > 0
        )
        or "None"
    )

    # -- Numeric describe() -------------------------------------------------
    numeric_df = df.select_dtypes("number")
    if numeric_df.empty:
        desc_md = "_No numeric columns_"
    else:
        stats = numeric_df.describe().T.round(2)
        try:
            # to_markdown() requires the optional 'tabulate' package
            desc_md = stats.to_markdown()
        except ImportError:
            # graceful fallback without the extra dependency
            desc_md = "```text\n" + stats.to_string() + "\n```"

    # -- Assemble markdown --------------------------------------------------
    # Blank lines between blocks keep headings, lists and tables from being
    # merged by the Markdown renderer.
    report_lines = [
        "# 📊\u00a0Dataset Overview",
        "",
        "| metric | value |",
        "| ------ | ----- |",
        f"| Rows | {rows:,} |",
        f"| Columns| {cols} |",
        f"| Memory | {mem_mb:.2f}\u00a0MB |",
        "",
        "## 🗂\u00a0Schema & Dtypes",
        "",
        schema_md,
        "",
        "## 🛠\u00a0Missing Values",
        "",
        missing_md,
        "",
        "## 📈\u00a0Descriptive Statistics\u00a0(numeric)",
        "",
        desc_md,
    ]
    return "\n".join(report_lines).strip()
|