Spaces:
Sleeping
Sleeping
File size: 2,923 Bytes
b64c41b e1d8bc9 b64c41b e1d8bc9 b64c41b e1d8bc9 b64c41b e1d8bc9 b64c41b e1d8bc9 b64c41b e1d8bc9 b64c41b e1d8bc9 b64c41b e1d8bc9 b64c41b e1d8bc9 b64c41b e1d8bc9 b64c41b e1d8bc9 b64c41b e1d8bc9 b64c41b e1d8bc9 b64c41b e1d8bc9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 |
# tools/csv_parser.py
# ------------------------------------------------------------
# Reads CSV / Excel, samples for very large files, and returns a
# Markdown‑formatted “quick‑scan” report: dimensions, schema,
# missing‑value profile, numeric describe(), and memory footprint.
from __future__ import annotations
import os
from typing import Union
import pandas as pd
def _safe_read(path_or_buf: Union[str, bytes], sample_rows: int = 1_000_000) -> pd.DataFrame:
    """Load a CSV or Excel source into a DataFrame, sampling very large CSVs.

    Args:
        path_or_buf: A filesystem path (``str``) or the raw file content
            (``bytes``). Raw bytes carry no file extension, so they are
            always parsed as CSV.
        sample_rows: Maximum number of data rows loaded from a CSV given by
            path. A file with more data rows is reduced to a uniform random
            sample of exactly ``sample_rows`` rows (original order kept).
            Excel files and in-memory bytes are always read in full.

    Returns:
        The parsed ``DataFrame``.

    Raises:
        OSError: If the path cannot be opened for the line count.
        Exception: Any error raised by the underlying pandas reader.
    """
    # Local imports keep this helper self-contained; numpy is already a
    # hard dependency of pandas.
    import io

    import numpy as np

    # Raw bytes: pandas cannot parse a bytes object directly, wrap it.
    if isinstance(path_or_buf, (bytes, bytearray)):
        return pd.read_csv(io.BytesIO(path_or_buf))

    # Determine extension (best-effort); anything unknown is treated as CSV.
    ext = ".csv"
    if isinstance(path_or_buf, str):
        ext = os.path.splitext(path_or_buf)[1].lower()

    if ext in (".xls", ".xlsx"):
        # Excel: read the first sheet. openpyxl only understands the newer
        # .xlsx format, so for legacy .xls let pandas choose its own engine.
        engine = "openpyxl" if ext == ".xlsx" else None
        return pd.read_excel(path_or_buf, engine=engine)

    if isinstance(path_or_buf, str):
        # Count physical lines with a full (streaming) pass over the file.
        # NOTE: quoted fields containing newlines make this an over-estimate
        # of the record count, so the sample size is then approximate.
        with open(path_or_buf, "rb") as handle:
            total_lines = sum(1 for _ in handle)

        data_rows = total_lines - 1  # line 0 is the header
        if data_rows > sample_rows:
            # Pick the data lines to drop (1..total_lines-1, never the
            # header) so that exactly `sample_rows` rows remain, without
            # loading the whole file into memory.
            drop = np.random.choice(
                np.arange(1, total_lines),
                size=data_rows - sample_rows,
                replace=False,
            )
            return pd.read_csv(path_or_buf, skiprows=np.sort(drop).tolist())

    return pd.read_csv(path_or_buf)
def parse_csv_tool(file: Union[str, bytes]) -> str:
    """Build a Markdown quick-scan report for a tabular dataset.

    The input is loaded through ``_safe_read``. If loading raises, the
    exception is not propagated; a one-line failure message is returned
    instead.

    The report contains, in order:
      * an overview table (row count, column count, deep memory usage in MB)
      * the schema: one bullet per column with its dtype
      * missing values: count and percentage for each column that has any,
        or ``None`` when no column has missing values
      * ``describe()`` statistics rendered as a Markdown table, or a
        placeholder when the frame has no numeric columns
    """
    try:
        frame = _safe_read(file)
    except Exception as exc:
        return f"❌ Failed to load data: {exc}"

    row_count, col_count = frame.shape

    # ---------- schema ----------
    schema_lines = []
    for name, kind in frame.dtypes.items():
        schema_lines.append(f"- **{name}** – `{kind}`")
    schema_md = "\n".join(schema_lines)

    # ---------- missing ----------
    null_counts = frame.isna().sum()
    null_share = (null_counts / len(frame) * 100).round(1)
    missing_lines = []
    for name in frame.columns:
        if null_counts[name] > 0:
            missing_lines.append(
                f"- **{name}**: {null_counts[name]} ({null_share[name]} %)"
            )
    missing_md = "\n".join(missing_lines) if missing_lines else "None"

    # ---------- descriptive stats (numeric only) ----------
    numeric_col_count = frame.select_dtypes("number").shape[1]
    if not numeric_col_count:
        desc_md = "_No numeric columns_"
    else:
        summary = frame.describe().T
        desc_md = summary.round(2).to_markdown()

    # ---------- memory ----------
    bytes_used = frame.memory_usage(deep=True).sum()
    mem_mb = bytes_used / 1024**2

    # ---------- assemble ----------
    report_lines = [
        "# 📊 Dataset Overview",
        "| metric | value |",
        "| ------ | ----- |",
        f"| Rows | {row_count:,} |",
        f"| Columns| {col_count} |",
        f"| Memory | {mem_mb:.2f} MB |",
        "## 🗂 Schema",
        f"{schema_md}",
        "## 🛠 Missing Values",
        f"{missing_md}",
        "## 📈 Descriptive Statistics (numeric)",
        f"{desc_md}",
    ]
    # Trim surrounding whitespace of the whole report, as the sections
    # themselves may begin or end with blank space.
    return "\n".join(report_lines).strip()
|