# tools/csv_parser.py # ------------------------------------------------------------ # Reads CSV / Excel, samples for very large files, and returns a # Markdown‑formatted “quick‑scan” report: dimensions, schema, # missing‑value profile, numeric describe(), and memory footprint. from __future__ import annotations import os from typing import Union import pandas as pd def _safe_read(path_or_buf: Union[str, bytes], sample_rows: int = 1_000_000) -> pd.DataFrame: """Read CSV or Excel. If the file has > sample_rows, read only a sample.""" # Determine extension (best‑effort) ext = ".csv" if isinstance(path_or_buf, str): ext = os.path.splitext(path_or_buf)[1].lower() if ext in (".xls", ".xlsx"): # Excel — read first sheet df = pd.read_excel(path_or_buf, engine="openpyxl") else: # CSV family # First row‑count check: pandas 1.5+ uses memory map ⇒ cheap for header only nrows_total = sum(1 for _ in open(path_or_buf, "rb")) if isinstance(path_or_buf, str) else None if nrows_total and nrows_total > sample_rows: # sample uniformly without loading everything skip = sorted( pd.np.random.choice(range(1, nrows_total), nrows_total - sample_rows, replace=False) ) df = pd.read_csv(path_or_buf, skiprows=skip) else: df = pd.read_csv(path_or_buf) return df def parse_csv_tool(file: Union[str, bytes]) -> str: """ Return a **Markdown** report describing the dataset. Sections: • Dimensions • Schema (+ dtypes) • Missing‑value counts + % • Numeric descriptive statistics • Memory usage """ try: df = _safe_read(file) except Exception as exc: return f"❌ Failed to load data: {exc}" n_rows, n_cols = df.shape # ---------- schema ---------- schema_md = "\n".join( f"- **{col}** – `{dtype}`" for col, dtype in df.dtypes.items() ) # ---------- missing ---------- miss_ct = df.isna().sum() miss_pct = (miss_ct / len(df) * 100).round(1) missing_md = "\n".join( f"- **{c}**: {miss_ct[c]} ({miss_pct[c]} %)" for c in df.columns if miss_ct[c] > 0 ) or "None" # ---------- descriptive stats (numeric only) ---------- if df.select_dtypes("number").shape[1]: desc_md = df.describe().T.round(2).to_markdown() else: desc_md = "_No numeric columns_" # ---------- memory ---------- mem_mb = df.memory_usage(deep=True).sum() / 1024**2 # ---------- assemble ---------- return f""" # 📊 Dataset Overview | metric | value | | ------ | ----- | | Rows | {n_rows:,} | | Columns| {n_cols} | | Memory | {mem_mb:.2f} MB | ## 🗂 Schema {schema_md} ## 🛠 Missing Values {missing_md} ## 📈 Descriptive Statistics (numeric) {desc_md} """.strip()