Spaces:

mgbam
/

BizIntel_AI

Sleeping

App Files Files Community

BizIntel_AI / tools /csv_parser.py

mgbam

Update tools/csv_parser.py

b64c41b verified about 1 month ago

raw

history blame

2.92 kB

	# tools/csv_parser.py
	# ------------------------------------------------------------
	# Reads CSV / Excel, samples for very large files, and returns a
	# Markdown‑formatted “quick‑scan” report: dimensions, schema,
	# missing‑value profile, numeric describe(), and memory footprint.

	from __future__ import annotations

	import os
	from typing import Union

	import pandas as pd


	def _safe_read(path_or_buf: Union[str, bytes], sample_rows: int = 1_000_000) -> pd.DataFrame:
	    """Load a CSV or Excel source into a DataFrame, sampling oversized CSVs.

	    Args:
	        path_or_buf: A filesystem path (``str``) or the raw CSV content as
	            ``bytes``. Any other object (e.g. an open file handle) is handed
	            to ``pandas.read_csv`` unchanged and is never sampled.
	        sample_rows: Maximum number of data rows to keep for CSV input. When
	            the source has more, a uniform random subset of this size is
	            read instead of the whole file. Negative values are treated as 0.

	    Returns:
	        The loaded (possibly sampled) DataFrame. For Excel files only the
	        first sheet is read and no sampling is applied.

	    Raises:
	        OSError: If a path cannot be opened.
	        Exception: Whatever ``pandas.read_csv`` / ``pandas.read_excel`` raise
	            for malformed or empty input.
	    """
	    # Function-scope imports keep this block self-contained; numpy is already
	    # a hard dependency of pandas, so this adds no new requirement.
	    import io

	    import numpy as np

	    is_path = isinstance(path_or_buf, str)
	    in_memory = isinstance(path_or_buf, (bytes, bytearray))

	    # Determine extension (best-effort); anything that is not a path is
	    # treated as CSV.
	    ext = os.path.splitext(path_or_buf)[1].lower() if is_path else ".csv"

	    if ext in (".xls", ".xlsx"):
	        # Excel — read first sheet. openpyxl only understands .xlsx, so let
	        # pandas choose the engine for legacy .xls workbooks.
	        engine = "openpyxl" if ext == ".xlsx" else None
	        return pd.read_excel(path_or_buf, engine=engine)

	    # ---- CSV family ----
	    # Count physical lines with one streaming pass (constant memory). This is
	    # an approximation of the row count: quoted fields containing newlines
	    # span several physical lines.
	    if is_path:
	        with open(path_or_buf, "rb") as handle:
	            n_lines = sum(1 for _ in handle)
	    elif in_memory:
	        n_lines = sum(1 for _ in io.BytesIO(path_or_buf))
	    else:
	        n_lines = None  # unknown source type: cannot count without consuming it

	    def _source():
	        # pandas cannot read raw bytes directly; give it a fresh stream.
	        return io.BytesIO(path_or_buf) if in_memory else path_or_buf

	    # Line 0 is the header, so it is not a data row.
	    n_data = n_lines - 1 if n_lines else 0
	    keep = max(sample_rows, 0)
	    if n_data <= keep:
	        return pd.read_csv(_source())

	    # Sample uniformly without loading everything: choose which data lines
	    # (1 .. n_lines-1) to skip so that exactly `keep` rows remain.
	    skip = np.random.default_rng().choice(
	        np.arange(1, n_lines), size=n_data - keep, replace=False
	    )
	    return pd.read_csv(_source(), skiprows=skip.tolist())


	def parse_csv_tool(file: Union[str, bytes]) -> str:
	    """Return a Markdown report describing the dataset.

	    Sections:
	        • Dimensions and memory usage (summary table)
	        • Schema (+ dtypes)
	        • Missing-value counts + %
	        • Numeric descriptive statistics

	    Args:
	        file: Path to a CSV/Excel file, or raw CSV bytes; forwarded to
	            ``_safe_read``.

	    Returns:
	        The Markdown report, or a single line starting with "❌" describing
	        why the data could not be loaded. Load errors are reported as text
	        rather than raised.
	    """
	    try:
	        df = _safe_read(file)
	    except Exception as exc:  # tool boundary: surface the failure as text
	        return f"❌ Failed to load data: {exc}"

	    n_rows, n_cols = df.shape

	    # ---------- schema ----------
	    schema_md = "\n".join(
	        f"- {col} – `{dtype}`" for col, dtype in df.dtypes.items()
	    )

	    # ---------- missing ----------
	    miss_ct = df.isna().sum()
	    miss_pct = (miss_ct / n_rows * 100).round(1)
	    # Iterate positionally instead of indexing by label: with duplicated
	    # column names a label lookup returns a Series, whose truth value is
	    # ambiguous and would raise.
	    missing_md = "\n".join(
	        f"- {col}: {count} ({pct} %)"
	        for (col, count), pct in zip(miss_ct.items(), miss_pct)
	        if count > 0
	    ) or "None"

	    # ---------- descriptive stats (numeric only) ----------
	    numeric = df.select_dtypes("number")
	    if numeric.shape[1]:
	        stats = numeric.describe().T.round(2)
	        try:
	            desc_md = stats.to_markdown()
	        except ImportError:
	            # to_markdown() needs the optional `tabulate` package; fall back
	            # to a plain-text table inside a code fence.
	            desc_md = f"```\n{stats.to_string()}\n```"
	    else:
	        desc_md = "_No numeric columns_"

	    # ---------- memory ----------
	    mem_mb = df.memory_usage(deep=True).sum() / 1024**2

	    # ---------- assemble ----------
	    # Built line by line so the output does not depend on how this source
	    # file is indented, and so the table pipes are emitted unescaped.
	    report = [
	        "# 📊 Dataset Overview",
	        "",
	        "| metric | value |",
	        "| ------ | ----- |",
	        f"| Rows | {n_rows:,} |",
	        f"| Columns| {n_cols} |",
	        f"| Memory | {mem_mb:.2f} MB |",
	        "",
	        "## 🗂 Schema",
	        schema_md,
	        "",
	        "## 🛠 Missing Values",
	        missing_md,
	        "",
	        "## 📈 Descriptive Statistics (numeric)",
	        desc_md,
	    ]
	    return "\n".join(report).strip()