Spaces:

mgbam
/

BizIntel_AI

Sleeping

BizIntel_AI / tools /csv_parser.py

Update tools/csv_parser.py

e1d8bc9 verified about 1 month ago

1.74 kB

	import io
	import os
	from typing import Union

	import pandas as pd

	# Leading bytes used to recognise Excel payloads when only raw content is given.
	_XLSX_MAGIC = b"PK\x03\x04"  # ZIP container, the packaging used by .xlsx
	_XLS_MAGIC = b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"  # OLE2 compound file (.xls)


	def _detect_extension(file) -> str:
	    """Return the lower-cased file extension for *file*, defaulting to ``.csv``."""
	    if isinstance(file, (bytes, bytearray)):
	        # Raw content has no name, so fall back to sniffing the file signature.
	        head = bytes(file[:8])
	        if head.startswith(_XLSX_MAGIC):
	            return ".xlsx"
	        if head.startswith(_XLS_MAGIC):
	            return ".xls"
	        return ".csv"
	    try:
	        # Objects exposing ``.name`` are resolved through it; anything else
	        # is treated as the path itself.
	        ext = os.path.splitext(os.fspath(getattr(file, "name", file)))[1]
	    except (TypeError, AttributeError, ValueError):
	        # Nameless buffers (e.g. StringIO) are assumed to hold CSV text.
	        return ".csv"
	    if isinstance(ext, bytes):
	        ext = os.fsdecode(ext)
	    return ext.lower()


	def _describe_markdown(df) -> str:
	    """Render ``df.describe()`` as Markdown, degrading gracefully when needed."""
	    if df.shape[1] == 0:
	        # describe() raises ValueError on a frame without columns.
	        return "None"
	    desc = df.describe().T.round(2)
	    try:
	        return desc.to_markdown()
	    except ImportError:
	        # to_markdown() needs the optional ``tabulate`` package; fall back to
	        # a fenced plain-text table instead of failing the whole report.
	        return "```\n" + desc.to_string() + "\n```"


	def parse_csv_tool(file: Union[str, bytes]) -> str:
	    """Build a Markdown schema and statistics report for a CSV or Excel file.

	    Args:
	        file: A filesystem path, an object exposing the path as ``.name``,
	            a readable buffer, or the raw file content as ``bytes``.
	            Paths ending in ``.xls``/``.xlsx`` are read as Excel; everything
	            else is read as CSV. Raw bytes are classified by file signature.

	    Returns:
	        A Markdown report with row/column counts, memory usage, column
	        dtypes, per-column missing-value counts and descriptive statistics.
	        If the data cannot be loaded, a one-line message starting with
	        ``❌ Failed to load data`` is returned instead of raising.
	    """
	    ext = _detect_extension(file)

	    # pandas readers accept paths and buffers, not raw bytes.
	    source = io.BytesIO(file) if isinstance(file, (bytes, bytearray)) else file

	    # Load DataFrame. The broad catch is deliberate: this is a tool boundary
	    # that reports every load failure to the caller as text.
	    try:
	        if ext in (".xls", ".xlsx"):
	            # openpyxl only reads .xlsx; let pandas choose the engine for
	            # legacy .xls files.
	            engine = "openpyxl" if ext == ".xlsx" else None
	            df = pd.read_excel(source, engine=engine)
	        else:
	            df = pd.read_csv(source)
	    except Exception as e:
	        return f"❌ Failed to load data ({ext}): {e}"

	    # Basic dimensions
	    n_rows, n_cols = df.shape

	    # Schema & dtypes
	    schema_md = "\n".join(f"- {col}: {dtype}" for col, dtype in df.dtypes.items())

	    # Missing values: iterate positionally so duplicate column labels cannot
	    # turn a per-column lookup into an ambiguous Series.
	    missing = df.isna().sum()
	    missing_pct = (missing / n_rows * 100).round(1)
	    missing_lines = [
	        f"- {col}: {count} ({pct}%)"
	        for col, count, pct in zip(missing.index, missing, missing_pct)
	        if count > 0
	    ]
	    missing_md = "\n".join(missing_lines) or "None"

	    # Descriptive stats
	    desc_md = _describe_markdown(df)

	    # Memory usage (deep=True also counts the contents of object columns)
	    mem_mb = df.memory_usage(deep=True).sum() / (1024 ** 2)

	    # Assemble the report line by line so source indentation can never leak
	    # into the Markdown output.
	    report_lines = [
	        "# 📊 Dataset Overview",
	        "",
	        f"- Rows: {n_rows}",
	        f"- Columns: {n_cols}",
	        f"- Memory Usage: {mem_mb:.2f} MB",
	        "",
	        "## 🗂 Schema & Data Types",
	        schema_md,
	        "",
	        "## 🛠 Missing Values",
	        missing_md,
	        "",
	        "## 📈 Descriptive Statistics",
	        desc_md,
	    ]
	    return "\n".join(report_lines).strip()