mgbam committed on
Commit
e1d8bc9
·
verified ·
1 Parent(s): 8a0173b

Update tools/csv_parser.py

Browse files
Files changed (1) hide show
  1. tools/csv_parser.py +65 -4
tools/csv_parser.py CHANGED
@@ -1,6 +1,67 @@
1
-
2
  import pandas as pd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
- def parse_csv_tool(file_path: str) -> str:
5
- df = pd.read_csv(file_path)
6
- return f"Schema: {list(df.columns)}\n\nStats:\n{df.describe().to_string()}"
 
 
1
  import pandas as pd
2
+ from typing import Union
3
+ import os
4
+
5
def parse_csv_tool(file: Union[str, bytes]) -> str:
    """Build a Markdown overview report for a CSV or Excel file.

    The report lists the row and column counts, the in-memory size, each
    column's dtype, per-column missing-value counts, and the
    ``DataFrame.describe()`` statistics. The whole file is loaded into
    memory; no sampling is performed.

    Args:
        file: Path to the data file. ``bytes`` is decoded as a filesystem
            path. An object exposing a ``name`` attribute (for example an
            open or uploaded file) is also accepted and handed to pandas
            unchanged. Names ending in ``.xls``/``.xlsx`` are read as Excel;
            everything else is read as CSV.

    Returns:
        The Markdown report, or a one-line message starting with
        "❌ Failed to load data" when pandas cannot read the input.
    """
    # A bytes path is decoded once so that both the extension check and
    # pandas see an ordinary str path.
    source = os.fsdecode(file) if isinstance(file, bytes) else file

    # File objects carry their path in ``.name``; plain paths are used as-is.
    # Anything without a usable name (e.g. an in-memory buffer) is read as CSV.
    name = getattr(source, "name", source)
    if isinstance(name, (str, bytes, os.PathLike)):
        ext = os.path.splitext(os.fsdecode(name))[1].lower()
    else:
        ext = ".csv"

    # This is the tool boundary: any loader failure becomes a readable
    # message instead of an exception.
    try:
        if ext in (".xls", ".xlsx"):
            # Let pandas choose the engine: openpyxl cannot read legacy
            # ``.xls`` workbooks, so forcing it broke every ``.xls`` input.
            df = pd.read_excel(source)
        else:
            df = pd.read_csv(source)
    except Exception as e:
        return f"❌ Failed to load data ({ext}): {e}"

    n_rows, n_cols = df.shape

    schema_md = "\n".join(
        f"- **{col}**: {dtype}" for col, dtype in df.dtypes.items()
    )

    # Iterate positionally rather than by label so duplicated column names
    # cannot turn a lookup into a Series.
    missing = df.isna().sum()
    missing_pct = (missing / n_rows * 100).round(1)
    missing_lines = [
        f"- **{col}**: {count} ({pct}%)"
        for col, count, pct in zip(df.columns, missing, missing_pct)
        if count > 0
    ]
    missing_md = "\n".join(missing_lines) or "None"

    if n_cols == 0:
        # ``describe()`` raises ValueError on a frame without columns.
        desc_md = "None"
    else:
        desc = df.describe().T.round(2)
        try:
            desc_md = desc.to_markdown()
        except ImportError:
            # ``to_markdown`` needs the optional ``tabulate`` package; fall
            # back to a fenced plain-text table when it is not installed.
            desc_md = f"```\n{desc.to_string()}\n```"

    mem_mb = df.memory_usage(deep=True).sum() / (1024 ** 2)

    # Assemble line by line so headings always start at column 0, whatever
    # the indentation of this function.
    report_lines = [
        "# 📊 Dataset Overview",
        "",
        f"- **Rows:** {n_rows}",
        f"- **Columns:** {n_cols}",
        f"- **Memory Usage:** {mem_mb:.2f} MB",
        "",
        "## 🗂 Schema & Data Types",
        schema_md,
        "",
        "## 🛠 Missing Values",
        missing_md,
        "",
        "## 📈 Descriptive Statistics",
        desc_md,
    ]
    return "\n".join(report_lines).strip()