import os
import tempfile
from typing import Optional, Union, Tuple, List

import pandas as pd
import numpy as np
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
from scipy.cluster.hierarchy import linkage, leaves_list


def _safe_prefix(prefix: str) -> str:
    """Return *prefix* with every character that is unsafe in a file name replaced by '_'."""
    # Column names are interpolated into prefixes; a '/' or similar would
    # otherwise be interpreted as a directory component by tempfile.
    return "".join(ch if ch.isalnum() or ch in "-_." else "_" for ch in prefix)


def _save_fig(fig: go.Figure, prefix: str, output_dir: str) -> str:
    """Save a Plotly figure as a high-res PNG and return the file path.

    Args:
        fig: Figure to export.
        prefix: File-name prefix; unsafe characters are replaced by '_'.
        output_dir: Directory to write into; created if it does not exist.

    Returns:
        Path of the PNG file that was written.

    Raises:
        Exception: Whatever ``fig.write_image`` raises; the placeholder file
            reserved for the image is removed before re-raising.
    """
    os.makedirs(output_dir, exist_ok=True)
    # Reserve a unique file name; the handle is closed by the context manager
    # so write_image can reopen the path itself.
    with tempfile.NamedTemporaryFile(
        suffix='.png', prefix=_safe_prefix(prefix), dir=output_dir, delete=False
    ) as tmp:
        path = tmp.name
    try:
        fig.write_image(path, scale=3)
    except Exception:
        # Do not leave an empty placeholder behind when the export fails.
        if os.path.exists(path):
            os.remove(path)
        raise
    return path


def _load_table(file_path: str) -> pd.DataFrame:
    """Read *file_path* with ``read_excel`` for .xls/.xlsx, otherwise with ``read_csv``."""
    ext = os.path.splitext(file_path)[1].lower()
    if ext in ('.xls', '.xlsx'):
        return pd.read_excel(file_path)
    return pd.read_csv(file_path)


def _numeric_series(file_path: str, column: str) -> Union[pd.Series, str]:
    """Load *column* from *file_path* as a numeric Series, or return an error string.

    Values that cannot be parsed as numbers are coerced to NaN and dropped.
    """
    df = _load_table(file_path)
    if column not in df.columns:
        return f"❌ Column '{column}' not found."
    series = pd.to_numeric(df[column], errors='coerce').dropna()
    if series.empty:
        return f"❌ No numeric data in '{column}'."
    return series


def histogram_tool(
    file_path: str,
    column: str,
    bins: int = 30,
    kde: bool = True,
    output_dir: str = '/tmp'
) -> Union[Tuple[go.Figure, str], str]:
    """Create a histogram with optional KDE overlay for a given numeric column.

    Args:
        file_path: CSV or Excel (.xls/.xlsx) file to read.
        column: Name of the column to plot.
        bins: Number of bins; must be positive when ``kde`` is True.
        kde: When True, draw a distplot (histogram + KDE curve + rug).
        output_dir: Directory the PNG is written to.

    Returns:
        ``(figure, png_path)`` on success, or an error string starting with
        "❌" when validation fails.
    """
    series = _numeric_series(file_path, column)
    if isinstance(series, str):
        return series

    title = f"Histogram – {column}"
    if kde:
        if bins < 1:
            # The KDE bin width is range / bins, so bins must be positive.
            return "❌ 'bins' must be a positive integer."
        value_range = series.max() - series.min()
        # A KDE needs at least two distinct finite values; a zero range would
        # also produce bin_size == 0. Fall back to a plain histogram then.
        use_kde = bool(np.isfinite(value_range) and value_range > 0)
    else:
        use_kde = False

    if use_kde:
        fig = ff.create_distplot(
            [series.tolist()], [column], bin_size=float(value_range) / bins
        )
        fig.update_layout(title=title)
    else:
        fig = px.histogram(series, nbins=bins, title=title, template='plotly_dark')
    fig.update_layout(template='plotly_dark')

    img_path = _save_fig(fig, f"hist_{column}_", output_dir)
    return fig, img_path


def boxplot_tool(
    file_path: str,
    column: str,
    output_dir: str = '/tmp'
) -> Union[Tuple[go.Figure, str], str]:
    """Create a box plot with outliers for a numeric column.

    Args:
        file_path: CSV or Excel (.xls/.xlsx) file to read.
        column: Name of the column to plot.
        output_dir: Directory the PNG is written to.

    Returns:
        ``(figure, png_path)`` on success, or an error string.
    """
    series = _numeric_series(file_path, column)
    if isinstance(series, str):
        return series
    fig = px.box(series, points='outliers', title=f"Boxplot – {column}", template='plotly_dark')
    img_path = _save_fig(fig, f"box_{column}_", output_dir)
    return fig, img_path


def violin_tool(
    file_path: str,
    column: str,
    output_dir: str = '/tmp'
) -> Union[Tuple[go.Figure, str], str]:
    """Create a violin plot with inner box for a numeric column.

    Args:
        file_path: CSV or Excel (.xls/.xlsx) file to read.
        column: Name of the column to plot.
        output_dir: Directory the PNG is written to.

    Returns:
        ``(figure, png_path)`` on success, or an error string.
    """
    series = _numeric_series(file_path, column)
    if isinstance(series, str):
        return series
    fig = px.violin(series, box=True, points='all', title=f"Violin – {column}", template='plotly_dark')
    img_path = _save_fig(fig, f"violin_{column}_", output_dir)
    return fig, img_path


def scatter_matrix_tool(
    file_path: str,
    columns: List[str],
    output_dir: str = '/tmp',
    size: int = 5
) -> Union[Tuple[go.Figure, str], str]:
    """Create an interactive scatter matrix for selected numeric columns.

    Args:
        file_path: CSV or Excel (.xls/.xlsx) file to read.
        columns: Columns to include as matrix dimensions.
        output_dir: Directory the PNG is written to.
        size: Marker size.

    Returns:
        ``(figure, png_path)`` on success, or an error string.
    """
    df = _load_table(file_path)
    missing = [c for c in columns if c not in df.columns]
    if missing:
        # map(str, ...) so non-string labels do not break the join.
        return f"❌ Missing columns: {', '.join(map(str, missing))}"
    # Rows with any non-numeric value in the selected columns are dropped.
    df_num = df[columns].apply(pd.to_numeric, errors='coerce').dropna()
    if df_num.empty:
        return "❌ No valid numeric data."
    fig = px.scatter_matrix(df_num, dimensions=columns, title="Scatter Matrix", template='plotly_dark')
    fig.update_traces(diagonal_visible=False, marker={'size': size})
    img_path = _save_fig(fig, "scatter_matrix_", output_dir)
    return fig, img_path


def corr_heatmap_tool(
    file_path: str,
    columns: Optional[List[str]] = None,
    output_dir: str = '/tmp',
    cluster: bool = True
) -> Union[Tuple[go.Figure, str], str]:
    """Create a correlation heatmap, with optional hierarchical clustering of variables.

    Args:
        file_path: CSV or Excel (.xls/.xlsx) file to read.
        columns: Columns to correlate; ``None`` selects every numeric column.
        output_dir: Directory the PNG is written to.
        cluster: When True, reorder variables by average-linkage clustering
            of the rows of the correlation matrix.

    Returns:
        ``(figure, png_path)`` on success, or an error string.
    """
    df = _load_table(file_path)
    if columns is None:
        df_num = df.select_dtypes(include='number')
    else:
        # Validate up front so a typo yields an error string, not a KeyError.
        missing = [c for c in columns if c not in df.columns]
        if missing:
            return f"❌ Missing columns: {', '.join(map(str, missing))}"
        df_num = df[columns]
    df_num = df_num.apply(pd.to_numeric, errors='coerce').dropna(axis=1, how='all')
    if df_num.shape[1] < 2:
        return "❌ Need at least two numeric columns for correlation."

    corr = df_num.corr()
    if cluster:
        # Constant columns give NaN correlations, which linkage() rejects.
        # Treat them as 0 for ordering only; the plotted matrix keeps its NaNs.
        link = linkage(corr.fillna(0.0).to_numpy(), method='average')
        order = leaves_list(link)
        corr = corr.iloc[order, order]

    fig = px.imshow(
        corr,
        color_continuous_scale='RdBu',
        # Pin the range so zero correlation sits at the midpoint of the
        # diverging scale regardless of the data.
        zmin=-1,
        zmax=1,
        title="Correlation Heatmap",
        labels=dict(color="Correlation"),
        template='plotly_dark'
    )
    img_path = _save_fig(fig, "corr_heatmap_", output_dir)
    return fig, img_path