# Spaces: Sleeping (status banner captured from the hosting page; not part of the module)
import os | |
import tempfile | |
import pandas as pd | |
import numpy as np | |
import plotly.express as px | |
import plotly.figure_factory as ff | |
import plotly.graph_objects as go | |
from scipy.cluster.hierarchy import linkage, leaves_list | |
from typing import Union, Tuple, List | |
def _save_fig(fig: go.Figure, prefix: str, output_dir: str) -> str:
    """
    Save a Plotly figure as a high-res PNG and return the file path.

    Args:
        fig: Figure to export; only its ``write_image`` method is used.
        prefix: Leading part of the generated file name. Characters other
            than letters, digits, ``-``, ``_`` and ``.`` are replaced by
            ``_`` before use.
        output_dir: Directory that receives the PNG; created if missing.

    Returns:
        Path of the PNG that was written.

    Raises:
        Whatever ``fig.write_image`` raises; the placeholder file is removed
        before the exception propagates.
    """
    os.makedirs(output_dir, exist_ok=True)
    # Callers build the prefix from column names. A path separator in it
    # would point the temp file at a sub-directory that does not exist.
    safe_prefix = "".join(
        ch if ch.isalnum() or ch in "-_." else "_" for ch in prefix
    )
    # Reserve a unique name, then release the handle so the exporter can
    # write to the path itself.
    with tempfile.NamedTemporaryFile(
        suffix='.png', prefix=safe_prefix, dir=output_dir, delete=False
    ) as tmp:
        path = tmp.name
    try:
        fig.write_image(path, scale=3)
    except Exception:
        # Do not leave an empty placeholder behind when the export fails.
        try:
            os.remove(path)
        except OSError:
            pass  # best-effort cleanup; the export error is what matters
        raise
    return path
def histogram_tool(
    file_path: str,
    column: str,
    bins: int = 30,
    kde: bool = True,
    output_dir: str = '/tmp'
) -> "Union[Tuple[go.Figure, str], str]":
    """
    Create a histogram with optional KDE overlay for a given numeric column.

    Args:
        file_path: Path to the data file. ``.xls``/``.xlsx`` is read with
            ``pd.read_excel``; anything else with ``pd.read_csv``.
        column: Name of the column to plot. Values are coerced to numeric
            and non-numeric entries are dropped.
        bins: Number of bins. Must be at least 1 when ``kde`` is enabled,
            because the KDE path derives its bin width from it.
        kde: Overlay a kernel density estimate. When the column holds fewer
            than two distinct values a density cannot be estimated, so a
            plain histogram is drawn instead.
        output_dir: Directory that receives the PNG export.

    Returns:
        ``(figure, png_path)`` on success, or an error string.
    """
    # Load
    ext = os.path.splitext(file_path)[1].lower()
    df = pd.read_excel(file_path) if ext in ('.xls', '.xlsx') else pd.read_csv(file_path)
    # Validate
    if column not in df.columns:
        return f"β Column '{column}' not found."
    series = pd.to_numeric(df[column], errors='coerce').dropna()
    if series.empty:
        return f"β No numeric data in '{column}'."
    if kde and bins < 1:
        return "β 'bins' must be a positive integer when kde is enabled."
    # Build histogram + KDE
    title = f"Histogram β {column}"
    value_range = series.max() - series.min()
    # A zero range would give bin_size=0 and a singular density estimate,
    # so the KDE overlay is only attempted when the data actually varies.
    if kde and value_range > 0:
        fig = ff.create_distplot([series], [column], bin_size=value_range / bins)
        fig.update_layout(title=title)
    else:
        fig = px.histogram(series, nbins=bins, title=title, template='plotly_dark')
    fig.update_layout(template='plotly_dark')
    # Save
    img_path = _save_fig(fig, f"hist_{column}_", output_dir)
    return fig, img_path
def boxplot_tool(
    file_path: str,
    column: str,
    output_dir: str = '/tmp'
) -> "Union[Tuple[go.Figure, str], str]":
    """
    Create a box plot with outliers for a numeric column.

    Args:
        file_path: Path to the data file. ``.xls``/``.xlsx`` is read with
            ``pd.read_excel``; anything else with ``pd.read_csv``.
        column: Name of the column to plot. Values are coerced to numeric
            and non-numeric entries are dropped.
        output_dir: Directory that receives the PNG export.

    Returns:
        ``(figure, png_path)`` on success, or an error string when the
        column is missing or holds no numeric data.
    """
    # Load
    ext = os.path.splitext(file_path)[1].lower()
    df = pd.read_excel(file_path) if ext in ('.xls', '.xlsx') else pd.read_csv(file_path)
    # Validate
    if column not in df.columns:
        return f"β Column '{column}' not found."
    series = pd.to_numeric(df[column], errors='coerce').dropna()
    if series.empty:
        return f"β No numeric data in '{column}'."
    # Plot and save
    fig = px.box(series, points='outliers', title=f"Boxplot β {column}", template='plotly_dark')
    img_path = _save_fig(fig, f"box_{column}_", output_dir)
    return fig, img_path
def violin_tool(
    file_path: str,
    column: str,
    output_dir: str = '/tmp'
) -> "Union[Tuple[go.Figure, str], str]":
    """
    Create a violin plot with inner box for a numeric column.

    Args:
        file_path: Path to the data file. ``.xls``/``.xlsx`` is read with
            ``pd.read_excel``; anything else with ``pd.read_csv``.
        column: Name of the column to plot. Values are coerced to numeric
            and non-numeric entries are dropped.
        output_dir: Directory that receives the PNG export.

    Returns:
        ``(figure, png_path)`` on success, or an error string when the
        column is missing or holds no numeric data.
    """
    # Load
    ext = os.path.splitext(file_path)[1].lower()
    df = pd.read_excel(file_path) if ext in ('.xls', '.xlsx') else pd.read_csv(file_path)
    # Validate
    if column not in df.columns:
        return f"β Column '{column}' not found."
    series = pd.to_numeric(df[column], errors='coerce').dropna()
    if series.empty:
        return f"β No numeric data in '{column}'."
    # Plot and save
    fig = px.violin(series, box=True, points='all', title=f"Violin β {column}", template='plotly_dark')
    img_path = _save_fig(fig, f"violin_{column}_", output_dir)
    return fig, img_path
def scatter_matrix_tool(
    file_path: str,
    columns: List[str],
    output_dir: str = '/tmp',
    size: int = 5
) -> "Union[Tuple[go.Figure, str], str]":
    """
    Create an interactive scatter matrix for selected numeric columns.

    Args:
        file_path: Path to the data file. ``.xls``/``.xlsx`` is read with
            ``pd.read_excel``; anything else with ``pd.read_csv``.
        columns: Names of the columns to include. Repeated names are used
            once, in first-seen order.
        output_dir: Directory that receives the PNG export.
        size: Marker size applied to every trace.

    Returns:
        ``(figure, png_path)`` on success, or an error string when a column
        is missing or no row is numeric in all selected columns.
    """
    # Load
    ext = os.path.splitext(file_path)[1].lower()
    df = pd.read_excel(file_path) if ext in ('.xls', '.xlsx') else pd.read_csv(file_path)
    # A repeated name would select the same column twice and give the frame
    # duplicate labels, so keep only the first occurrence of each.
    columns = list(dict.fromkeys(columns))
    # Validate
    missing = [c for c in columns if c not in df.columns]
    if missing:
        return f"β Missing columns: {', '.join(missing)}"
    # Rows with a non-numeric value in any selected column are dropped.
    df_num = df[columns].apply(pd.to_numeric, errors='coerce').dropna()
    if df_num.empty:
        return "β No valid numeric data."
    # Plot and save
    fig = px.scatter_matrix(df_num, dimensions=columns, title="Scatter Matrix", template='plotly_dark')
    fig.update_traces(diagonal_visible=False, marker={'size': size})
    img_path = _save_fig(fig, "scatter_matrix_", output_dir)
    return fig, img_path
def corr_heatmap_tool(
    file_path: str,
    columns: Union[List[str], None] = None,
    output_dir: str = '/tmp',
    cluster: bool = True
) -> "Union[Tuple[go.Figure, str], str]":
    """
    Create a correlation heatmap, with optional hierarchical clustering of variables.

    Args:
        file_path: Path to the data file. ``.xls``/``.xlsx`` is read with
            ``pd.read_excel``; anything else with ``pd.read_csv``.
        columns: Columns to correlate. ``None`` selects every numeric
            column of the file.
        output_dir: Directory that receives the PNG export.
        cluster: Reorder the variables by average-linkage hierarchical
            clustering of the correlation matrix rows.

    Returns:
        ``(figure, png_path)`` on success, or an error string when a
        requested column is missing or fewer than two numeric columns
        remain.
    """
    # Load
    ext = os.path.splitext(file_path)[1].lower()
    df = pd.read_excel(file_path) if ext in ('.xls', '.xlsx') else pd.read_csv(file_path)
    # Select
    if columns is None:
        df_num = df.select_dtypes(include='number')
    else:
        # Report unknown names as an error string, like the other tools,
        # instead of letting the frame lookup raise KeyError.
        missing = [c for c in columns if c not in df.columns]
        if missing:
            return f"β Missing columns: {', '.join(map(str, missing))}"
        df_num = df[columns]
    # Columns with no numeric value at all are discarded.
    df_num = df_num.apply(pd.to_numeric, errors='coerce').dropna(axis=1, how='all')
    if df_num.shape[1] < 2:
        return "β Need at least two numeric columns for correlation."
    corr = df_num.corr()
    if cluster:
        # Constant columns yield NaN correlations and linkage rejects
        # non-finite input. The zero-filled copy is used for ordering only;
        # the displayed matrix keeps its NaNs.
        link = linkage(corr.fillna(0.0).to_numpy(), method='average')
        order = leaves_list(link)
        corr = corr.iloc[order, order]
    fig = px.imshow(
        corr,
        color_continuous_scale='RdBu',
        # Pin the diverging scale to the full correlation range so that
        # zero correlation always maps to the neutral midpoint.
        zmin=-1,
        zmax=1,
        title="Correlation Heatmap",
        labels=dict(color="Correlation"),
        template='plotly_dark'
    )
    img_path = _save_fig(fig, "corr_heatmap_", output_dir)
    return fig, img_path