Spaces:
Sleeping
Sleeping
File size: 5,474 Bytes
dc5ae18 bf400de 9538f35 bf400de 9538f35 bf400de 9538f35 dc5ae18 9538f35 dc5ae18 9538f35 dc5ae18 bf400de dc5ae18 9538f35 dc5ae18 9538f35 dc5ae18 9538f35 dc5ae18 9538f35 dc5ae18 9538f35 dc5ae18 9538f35 dc5ae18 9538f35 dc5ae18 9538f35 dc5ae18 9538f35 dc5ae18 9538f35 dc5ae18 9538f35 dc5ae18 9538f35 dc5ae18 9538f35 dc5ae18 9538f35 dc5ae18 9538f35 dc5ae18 9538f35 dc5ae18 9538f35 dc5ae18 9538f35 dc5ae18 9538f35 dc5ae18 9538f35 dc5ae18 bf400de 9538f35 dc5ae18 bf400de dc5ae18 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 |
import os
import tempfile
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
from scipy.cluster.hierarchy import linkage, leaves_list
from typing import Union, Tuple, List
def _save_fig(fig: go.Figure, prefix: str, output_dir: str) -> str:
"""
Save a Plotly figure as a high-res PNG and return the file path.
"""
os.makedirs(output_dir, exist_ok=True)
tmp = tempfile.NamedTemporaryFile(suffix='.png', prefix=prefix, dir=output_dir, delete=False)
path = tmp.name
tmp.close()
fig.write_image(path, scale=3)
return path
def histogram_tool(
    file_path: str,
    column: str,
    bins: int = 30,
    kde: bool = True,
    output_dir: str = '/tmp'
) -> Union[Tuple["go.Figure", str], str]:
    """
    Create a histogram with optional KDE overlay for a given numeric column.

    Args:
        file_path: Input table. ``.xls``/``.xlsx`` files are read with
            ``pandas.read_excel``; anything else with ``pandas.read_csv``.
        column: Name of the column to plot. Values are coerced to numeric and
            non-convertible entries are dropped.
        bins: Number of bins (the KDE variant derives its bin width from the
            value range divided by this number).
        kde: When True, build a distplot with a KDE curve; otherwise a plain
            histogram.
        output_dir: Directory in which the PNG is written.

    Returns:
        ``(figure, png_path)`` on success, or an error string when the column
        is missing or holds no numeric data.
    """
    # Load
    ext = os.path.splitext(file_path)[1].lower()
    if ext in ('.xls', '.xlsx'):
        df = pd.read_excel(file_path)
    else:
        df = pd.read_csv(file_path)

    # Validate
    if column not in df.columns:
        return f"β Column '{column}' not found."
    series = pd.to_numeric(df[column], errors='coerce').dropna()
    if series.empty:
        return f"β No numeric data in '{column}'."

    # Build histogram + KDE.
    # A KDE needs some spread in the data: with identical values the bin width
    # would be zero and the density estimate is singular, so fall back to the
    # plain histogram in that case.
    spread = series.max() - series.min()
    if kde and spread > 0:
        fig = ff.create_distplot([series], [column], bin_size=spread / bins)
    else:
        fig = px.histogram(series, nbins=bins, title=f"Histogram β {column}", template='plotly_dark')
    # Apply the same title and theme to both variants.
    fig.update_layout(title=f"Histogram β {column}", template='plotly_dark')

    # Save
    img_path = _save_fig(fig, f"hist_{column}_", output_dir)
    return fig, img_path
def boxplot_tool(
    file_path: str,
    column: str,
    output_dir: str = '/tmp'
) -> Union[Tuple["go.Figure", str], str]:
    """
    Create a box plot with outliers for a numeric column.

    Args:
        file_path: Input table. ``.xls``/``.xlsx`` files are read with
            ``pandas.read_excel``; anything else with ``pandas.read_csv``.
        column: Name of the column to plot. Values are coerced to numeric and
            non-convertible entries are dropped.
        output_dir: Directory in which the PNG is written.

    Returns:
        ``(figure, png_path)`` on success, or an error string when the column
        is missing or holds no numeric data.
    """
    ext = os.path.splitext(file_path)[1].lower()
    if ext in ('.xls', '.xlsx'):
        df = pd.read_excel(file_path)
    else:
        df = pd.read_csv(file_path)

    if column not in df.columns:
        return f"β Column '{column}' not found."
    series = pd.to_numeric(df[column], errors='coerce').dropna()
    if series.empty:
        return f"β No numeric data in '{column}'."

    fig = px.box(series, points='outliers', title=f"Boxplot β {column}", template='plotly_dark')
    img_path = _save_fig(fig, f"box_{column}_", output_dir)
    return fig, img_path
def violin_tool(
    file_path: str,
    column: str,
    output_dir: str = '/tmp'
) -> Union[Tuple["go.Figure", str], str]:
    """
    Create a violin plot with inner box for a numeric column.

    Args:
        file_path: Input table. ``.xls``/``.xlsx`` files are read with
            ``pandas.read_excel``; anything else with ``pandas.read_csv``.
        column: Name of the column to plot. Values are coerced to numeric and
            non-convertible entries are dropped.
        output_dir: Directory in which the PNG is written.

    Returns:
        ``(figure, png_path)`` on success, or an error string when the column
        is missing or holds no numeric data.
    """
    ext = os.path.splitext(file_path)[1].lower()
    if ext in ('.xls', '.xlsx'):
        df = pd.read_excel(file_path)
    else:
        df = pd.read_csv(file_path)

    if column not in df.columns:
        return f"β Column '{column}' not found."
    series = pd.to_numeric(df[column], errors='coerce').dropna()
    if series.empty:
        return f"β No numeric data in '{column}'."

    # points='all' draws every observation next to the violin.
    fig = px.violin(series, box=True, points='all', title=f"Violin β {column}", template='plotly_dark')
    img_path = _save_fig(fig, f"violin_{column}_", output_dir)
    return fig, img_path
def scatter_matrix_tool(
    file_path: str,
    columns: List[str],
    output_dir: str = '/tmp',
    size: int = 5
) -> Union[Tuple["go.Figure", str], str]:
    """
    Create an interactive scatter matrix for selected numeric columns.

    Args:
        file_path: Input table. ``.xls``/``.xlsx`` files are read with
            ``pandas.read_excel``; anything else with ``pandas.read_csv``.
        columns: Columns to include. Values are coerced to numeric and rows
            with any non-convertible or missing value are dropped.
        output_dir: Directory in which the PNG is written.
        size: Marker size used for the scatter points.

    Returns:
        ``(figure, png_path)`` on success, or an error string when columns are
        missing or no complete numeric rows remain.
    """
    ext = os.path.splitext(file_path)[1].lower()
    if ext in ('.xls', '.xlsx'):
        df = pd.read_excel(file_path)
    else:
        df = pd.read_csv(file_path)

    missing = [c for c in columns if c not in df.columns]
    if missing:
        # str() keeps the message buildable when labels are not strings.
        return f"β Missing columns: {', '.join(str(c) for c in missing)}"

    df_num = df[columns].apply(pd.to_numeric, errors='coerce').dropna()
    if df_num.empty:
        return "β No valid numeric data."

    fig = px.scatter_matrix(df_num, dimensions=columns, title="Scatter Matrix", template='plotly_dark')
    # Hide the diagonal cells, which would plot each variable against itself.
    fig.update_traces(diagonal_visible=False, marker={'size': size})
    img_path = _save_fig(fig, "scatter_matrix_", output_dir)
    return fig, img_path
def corr_heatmap_tool(
    file_path: str,
    columns: Union[List[str], None] = None,
    output_dir: str = '/tmp',
    cluster: bool = True
) -> Union[Tuple["go.Figure", str], str]:
    """
    Create a correlation heatmap, with optional hierarchical clustering of variables.

    Args:
        file_path: Input table. ``.xls``/``.xlsx`` files are read with
            ``pandas.read_excel``; anything else with ``pandas.read_csv``.
        columns: Columns to correlate. When None, every column with a numeric
            dtype is used. Values are coerced to numeric and columns that end
            up entirely empty are dropped.
        output_dir: Directory in which the PNG is written.
        cluster: When True, reorder rows and columns by average-linkage
            hierarchical clustering of the correlation matrix.

    Returns:
        ``(figure, png_path)`` on success, or an error string when requested
        columns are missing or fewer than two numeric columns remain.
    """
    ext = os.path.splitext(file_path)[1].lower()
    if ext in ('.xls', '.xlsx'):
        df = pd.read_excel(file_path)
    else:
        df = pd.read_csv(file_path)

    if columns is None:
        df_num = df.select_dtypes(include='number')
    else:
        # Report unknown columns as an error string instead of a KeyError.
        missing = [c for c in columns if c not in df.columns]
        if missing:
            return f"β Missing columns: {', '.join(str(c) for c in missing)}"
        df_num = df[columns]

    df_num = df_num.apply(pd.to_numeric, errors='coerce').dropna(axis=1, how='all')
    if df_num.shape[1] < 2:
        return "β Need at least two numeric columns for correlation."

    corr = df_num.corr()
    if cluster:
        # Constant columns produce NaN correlations and linkage() rejects
        # non-finite input, so cluster on a NaN-free copy. The displayed matrix
        # keeps its NaN cells.
        link = linkage(corr.fillna(0.0).to_numpy(), method='average')
        order = leaves_list(link)
        corr = corr.iloc[order, order]

    fig = px.imshow(
        corr,
        color_continuous_scale='RdBu',
        # Pin the diverging scale to the full correlation range so that zero
        # sits at the neutral midpoint.
        zmin=-1,
        zmax=1,
        title="Correlation Heatmap",
        labels=dict(color="Correlation"),
        template='plotly_dark'
    )
    img_path = _save_fig(fig, "corr_heatmap_", output_dir)
    return fig, img_path
|