File size: 5,474 Bytes
dc5ae18
 
bf400de
9538f35
bf400de
9538f35
 
 
 
bf400de
9538f35
 
dc5ae18
9538f35
dc5ae18
 
 
 
 
9538f35
dc5ae18
bf400de
 
dc5ae18
 
 
9538f35
 
 
 
dc5ae18
9538f35
 
 
dc5ae18
9538f35
dc5ae18
9538f35
dc5ae18
9538f35
dc5ae18
 
9538f35
dc5ae18
9538f35
 
 
 
 
 
 
 
 
 
dc5ae18
 
 
 
9538f35
dc5ae18
9538f35
dc5ae18
 
 
9538f35
 
 
dc5ae18
 
9538f35
 
 
 
 
 
 
 
 
 
dc5ae18
9538f35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dc5ae18
 
9538f35
dc5ae18
9538f35
dc5ae18
9538f35
 
dc5ae18
 
 
 
 
 
9538f35
dc5ae18
9538f35
dc5ae18
 
9538f35
 
 
dc5ae18
 
9538f35
 
 
 
 
dc5ae18
 
9538f35
 
 
 
dc5ae18
bf400de
 
9538f35
dc5ae18
 
 
bf400de
dc5ae18
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
import os
import tempfile
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
from scipy.cluster.hierarchy import linkage, leaves_list
from typing import Union, Tuple, List


def _save_fig(fig: go.Figure, prefix: str, output_dir: str) -> str:
    """
    Save a Plotly figure as a high-res PNG and return the file path.

    Args:
        fig: Figure to render; its ``write_image`` method is called with
            ``scale=3``.
        prefix: Leading part of the generated file name. Callers build it
            from column names, so any character other than letters, digits,
            ``-``, ``_`` and ``.`` is replaced with ``_`` to keep the file
            inside ``output_dir`` and valid as a file name.
        output_dir: Directory the PNG is written to; created if missing.

    Returns:
        Path of the newly created PNG file.

    Raises:
        Whatever ``fig.write_image`` raises; the empty placeholder file is
        removed before the exception propagates.
    """
    os.makedirs(output_dir, exist_ok=True)

    # A raw prefix such as "hist_a/b_" would point into a non-existent
    # sub-directory and make temp-file creation fail.
    safe_prefix = "".join(
        ch if ch.isalnum() or ch in "-_." else "_" for ch in str(prefix)
    )

    # Reserve a unique file name, then release the handle so the image
    # writer can open the path itself.
    fd, path = tempfile.mkstemp(suffix='.png', prefix=safe_prefix, dir=output_dir)
    os.close(fd)

    try:
        fig.write_image(path, scale=3)
    except Exception:
        # Do not leave a zero-byte PNG behind when rendering fails.
        try:
            os.remove(path)
        except OSError:
            pass  # best-effort cleanup; the original error matters more
        raise
    return path


def histogram_tool(
    file_path: str,
    column: str,
    bins: int = 30,
    kde: bool = True,
    output_dir: str = '/tmp'
) -> Union[Tuple[go.Figure, str], str]:
    """
    Create a histogram with optional KDE overlay for a given numeric column.

    Args:
        file_path: Path to the data file. ``.xls``/``.xlsx`` files are read
            with ``pandas.read_excel``; anything else with ``pandas.read_csv``.
        column: Name of the column to plot. Values are coerced to numeric and
            non-convertible entries are dropped.
        bins: Number of bins; must be at least 1.
        kde: When True, draw a distplot (histogram + KDE curve). Falls back
            to a plain histogram if the data has no spread, because a KDE
            cannot be estimated and the bin width would be zero.
        output_dir: Directory the PNG is written to.

    Returns:
        ``(figure, png_path)`` on success, or an error string starting with
        "❌" when validation fails.
    """
    # Load
    ext = os.path.splitext(file_path)[1].lower()
    df = pd.read_excel(file_path) if ext in ('.xls', '.xlsx') else pd.read_csv(file_path)

    # Validate
    if column not in df.columns:
        return f"❌ Column '{column}' not found."
    if bins < 1:
        return f"❌ 'bins' must be a positive integer, got {bins}."
    series = pd.to_numeric(df[column], errors='coerce').dropna()
    if series.empty:
        return f"❌ No numeric data in '{column}'."

    # Build histogram + KDE
    spread = series.max() - series.min()
    if kde and spread > 0:
        fig = ff.create_distplot([series], [column], bin_size=spread / bins)
    else:
        # Constant data (spread == 0) would give bin_size == 0 and a singular
        # KDE, so a plain histogram is used in that case as well.
        fig = px.histogram(series, nbins=bins)
    # Apply the same title and theme to both variants.
    fig.update_layout(title=f"Histogram – {column}", template='plotly_dark')

    # Save
    img_path = _save_fig(fig, f"hist_{column}_", output_dir)
    return fig, img_path


def boxplot_tool(
    file_path: str,
    column: str,
    output_dir: str = '/tmp'
) -> Union[Tuple[go.Figure, str], str]:
    """
    Create a box plot with outliers for a numeric column.

    Args:
        file_path: Path to the data file. ``.xls``/``.xlsx`` files are read
            with ``pandas.read_excel``; anything else with ``pandas.read_csv``.
        column: Name of the column to plot. Values are coerced to numeric and
            non-convertible entries are dropped.
        output_dir: Directory the PNG is written to.

    Returns:
        ``(figure, png_path)`` on success, or an error string starting with
        "❌" when the column is missing or holds no numeric data.
    """
    ext = os.path.splitext(file_path)[1].lower()
    df = pd.read_excel(file_path) if ext in ('.xls', '.xlsx') else pd.read_csv(file_path)

    if column not in df.columns:
        return f"❌ Column '{column}' not found."
    series = pd.to_numeric(df[column], errors='coerce').dropna()
    if series.empty:
        return f"❌ No numeric data in '{column}'."

    # points='outliers' draws only the points beyond the whiskers.
    fig = px.box(series, points='outliers', title=f"Boxplot – {column}", template='plotly_dark')
    img_path = _save_fig(fig, f"box_{column}_", output_dir)
    return fig, img_path


def violin_tool(
    file_path: str,
    column: str,
    output_dir: str = '/tmp'
) -> Union[Tuple[go.Figure, str], str]:
    """
    Create a violin plot with inner box for a numeric column.

    Args:
        file_path: Path to the data file. ``.xls``/``.xlsx`` files are read
            with ``pandas.read_excel``; anything else with ``pandas.read_csv``.
        column: Name of the column to plot. Values are coerced to numeric and
            non-convertible entries are dropped.
        output_dir: Directory the PNG is written to.

    Returns:
        ``(figure, png_path)`` on success, or an error string starting with
        "❌" when the column is missing or holds no numeric data.
    """
    ext = os.path.splitext(file_path)[1].lower()
    df = pd.read_excel(file_path) if ext in ('.xls', '.xlsx') else pd.read_csv(file_path)

    if column not in df.columns:
        return f"❌ Column '{column}' not found."
    series = pd.to_numeric(df[column], errors='coerce').dropna()
    if series.empty:
        return f"❌ No numeric data in '{column}'."

    # box=True overlays a box plot inside the violin; points='all' shows
    # every observation next to it.
    fig = px.violin(series, box=True, points='all', title=f"Violin – {column}", template='plotly_dark')
    img_path = _save_fig(fig, f"violin_{column}_", output_dir)
    return fig, img_path


def scatter_matrix_tool(
    file_path: str,
    columns: List[str],
    output_dir: str = '/tmp',
    size: int = 5
) -> Union[Tuple[go.Figure, str], str]:
    """
    Create an interactive scatter matrix for selected numeric columns.

    Args:
        file_path: Path to the data file. ``.xls``/``.xlsx`` files are read
            with ``pandas.read_excel``; anything else with ``pandas.read_csv``.
        columns: Columns to include as matrix dimensions. Values are coerced
            to numeric and rows with any non-convertible entry are dropped.
        output_dir: Directory the PNG is written to.
        size: Marker size applied to every trace.

    Returns:
        ``(figure, png_path)`` on success, or an error string starting with
        "❌" when columns are missing or no complete numeric rows remain.
    """
    ext = os.path.splitext(file_path)[1].lower()
    df = pd.read_excel(file_path) if ext in ('.xls', '.xlsx') else pd.read_csv(file_path)

    missing = [c for c in columns if c not in df.columns]
    if missing:
        # str() keeps the message buildable when a label is not a string.
        return f"❌ Missing columns: {', '.join(str(c) for c in missing)}"
    df_num = df[columns].apply(pd.to_numeric, errors='coerce').dropna()
    if df_num.empty:
        return "❌ No valid numeric data."

    fig = px.scatter_matrix(df_num, dimensions=columns, title="Scatter Matrix", template='plotly_dark')
    # Hide the diagonal panels (each variable plotted against itself).
    fig.update_traces(diagonal_visible=False, marker={'size': size})
    img_path = _save_fig(fig, "scatter_matrix_", output_dir)
    return fig, img_path


def corr_heatmap_tool(
    file_path: str,
    columns: Union[List[str], None] = None,
    output_dir: str = '/tmp',
    cluster: bool = True
) -> Union[Tuple[go.Figure, str], str]:
    """
    Create a correlation heatmap, with optional hierarchical clustering of variables.

    Args:
        file_path: Path to the data file. ``.xls``/``.xlsx`` files are read
            with ``pandas.read_excel``; anything else with ``pandas.read_csv``.
        columns: Columns to correlate. ``None`` selects every numeric column.
            Values are coerced to numeric; columns that end up entirely
            non-numeric are dropped.
        output_dir: Directory the PNG is written to.
        cluster: When True, reorder rows and columns by average-linkage
            hierarchical clustering of the correlation matrix.

    Returns:
        ``(figure, png_path)`` on success, or an error string starting with
        "❌" when columns are missing or fewer than two numeric columns
        remain.
    """
    ext = os.path.splitext(file_path)[1].lower()
    df = pd.read_excel(file_path) if ext in ('.xls', '.xlsx') else pd.read_csv(file_path)

    if columns is None:
        df_num = df.select_dtypes(include='number')
    else:
        # Report unknown columns the same way scatter_matrix_tool does
        # instead of letting the selection raise KeyError.
        missing = [c for c in columns if c not in df.columns]
        if missing:
            return f"❌ Missing columns: {', '.join(str(c) for c in missing)}"
        df_num = df[columns]

    df_num = df_num.apply(pd.to_numeric, errors='coerce').dropna(axis=1, how='all')
    if df_num.shape[1] < 2:
        return "❌ Need at least two numeric columns for correlation."

    corr = df_num.corr()
    if cluster:
        # Constant columns yield NaN correlations, which linkage rejects.
        # Treat them as "uncorrelated" for ordering only; the plotted matrix
        # keeps its NaNs.
        link = linkage(corr.fillna(0).to_numpy(), method='average')
        order = leaves_list(link)
        corr = corr.iloc[order, order]

    fig = px.imshow(
        corr,
        # Pin the range so the diverging scale's midpoint is exactly zero.
        zmin=-1,
        zmax=1,
        color_continuous_scale='RdBu',
        title="Correlation Heatmap",
        labels=dict(color="Correlation"),
        template='plotly_dark'
    )
    img_path = _save_fig(fig, "corr_heatmap_", output_dir)
    return fig, img_path