Spaces:

mgbam
/

BizIntel_AI

Sleeping

File size: 5,474 Bytes

dc5ae18
 
bf400de
9538f35
bf400de
9538f35
 
 
 
bf400de
9538f35
 
dc5ae18
9538f35
dc5ae18
 
 
 
 
9538f35
dc5ae18
bf400de
 
dc5ae18
 
 
9538f35
 
 
 
dc5ae18
9538f35
 
 
dc5ae18
9538f35
dc5ae18
9538f35
dc5ae18
9538f35
dc5ae18
 
9538f35
dc5ae18
9538f35
 
 
 
 
 
 
 
 
 
dc5ae18
 
 
 
9538f35
dc5ae18
9538f35
dc5ae18
 
 
9538f35
 
 
dc5ae18
 
9538f35
 
 
 
 
 
 
 
 
 
dc5ae18
9538f35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dc5ae18
 
9538f35
dc5ae18
9538f35
dc5ae18
9538f35
 
dc5ae18
 
 
 
 
 
9538f35
dc5ae18
9538f35
dc5ae18
 
9538f35
 
 
dc5ae18
 
9538f35
 
 
 
 
dc5ae18
 
9538f35
 
 
 
dc5ae18
bf400de
 
9538f35
dc5ae18
 
 
bf400de
dc5ae18

import os
import tempfile
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
from scipy.cluster.hierarchy import linkage, leaves_list
from typing import Union, Tuple, List


def _save_fig(fig: go.Figure, prefix: str, output_dir: str) -> str:
    """
    Save a Plotly figure as a PNG (rendered at 3x scale) and return the file path.

    Args:
        fig: Figure to export; it is written with ``fig.write_image``.
        prefix: Leading part of the generated file name. Characters other than
            letters, digits, ``-``, ``_`` and ``.`` are replaced with ``_``.
        output_dir: Directory that receives the image; created if missing.

    Returns:
        Path of the PNG file. The file is created with ``delete=False`` and is
        therefore left on disk for the caller.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Prefixes in this module are built from column names. A path separator in
    # the prefix (e.g. "Price/Unit") would make tempfile target a non-existent
    # sub-directory or escape output_dir, so keep only file-name-safe characters.
    safe_prefix = ''.join(
        ch if ch.isalnum() or ch in '-_.' else '_' for ch in prefix
    )

    # Reserve a unique file name, then release the handle before Plotly writes
    # to the same path.
    with tempfile.NamedTemporaryFile(
        suffix='.png', prefix=safe_prefix, dir=output_dir, delete=False
    ) as tmp:
        path = tmp.name

    fig.write_image(path, scale=3)
    return path


def histogram_tool(
    file_path: str,
    column: str,
    bins: int = 30,
    kde: bool = True,
    output_dir: str = '/tmp'
) -> Union[Tuple["go.Figure", str], str]:
    """
    Create a histogram with optional KDE overlay for a given numeric column.

    Args:
        file_path: Path to the data file. ``.xls``/``.xlsx`` files are read
            with ``pd.read_excel``; anything else with ``pd.read_csv``.
        column: Name of the column to plot. Values are coerced to numeric and
            non-convertible entries are dropped.
        bins: Number of bins. With ``kde`` enabled the bin width is
            ``(max - min) / bins``.
        kde: When true, build the figure with ``ff.create_distplot``.
            If the data has zero range or ``bins`` is not positive, a bin
            width cannot be derived and the plain histogram is used instead.
        output_dir: Directory that receives the exported PNG.

    Returns:
        ``(figure, png_path)`` on success, or an error string when the column
        is missing or holds no numeric data.
    """
    # Load
    ext = os.path.splitext(file_path)[1].lower()
    df = pd.read_excel(file_path) if ext in ('.xls', '.xlsx') else pd.read_csv(file_path)

    # Validate
    if column not in df.columns:
        return f"❌ Column '{column}' not found."
    series = pd.to_numeric(df[column], errors='coerce').dropna()
    if series.empty:
        return f"❌ No numeric data in '{column}'."

    # Build histogram + KDE
    title = f"Histogram – {column}"
    spread = series.max() - series.min()
    # A zero range or non-positive bin count would yield a zero/invalid
    # bin_size, so only take the distplot path when a real width exists.
    if kde and bins > 0 and spread > 0:
        fig = ff.create_distplot([series], [column], bin_size=spread / bins)
    else:
        fig = px.histogram(series, nbins=bins, title=title, template='plotly_dark')
    # Apply the same theme and title to both variants.
    fig.update_layout(template='plotly_dark', title=title)

    # Save
    img_path = _save_fig(fig, f"hist_{column}_", output_dir)
    return fig, img_path


def boxplot_tool(
    file_path: str,
    column: str,
    output_dir: str = '/tmp'
) -> Union[Tuple["go.Figure", str], str]:
    """
    Create a box plot with outliers for a numeric column.

    Args:
        file_path: Path to the data file. ``.xls``/``.xlsx`` files are read
            with ``pd.read_excel``; anything else with ``pd.read_csv``.
        column: Name of the column to plot. Values are coerced to numeric and
            non-convertible entries are dropped.
        output_dir: Directory that receives the exported PNG.

    Returns:
        ``(figure, png_path)`` on success, or an error string when the column
        is missing or holds no numeric data.
    """
    ext = os.path.splitext(file_path)[1].lower()
    df = pd.read_excel(file_path) if ext in ('.xls', '.xlsx') else pd.read_csv(file_path)

    if column not in df.columns:
        return f"❌ Column '{column}' not found."
    series = pd.to_numeric(df[column], errors='coerce').dropna()
    if series.empty:
        return f"❌ No numeric data in '{column}'."

    # points='outliers' draws individual markers only for outlying values.
    fig = px.box(series, points='outliers', title=f"Boxplot – {column}", template='plotly_dark')
    img_path = _save_fig(fig, f"box_{column}_", output_dir)
    return fig, img_path


def violin_tool(
    file_path: str,
    column: str,
    output_dir: str = '/tmp'
) -> Union[Tuple["go.Figure", str], str]:
    """
    Create a violin plot with inner box for a numeric column.

    Args:
        file_path: Path to the data file. ``.xls``/``.xlsx`` files are read
            with ``pd.read_excel``; anything else with ``pd.read_csv``.
        column: Name of the column to plot. Values are coerced to numeric and
            non-convertible entries are dropped.
        output_dir: Directory that receives the exported PNG.

    Returns:
        ``(figure, png_path)`` on success, or an error string when the column
        is missing or holds no numeric data.
    """
    ext = os.path.splitext(file_path)[1].lower()
    df = pd.read_excel(file_path) if ext in ('.xls', '.xlsx') else pd.read_csv(file_path)

    if column not in df.columns:
        return f"❌ Column '{column}' not found."
    series = pd.to_numeric(df[column], errors='coerce').dropna()
    if series.empty:
        return f"❌ No numeric data in '{column}'."

    # box=True overlays a box plot inside the violin; points='all' shows every
    # observation alongside it.
    fig = px.violin(series, box=True, points='all', title=f"Violin – {column}", template='plotly_dark')
    img_path = _save_fig(fig, f"violin_{column}_", output_dir)
    return fig, img_path


def scatter_matrix_tool(
    file_path: str,
    columns: List[str],
    output_dir: str = '/tmp',
    size: int = 5
) -> Union[Tuple["go.Figure", str], str]:
    """
    Create an interactive scatter matrix for selected numeric columns.

    Args:
        file_path: Path to the data file. ``.xls``/``.xlsx`` files are read
            with ``pd.read_excel``; anything else with ``pd.read_csv``.
        columns: Columns to include as matrix dimensions. Values are coerced
            to numeric and rows with any non-convertible value are dropped.
        output_dir: Directory that receives the exported PNG.
        size: Marker size applied to every trace.

    Returns:
        ``(figure, png_path)`` on success, or an error string when columns are
        missing or no complete numeric rows remain.
    """
    ext = os.path.splitext(file_path)[1].lower()
    df = pd.read_excel(file_path) if ext in ('.xls', '.xlsx') else pd.read_csv(file_path)

    missing = [c for c in columns if c not in df.columns]
    if missing:
        # str() keeps the message usable for non-string column labels.
        return f"❌ Missing columns: {', '.join(map(str, missing))}"

    df_num = df[columns].apply(pd.to_numeric, errors='coerce').dropna()
    if df_num.empty:
        return "❌ No valid numeric data."

    fig = px.scatter_matrix(df_num, dimensions=columns, title="Scatter Matrix", template='plotly_dark')
    # Hide the diagonal panels (each variable against itself).
    fig.update_traces(diagonal_visible=False, marker={'size': size})
    img_path = _save_fig(fig, "scatter_matrix_", output_dir)
    return fig, img_path


def corr_heatmap_tool(
    file_path: str,
    columns: Union[List[str], None] = None,
    output_dir: str = '/tmp',
    cluster: bool = True
) -> Union[Tuple["go.Figure", str], str]:
    """
    Create a correlation heatmap, with optional hierarchical clustering of variables.

    Args:
        file_path: Path to the data file. ``.xls``/``.xlsx`` files are read
            with ``pd.read_excel``; anything else with ``pd.read_csv``.
        columns: Columns to correlate. ``None`` selects every numeric-dtype
            column. Selected values are coerced to numeric and columns that
            end up entirely NaN are discarded.
        output_dir: Directory that receives the exported PNG.
        cluster: When true, reorder rows and columns by the leaf order of an
            average-linkage clustering of the correlation matrix.

    Returns:
        ``(figure, png_path)`` on success, or an error string when requested
        columns are missing or fewer than two numeric columns remain.
    """
    ext = os.path.splitext(file_path)[1].lower()
    df = pd.read_excel(file_path) if ext in ('.xls', '.xlsx') else pd.read_csv(file_path)

    if columns is None:
        df_num = df.select_dtypes(include='number')
    else:
        # Report unknown columns as an error string (same message as
        # scatter_matrix_tool) instead of letting df[columns] raise KeyError.
        missing = [c for c in columns if c not in df.columns]
        if missing:
            return f"❌ Missing columns: {', '.join(map(str, missing))}"
        df_num = df[columns]

    df_num = df_num.apply(pd.to_numeric, errors='coerce').dropna(axis=1, how='all')
    if df_num.shape[1] < 2:
        return "❌ Need at least two numeric columns for correlation."

    corr = df_num.corr()
    if cluster:
        # Constant columns produce NaN correlations, and linkage() rejects
        # non-finite input. Treat them as uncorrelated for ordering purposes
        # only; the displayed matrix keeps its original values.
        link = linkage(corr.fillna(0.0).to_numpy(), method='average')
        order = leaves_list(link)
        corr = corr.iloc[order, order]

    fig = px.imshow(
        corr,
        # Pin the diverging scale to the full correlation range so that zero
        # always sits at the colour midpoint.
        zmin=-1,
        zmax=1,
        color_continuous_scale='RdBu',
        title="Correlation Heatmap",
        labels=dict(color="Correlation"),
        template='plotly_dark'
    )
    img_path = _save_fig(fig, "corr_heatmap_", output_dir)
    return fig, img_path