File size: 7,482 Bytes
640d10c
 
7fa4e3f
 
 
 
640d10c
 
7fa4e3f
 
cff0e3d
7fa4e3f
640d10c
 
 
 
 
 
cff0e3d
 
 
 
 
 
 
 
 
 
 
7fa4e3f
cff0e3d
7fa4e3f
 
cff0e3d
7fa4e3f
 
 
 
 
cff0e3d
 
 
 
 
 
7fa4e3f
cff0e3d
 
7fa4e3f
 
 
cff0e3d
 
640d10c
7fa4e3f
 
 
 
cff0e3d
7fa4e3f
 
cff0e3d
7fa4e3f
 
 
 
cff0e3d
640d10c
 
 
cff0e3d
 
 
 
 
 
 
 
 
 
7fa4e3f
cff0e3d
7fa4e3f
 
cff0e3d
7fa4e3f
cff0e3d
 
7fa4e3f
cff0e3d
7fa4e3f
 
 
 
cff0e3d
7fa4e3f
 
 
cff0e3d
640d10c
 
 
cff0e3d
 
 
 
 
 
 
 
 
 
 
7fa4e3f
cff0e3d
7fa4e3f
 
cff0e3d
7fa4e3f
 
 
 
cff0e3d
7fa4e3f
 
 
cff0e3d
 
7fa4e3f
 
cff0e3d
 
 
 
 
 
 
7fa4e3f
cff0e3d
 
 
7fa4e3f
 
cff0e3d
 
7fa4e3f
cff0e3d
 
 
 
 
7fa4e3f
 
 
 
cff0e3d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
# analysis_modules.py

import base64
import io
import logging

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
from wordcloud import WordCloud

# --- Time-Series Module ---
def analyze_time_series(df: pd.DataFrame, date_col: str, value_col: str):
    """
    Performs time-series decomposition and stationarity testing with robust error handling.

    Args:
        df (pd.DataFrame): The input DataFrame.
        date_col (str): The name of the column containing datetime information.
        value_col (str): The name of the numeric column to analyze.

    Returns:
        tuple: A Plotly Figure and a Markdown string with analysis.
    """
    if not date_col or not value_col:
        return go.Figure(), "Please select both a date/time column and a value column to begin analysis."

    try:
        logging.info(f"Analyzing time-series for date='{date_col}' and value='{value_col}'")
        # Work on a copy so the caller's DataFrame is never mutated.
        ts_df = df.copy()
        ts_df[date_col] = pd.to_datetime(ts_df[date_col])
        ts_df = ts_df.set_index(date_col).sort_index()
        ts_data = ts_df[value_col].dropna()

        # A common period for decomposition is 12 (monthly); seasonal_decompose
        # requires at least 2 full periods of observations.
        period = 12
        if len(ts_data) < 2 * period:
            msg = f"Not enough data points ({len(ts_data)}) for a reliable time-series decomposition (requires at least {2*period})."
            logging.warning(msg)
            return go.Figure().update_layout(title=msg), ""

        # Decomposition into trend / seasonal / residual components.
        result = seasonal_decompose(ts_data, model='additive', period=period)
        fig_decomp = px.line(
            pd.DataFrame({'Trend': result.trend, 'Seasonal': result.seasonal, 'Residual': result.resid}),
            title=f"<b>Time-Series Decomposition of '{value_col}'</b>",
            labels={'value': 'Value', 'index': 'Date'}, template="plotly_white"
        ).update_layout(legend_title_text='Components')

        # Stationarity Test (ADF)
        adf_result = adfuller(ts_data)
        conclusion = 'likely **stationary** (p < 0.05)' if adf_result[1] < 0.05 else 'likely **non-stationary** (p >= 0.05)'
        # BUGFIX: the markdown must be flush-left. The previous indented
        # triple-quoted string made renderers treat every line as a
        # preformatted code block, so the header and bullets never rendered.
        adf_md = "\n".join([
            "### Stationarity Analysis (Augmented Dickey-Fuller Test)",
            f"- **ADF Statistic:** `{adf_result[0]:.4f}`",
            f"- **p-value:** `{adf_result[1]:.4f}`",
            f"- **Conclusion:** The time-series is {conclusion}. Non-stationary series often require differencing before being used in forecasting models like ARIMA.",
        ])
        return fig_decomp, adf_md
    except Exception as e:
        logging.error(f"Time-series analysis failed: {e}", exc_info=True)
        return go.Figure(), f"❌ **Error:** Could not perform analysis. Please ensure the date column is a valid time format and the value column is numeric. \n`{e}`"

# --- Text Analysis Module ---
def generate_word_cloud(df: pd.DataFrame, text_col: str):
    """
    Builds a word cloud from one text column and embeds it as a base64 PNG in HTML.

    Args:
        df (pd.DataFrame): The input DataFrame.
        text_col (str): The name of the column containing text data.

    Returns:
        str: An HTML string containing the word cloud image or an error message.
    """
    # Guard clause: nothing selected yet, prompt the user instead of failing.
    if not text_col:
        return "<p style='text-align:center; padding: 20px;'>Select a text column to generate a word cloud.</p>"

    try:
        logging.info(f"Generating word cloud for column '{text_col}'")
        corpus = ' '.join(df[text_col].dropna().astype(str))
        if not corpus.strip():
            return "<p style='text-align:center; padding: 20px;'>No text data available in this column to generate a cloud.</p>"

        cloud = WordCloud(
            width=800,
            height=400,
            background_color='white',
            colormap='viridis',
            max_words=150,
        ).generate(corpus)

        # Render the PIL image into an in-memory PNG, then inline it as base64
        # so no temp file is ever written to disk.
        png_buffer = io.BytesIO()
        cloud.to_image().save(png_buffer, format='png')
        encoded = base64.b64encode(png_buffer.getvalue()).decode('utf-8')
        return (
            f'<div style="text-align:center;"><img src="data:image/png;base64,{encoded}" '
            f'alt="Word Cloud for {text_col}" style="border-radius: 8px;"></div>'
        )
    except Exception as e:
        logging.error(f"Word cloud generation failed: {e}", exc_info=True)
        return f"<p style='text-align:center; color:red; padding: 20px;'>❌ **Error:** Could not generate word cloud. Reason: {e}</p>"

# --- Clustering Module ---
def perform_clustering(df: pd.DataFrame, numeric_cols: list, n_clusters: int = 4):
    """
    Performs K-Means clustering using best practices (scaling and PCA for visualization).

    Args:
        df (pd.DataFrame): The input DataFrame.
        numeric_cols (list): A list of numeric columns to use for clustering.
        n_clusters (int): The number of clusters (k) to create.

    Returns:
        tuple: A Plotly Figure and a Markdown string with analysis.
    """
    if len(numeric_cols) < 2:
        return go.Figure(), "Clustering requires at least 2 numeric features. Please select a dataset with more numeric columns."

    try:
        # Hoisted: the incoming value may arrive as a float (e.g. from a UI
        # slider), so normalize to int once instead of at every use site.
        k = int(n_clusters)
        logging.info(f"Performing K-Means clustering with k={k} on {len(numeric_cols)} features.")
        # .copy() ensures we own the frame before adding Cluster/PCA columns
        # below, avoiding pandas' SettingWithCopyWarning.
        cluster_data = df[numeric_cols].dropna().copy()
        if len(cluster_data) < k:
            return go.Figure(), f"Not enough data points ({len(cluster_data)}) for {k} clusters."

        # Step 1: Scale data - Crucial for distance-based algorithms like K-Means
        scaler = StandardScaler()
        scaled_data = scaler.fit_transform(cluster_data)

        # Step 2: Perform K-Means clustering (fixed seed for reproducibility)
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(scaled_data)
        cluster_data['Cluster'] = kmeans.labels_.astype(str)

        # Step 3: Use PCA to reduce dimensionality for a meaningful 2D visualization
        pca = PCA(n_components=2)
        components = pca.fit_transform(scaled_data)
        cluster_data['PCA1'] = components[:, 0]
        cluster_data['PCA2'] = components[:, 1]

        # Step 4: Create the plot using the principal components
        fig_cluster = px.scatter(
            cluster_data, x='PCA1', y='PCA2', color='Cluster',
            title=f"<b>K-Means Clustering Visualization (k={k})</b>",
            labels={'PCA1': 'Principal Component 1', 'PCA2': 'Principal Component 2'},
            template="plotly_white", color_discrete_sequence=px.colors.qualitative.Vivid
        )

        explained_variance = pca.explained_variance_ratio_.sum() * 100
        # BUGFIX: markdown built flush-left. The previous indented
        # triple-quoted string rendered as a preformatted code block in
        # markdown viewers, hiding the header and bullet formatting.
        cluster_md = "\n".join([
            "### Clustering Summary & Methodology",
            f"- **Features Used:** `{len(numeric_cols)}` numeric features were scaled and used for clustering.",
            f"- **Number of Clusters (K):** `{k}`",
            "- **Visualization:** To visualize the high-dimensional clusters in 2D, Principal Component Analysis (PCA) was used. ",
            f"- **Explained Variance:** The two components shown explain **{explained_variance:.2f}%** of the variance in the data.",
        ])
        return fig_cluster, cluster_md
    except Exception as e:
        logging.error(f"Clustering failed: {e}", exc_info=True)
        return go.Figure(), f"❌ **Error:** Could not perform clustering. \n`{e}`"