|
|
|
|
|
import base64 |
|
import io |
|
import logging |
|
|
|
import pandas as pd |
|
import plotly.express as px |
|
import plotly.graph_objects as go |
|
from sklearn.cluster import KMeans |
|
from sklearn.preprocessing import StandardScaler |
|
from statsmodels.tsa.seasonal import seasonal_decompose |
|
from statsmodels.tsa.stattools import adfuller |
|
from wordcloud import WordCloud |
|
|
|
|
|
def analyze_time_series(df: pd.DataFrame, date_col: str, value_col: str, period: int = 12):
    """Decompose a time series and test it for stationarity.

    The series is split into trend, seasonal and residual components with an
    additive ``seasonal_decompose`` and then checked with the Augmented
    Dickey-Fuller (ADF) test.

    Args:
        df: Source data; it is not modified.
        date_col: Name of the column holding dates/timestamps.
        value_col: Name of the column holding the observed values.
        period: Number of observations per seasonal cycle. Defaults to 12.
            At least ``2 * period`` non-null observations are required.

    Returns:
        A ``(figure, markdown)`` tuple. On success the figure is a line plot
        of the three components and the markdown summarises the ADF result.
        When the input is unusable or the analysis fails, the figure is an
        empty ``go.Figure()`` and the markdown explains why.
    """
    if not date_col or not value_col:
        return go.Figure(), "Please select both a date/time column and a value column."

    try:
        period = int(period)
        min_points = 2 * period

        # Copy only the two columns that are used, leaving the caller's frame untouched.
        ts_df = df[[date_col, value_col]].copy()
        ts_df[date_col] = pd.to_datetime(ts_df[date_col])
        # Rows without a timestamp cannot be placed on the time axis; keeping
        # them would feed NaT-indexed values into the decomposition.
        ts_df = ts_df.dropna(subset=[date_col]).set_index(date_col).sort_index()
        ts_data = ts_df[value_col].dropna()

        if len(ts_data) < min_points:
            return go.Figure(), f"Not enough data points (< {min_points}) for time-series decomposition."

        result = seasonal_decompose(ts_data, model='additive', period=period)
        components = pd.DataFrame(
            {'Trend': result.trend, 'Seasonal': result.seasonal, 'Residual': result.resid}
        )
        fig_decomp = px.line(
            components,
            title=f"<b>Time-Series Decomposition of '{value_col}'</b>",
            # The index carries the name of date_col, so the x-axis label has
            # to be keyed on that name; 'index' only applies to unnamed indexes.
            labels={'value': 'Value', 'index': 'Date', date_col: 'Date'},
            template="plotly_white",
        )
        fig_decomp.update_layout(legend_title_text='Components')

        adf_result = adfuller(ts_data)
        adf_stat, p_value = adf_result[0], adf_result[1]
        if p_value < 0.05:
            conclusion = 'likely **stationary** (p < 0.05)'
        else:
            conclusion = 'likely **non-stationary** (p >= 0.05)'
        adf_md = (
            "\n### Stationarity Analysis (ADF Test)\n"
            f"- **ADF Statistic:** `{adf_stat:.4f}`\n"
            f"- **p-value:** `{p_value:.4f}`\n"
            f"- **Conclusion:** The time-series is {conclusion}. "
            "A non-stationary series may require differencing for forecasting models.\n"
        )
        return fig_decomp, adf_md
    except Exception as e:
        # UI boundary: report the failure to the user instead of raising.
        logging.exception("Time-series analysis failed: %s", e)
        return go.Figure(), f"β **Error:** Could not perform time-series analysis. Reason: {e}"
|
|
|
|
|
def generate_word_cloud(df: pd.DataFrame, text_col: str):
    """Render a word cloud for a text column as an HTML snippet.

    Args:
        df: Source data.
        text_col: Name of the column whose non-null values are joined
            (space-separated, cast to ``str``) to form the input text.

    Returns:
        ``None`` when ``text_col`` is empty. Otherwise a string: an HTML
        ``<div>`` embedding the cloud as a base64 PNG data URI, an HTML
        notice when the column holds no text, or an error message when
        generation fails.
    """
    if not text_col:
        return None

    try:
        text = ' '.join(df[text_col].dropna().astype(str))
        # Whitespace-only text contains no words, and WordCloud.generate would
        # raise on it, so treat it the same as an empty column.
        if not text.strip():
            return "<p style='text-align:center;'>No text data available in this column to generate a cloud.</p>"

        wordcloud = WordCloud(
            width=800, height=400, background_color='white', colormap='viridis'
        ).generate(text)

        # Encode the rendered image in memory so it can be inlined in the HTML.
        with io.BytesIO() as buf:
            wordcloud.to_image().save(buf, format='png')
            img_str = base64.b64encode(buf.getvalue()).decode('utf-8')

        return f'<div style="text-align:center;"><img src="data:image/png;base64,{img_str}" alt="Word Cloud"></div>'
    except Exception as e:
        # UI boundary: report the failure to the user instead of raising.
        logging.exception("Word cloud generation failed: %s", e)
        return f"β **Error:** Could not generate word cloud. Reason: {e}"
|
|
|
|
|
def perform_clustering(df: pd.DataFrame, numeric_cols: list, n_clusters: int = 4):
    """Run K-Means on the selected features and plot the result.

    Rows with a missing value in any selected column are dropped, the
    remaining data is standardised with ``StandardScaler`` and clustered with
    ``KMeans`` (``random_state=42``). The scatter plot uses the first two
    selected columns as axes and colours points by cluster label.

    Args:
        df: Source data; it is not modified.
        numeric_cols: Names of the feature columns; at least two are required.
        n_clusters: Number of clusters. Converted with ``int()``, so whole
            floats such as ``4.0`` are accepted.

    Returns:
        A ``(figure, markdown)`` tuple. On success the figure is the scatter
        plot and the markdown summarises the run. When the input is unusable
        or clustering fails, the figure is an empty ``go.Figure()`` and the
        markdown explains why.
    """
    # "not numeric_cols" also covers None, which len() would reject.
    if not numeric_cols or len(numeric_cols) < 2:
        return go.Figure(), "Clustering requires at least 2 numeric features."

    try:
        # Normalise once so every comparison and message uses the same integer.
        k = int(n_clusters)
        features = list(numeric_cols)

        # Explicit copy: a 'Cluster' column is added below and must not be
        # written onto a view of the caller's frame.
        cluster_data = df[features].dropna().copy()
        if len(cluster_data) < k:
            return go.Figure(), f"Not enough data points ({len(cluster_data)}) for {k} clusters."

        scaled_data = StandardScaler().fit_transform(cluster_data)
        kmeans = KMeans(n_clusters=k, random_state=42, n_init='auto').fit(scaled_data)
        # String labels make plotly treat clusters as discrete categories.
        cluster_data['Cluster'] = kmeans.labels_.astype(str)

        fig_cluster = px.scatter(
            cluster_data,
            x=features[0],
            y=features[1],
            color='Cluster',
            title=f"<b>K-Means Clustering Result (k={k})</b>",
            template="plotly_white",
            color_discrete_sequence=px.colors.qualitative.Vivid,
        )
        cluster_md = (
            "\n### Clustering Summary\n"
            f"- **Features Used:** {', '.join(features)}\n"
            f"- **Number of Clusters (K):** {k}\n"
            f"- **Insight:** The plot shows the separation of data into {k} "
            "distinct groups based on the selected features.\n"
        )
        return fig_cluster, cluster_md
    except Exception as e:
        # UI boundary: report the failure to the user instead of raising.
        logging.exception("Clustering failed: %s", e)
        return go.Figure(), f"β **Error:** Could not perform clustering. Reason: {e}"