# analysis_modules.py
import base64
import io
import logging
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
from wordcloud import WordCloud
# --- Time-Series Module ---
def analyze_time_series(df: pd.DataFrame, date_col: str, value_col: str, period: int = 12):
    """
    Performs time-series decomposition and stationarity testing with robust error handling.

    Args:
        df (pd.DataFrame): The input DataFrame.
        date_col (str): The name of the column containing datetime information.
        value_col (str): The name of the numeric column to analyze.
        period (int): Seasonal period used for the decomposition. Defaults to 12
            (i.e. monthly data with yearly seasonality); callers with other
            frequencies can override it without breaking existing call sites.

    Returns:
        tuple: A Plotly Figure and a Markdown string with analysis.
    """
    if not date_col or not value_col:
        return go.Figure(), "Please select both a date/time column and a value column to begin analysis."
    try:
        logging.info(f"Analyzing time-series for date='{date_col}' and value='{value_col}'")
        # Work on a copy so the caller's DataFrame is never mutated.
        ts_df = df.copy()
        ts_df[date_col] = pd.to_datetime(ts_df[date_col])
        ts_df = ts_df.set_index(date_col).sort_index()
        ts_data = ts_df[value_col].dropna()
        # Decomposition needs at least two full seasonal cycles to be reliable.
        if len(ts_data) < 2 * period:
            msg = f"Not enough data points ({len(ts_data)}) for a reliable time-series decomposition (requires at least {2*period})."
            logging.warning(msg)
            return go.Figure().update_layout(title=msg), ""
        # Decomposition
        result = seasonal_decompose(ts_data, model='additive', period=period)
        fig_decomp = px.line(
            pd.DataFrame({'Trend': result.trend, 'Seasonal': result.seasonal, 'Residual': result.resid}),
            title=f"<b>Time-Series Decomposition of '{value_col}'</b>",
            labels={'value': 'Value', 'index': 'Date'}, template="plotly_white"
        ).update_layout(legend_title_text='Components')
        # Stationarity Test (ADF): adf_result[0] is the test statistic, [1] the p-value.
        adf_result = adfuller(ts_data)
        conclusion = 'likely **stationary** (p < 0.05)' if adf_result[1] < 0.05 else 'likely **non-stationary** (p >= 0.05)'
        adf_md = f"""
### Stationarity Analysis (Augmented Dickey-Fuller Test)
- **ADF Statistic:** `{adf_result[0]:.4f}`
- **p-value:** `{adf_result[1]:.4f}`
- **Conclusion:** The time-series is {conclusion}. Non-stationary series often require differencing before being used in forecasting models like ARIMA.
"""
        return fig_decomp, adf_md
    except Exception as e:
        # Broad catch is deliberate: this feeds a UI and must degrade gracefully.
        logging.error(f"Time-series analysis failed: {e}", exc_info=True)
        return go.Figure(), f"❌ **Error:** Could not perform analysis. Please ensure the date column is a valid time format and the value column is numeric. \n`{e}`"
# --- Text Analysis Module ---
def generate_word_cloud(df: pd.DataFrame, text_col: str):
    """
    Generates a word cloud from a text column and returns it as an HTML object.

    Args:
        df (pd.DataFrame): The input DataFrame.
        text_col (str): The name of the column containing text data.

    Returns:
        str: An HTML string containing the word cloud image (as an embedded
            base64 PNG) or a user-facing error/placeholder message.
    """
    if not text_col:
        return "<p style='text-align:center; padding: 20px;'>Select a text column to generate a word cloud.</p>"
    try:
        logging.info(f"Generating word cloud for column '{text_col}'")
        # Drop NaNs and coerce everything to str so mixed-type columns don't fail the join.
        text = ' '.join(df[text_col].dropna().astype(str))
        if not text.strip():
            return "<p style='text-align:center; padding: 20px;'>No text data available in this column to generate a cloud.</p>"
        wordcloud = WordCloud(width=800, height=400, background_color='white', colormap='viridis', max_words=150).generate(text)
        # Render to an in-memory PNG and embed it as a base64 data URI (no temp files).
        buf = io.BytesIO()
        wordcloud.to_image().save(buf, format='png')
        img_str = base64.b64encode(buf.getvalue()).decode('utf-8')
        html_content = f'<div style="text-align:center;"><img src="data:image/png;base64,{img_str}" alt="Word Cloud for {text_col}" style="border-radius: 8px;"></div>'
        return html_content
    except Exception as e:
        # Broad catch is deliberate: this feeds a UI and must degrade gracefully.
        logging.error(f"Word cloud generation failed: {e}", exc_info=True)
        return f"<p style='text-align:center; color:red; padding: 20px;'>❌ **Error:** Could not generate word cloud. Reason: {e}</p>"
# --- Clustering Module ---
def perform_clustering(df: pd.DataFrame, numeric_cols: list, n_clusters: int = 4):
    """
    Performs K-Means clustering using best practices (scaling and PCA for visualization).

    Args:
        df (pd.DataFrame): The input DataFrame.
        numeric_cols (list): A list of numeric columns to use for clustering.
        n_clusters (int): The number of clusters (k) to create.

    Returns:
        tuple: A Plotly Figure and a Markdown string with analysis.
    """
    if len(numeric_cols) < 2:
        return go.Figure(), "Clustering requires at least 2 numeric features. Please select a dataset with more numeric columns."
    try:
        # Normalize k once up front instead of re-casting it at every use site.
        k = int(n_clusters)
        logging.info(f"Performing K-Means clustering with k={k} on {len(numeric_cols)} features.")
        cluster_data = df[numeric_cols].dropna()
        if len(cluster_data) < k:
            return go.Figure(), f"Not enough data points ({len(cluster_data)}) for {k} clusters."
        # Step 1: Scale data - Crucial for distance-based algorithms like K-Means
        scaler = StandardScaler()
        scaled_data = scaler.fit_transform(cluster_data)
        # Step 2: Perform K-Means clustering
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(scaled_data)
        # Step 3: Use PCA to reduce dimensionality for a meaningful 2D visualization
        pca = PCA(n_components=2)
        components = pca.fit_transform(scaled_data)
        # Build a dedicated plotting frame instead of mutating the dropna() slice:
        # avoids a pandas SettingWithCopyWarning and any collision with existing
        # columns named 'Cluster'/'PCA1'/'PCA2' in the input data.
        plot_df = pd.DataFrame(
            {
                'PCA1': components[:, 0],
                'PCA2': components[:, 1],
                'Cluster': kmeans.labels_.astype(str),
            },
            index=cluster_data.index,
        )
        # Step 4: Create the plot using the principal components
        fig_cluster = px.scatter(
            plot_df, x='PCA1', y='PCA2', color='Cluster',
            title=f"<b>K-Means Clustering Visualization (k={k})</b>",
            labels={'PCA1': 'Principal Component 1', 'PCA2': 'Principal Component 2'},
            template="plotly_white", color_discrete_sequence=px.colors.qualitative.Vivid
        )
        explained_variance = pca.explained_variance_ratio_.sum() * 100
        cluster_md = f"""
### Clustering Summary & Methodology
- **Features Used:** `{len(numeric_cols)}` numeric features were scaled and used for clustering.
- **Number of Clusters (K):** `{k}`
- **Visualization:** To visualize the high-dimensional clusters in 2D, Principal Component Analysis (PCA) was used.
- **Explained Variance:** The two components shown explain **{explained_variance:.2f}%** of the variance in the data.
"""
        return fig_cluster, cluster_md
    except Exception as e:
        # Broad catch is deliberate: this feeds a UI and must degrade gracefully.
        logging.error(f"Clustering failed: {e}", exc_info=True)
        return go.Figure(), f"❌ **Error:** Could not perform clustering. \n`{e}`"