Update analysis_modules.py
Browse files- analysis_modules.py +78 -34
analysis_modules.py
CHANGED
@@ -8,6 +8,7 @@ import pandas as pd
|
|
8 |
import plotly.express as px
|
9 |
import plotly.graph_objects as go
|
10 |
from sklearn.cluster import KMeans
|
|
|
11 |
from sklearn.preprocessing import StandardScaler
|
12 |
from statsmodels.tsa.seasonal import seasonal_decompose
|
13 |
from statsmodels.tsa.stattools import adfuller
|
@@ -15,98 +16,141 @@ from wordcloud import WordCloud
|
|
15 |
|
16 |
# --- Time-Series Module ---
|
17 |
def analyze_time_series(df: pd.DataFrame, date_col: str, value_col: str):
|
18 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
if not date_col or not value_col:
|
20 |
-
return go.Figure(), "Please select both a date/time column and a value column."
|
21 |
|
22 |
try:
|
23 |
-
|
24 |
ts_df = df.copy()
|
25 |
ts_df[date_col] = pd.to_datetime(ts_df[date_col])
|
26 |
ts_df = ts_df.set_index(date_col).sort_index()
|
27 |
ts_data = ts_df[value_col].dropna()
|
28 |
|
29 |
-
|
30 |
-
|
|
|
|
|
|
|
|
|
31 |
|
32 |
-
# Decomposition
|
33 |
-
result = seasonal_decompose(ts_data, model='additive', period=
|
34 |
fig_decomp = px.line(
|
35 |
pd.DataFrame({'Trend': result.trend, 'Seasonal': result.seasonal, 'Residual': result.resid}),
|
36 |
title=f"<b>Time-Series Decomposition of '{value_col}'</b>",
|
37 |
-
labels={'value': 'Value', 'index': 'Date'},
|
38 |
-
|
39 |
-
)
|
40 |
-
fig_decomp.update_layout(legend_title_text='Components')
|
41 |
|
42 |
# Stationarity Test (ADF)
|
43 |
adf_result = adfuller(ts_data)
|
44 |
conclusion = 'likely **stationary** (p < 0.05)' if adf_result[1] < 0.05 else 'likely **non-stationary** (p >= 0.05)'
|
45 |
adf_md = f"""
|
46 |
-
### Stationarity Analysis (
|
47 |
- **ADF Statistic:** `{adf_result[0]:.4f}`
|
48 |
- **p-value:** `{adf_result[1]:.4f}`
|
49 |
-
- **Conclusion:** The time-series is {conclusion}.
|
50 |
"""
|
51 |
return fig_decomp, adf_md
|
52 |
except Exception as e:
|
53 |
logging.error(f"Time-series analysis failed: {e}", exc_info=True)
|
54 |
-
return go.Figure(), f"β **Error:** Could not perform time
|
55 |
|
56 |
# --- Text Analysis Module ---
|
57 |
def generate_word_cloud(df: pd.DataFrame, text_col: str):
|
58 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
if not text_col:
|
60 |
-
return
|
61 |
|
62 |
try:
|
|
|
63 |
text = ' '.join(df[text_col].dropna().astype(str))
|
64 |
-
if not text:
|
65 |
-
return "<p style='text-align:center;'>No text data available in this column to generate a cloud.</p>"
|
66 |
|
67 |
-
wordcloud = WordCloud(width=800, height=400, background_color='white', colormap='viridis').generate(text)
|
68 |
|
69 |
-
# Convert matplotlib plot to a base64 encoded string for Gradio HTML
|
70 |
buf = io.BytesIO()
|
71 |
wordcloud.to_image().save(buf, format='png')
|
72 |
img_str = base64.b64encode(buf.getvalue()).decode('utf-8')
|
73 |
-
html_content = f'<div style="text-align:center;"><img src="data:image/png;base64,{img_str}" alt="Word Cloud"></div>'
|
74 |
return html_content
|
75 |
except Exception as e:
|
76 |
logging.error(f"Word cloud generation failed: {e}", exc_info=True)
|
77 |
-
return f"
|
78 |
|
79 |
# --- Clustering Module ---
|
80 |
def perform_clustering(df: pd.DataFrame, numeric_cols: list, n_clusters: int = 4):
|
81 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
82 |
if len(numeric_cols) < 2:
|
83 |
-
return go.Figure(), "Clustering requires at least 2 numeric features."
|
84 |
|
85 |
try:
|
|
|
86 |
cluster_data = df[numeric_cols].dropna()
|
87 |
if len(cluster_data) < n_clusters:
|
88 |
return go.Figure(), f"Not enough data points ({len(cluster_data)}) for {n_clusters} clusters."
|
89 |
|
90 |
-
# Scale data for
|
91 |
scaler = StandardScaler()
|
92 |
scaled_data = scaler.fit_transform(cluster_data)
|
93 |
|
94 |
-
|
|
|
95 |
cluster_data['Cluster'] = kmeans.labels_.astype(str)
|
96 |
|
97 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
98 |
fig_cluster = px.scatter(
|
99 |
-
cluster_data, x=
|
100 |
-
title=f"<b>K-Means Clustering
|
|
|
101 |
template="plotly_white", color_discrete_sequence=px.colors.qualitative.Vivid
|
102 |
)
|
|
|
|
|
103 |
cluster_md = f"""
|
104 |
-
### Clustering Summary
|
105 |
-
- **Features Used:** {
|
106 |
-
- **Number of Clusters (K):** {int(n_clusters)}
|
107 |
-
- **
|
|
|
108 |
"""
|
109 |
return fig_cluster, cluster_md
|
110 |
except Exception as e:
|
111 |
logging.error(f"Clustering failed: {e}", exc_info=True)
|
112 |
-
return go.Figure(), f"β **Error:** Could not perform clustering.
|
|
|
8 |
import plotly.express as px
|
9 |
import plotly.graph_objects as go
|
10 |
from sklearn.cluster import KMeans
|
11 |
+
from sklearn.decomposition import PCA
|
12 |
from sklearn.preprocessing import StandardScaler
|
13 |
from statsmodels.tsa.seasonal import seasonal_decompose
|
14 |
from statsmodels.tsa.stattools import adfuller
|
|
|
16 |
|
17 |
# --- Time-Series Module ---
|
18 |
def analyze_time_series(df: pd.DataFrame, date_col: str, value_col: str, *, period: int = 12):
    """
    Performs time-series decomposition and stationarity testing with robust error handling.

    The values of ``value_col`` are indexed by ``date_col`` (parsed with
    ``pd.to_datetime``), sorted by date and stripped of nulls. The resulting
    series is decomposed additively into trend, seasonal and residual
    components, and an Augmented Dickey-Fuller test is run on the same series.

    Args:
        df (pd.DataFrame): The input DataFrame. It is not modified.
        date_col (str): The name of the column containing datetime information.
        value_col (str): The name of the numeric column to analyze.
        period (int): Number of observations per seasonal cycle, passed to
            ``seasonal_decompose``. Defaults to 12. At least ``2 * period``
            non-null observations are required.

    Returns:
        tuple: A Plotly Figure and a Markdown string with analysis.
            - No column selected: an empty figure and a prompt.
            - Too few observations: an empty figure whose title carries the
              explanation, and an empty string.
            - Any failure: an empty figure and an error message.
    """
    if not date_col or not value_col:
        return go.Figure(), "Please select both a date/time column and a value column to begin analysis."

    try:
        logging.info("Analyzing time-series for date='%s' and value='%s'", date_col, value_col)
        # UI widgets may hand over floats; the decomposition needs an integer.
        period = int(period)

        # Copy only the two columns that are used instead of the whole frame.
        ts_df = df[[date_col, value_col]].copy()
        ts_df[date_col] = pd.to_datetime(ts_df[date_col])
        ts_data = ts_df.set_index(date_col).sort_index()[value_col].dropna()

        # Require at least two full seasonal cycles for the decomposition.
        min_points = 2 * period
        if len(ts_data) < min_points:
            msg = f"Not enough data points ({len(ts_data)}) for a reliable time-series decomposition (requires at least {min_points})."
            logging.warning(msg)
            return go.Figure().update_layout(title=msg), ""

        # Decomposition
        result = seasonal_decompose(ts_data, model='additive', period=period)
        components = pd.DataFrame(
            {'Trend': result.trend, 'Seasonal': result.seasonal, 'Residual': result.resid}
        )
        fig_decomp = px.line(
            components,
            title=f"<b>Time-Series Decomposition of '{value_col}'</b>",
            labels={'value': 'Value', 'index': 'Date'},
            template="plotly_white",
        ).update_layout(legend_title_text='Components')

        # Stationarity Test (ADF)
        adf_result = adfuller(ts_data)
        adf_stat, p_value = adf_result[0], adf_result[1]
        if p_value < 0.05:
            conclusion = 'likely **stationary** (p < 0.05)'
        else:
            conclusion = 'likely **non-stationary** (p >= 0.05)'

        # Built line by line so that no source indentation ends up in the Markdown.
        adf_md = (
            "\n"
            "### Stationarity Analysis (Augmented Dickey-Fuller Test)\n"
            f"- **ADF Statistic:** `{adf_stat:.4f}`\n"
            f"- **p-value:** `{p_value:.4f}`\n"
            f"- **Conclusion:** The time-series is {conclusion}. Non-stationary series often require "
            "differencing before being used in forecasting models like ARIMA.\n"
        )
        return fig_decomp, adf_md
    except Exception as e:
        # Top-level boundary for the UI: log the traceback and report the failure as text.
        logging.exception("Time-series analysis failed: %s", e)
        return go.Figure(), f"❌ **Error:** Could not perform analysis. Please ensure the date column is a valid time format and the value column is numeric. \n`{e}`"
|
68 |
|
69 |
# --- Text Analysis Module ---
|
70 |
def generate_word_cloud(df: pd.DataFrame, text_col: str):
    """
    Generates a word cloud from a text column and returns it as an HTML object.

    All non-null values of ``text_col`` are converted to strings and joined
    with spaces. The rendered cloud is embedded in the returned HTML as a
    base64-encoded PNG.

    Args:
        df (pd.DataFrame): The input DataFrame.
        text_col (str): The name of the column containing text data.

    Returns:
        str: An HTML string containing the word cloud image, a hint when no
            column is selected or the column holds no text, or an error message.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    if not text_col:
        return "<p style='text-align:center; padding: 20px;'>Select a text column to generate a word cloud.</p>"

    try:
        logging.info("Generating word cloud for column '%s'", text_col)
        text = ' '.join(df[text_col].dropna().astype(str))
        if not text.strip():
            return "<p style='text-align:center; padding: 20px;'>No text data available in this column to generate a cloud.</p>"

        wordcloud = WordCloud(
            width=800,
            height=400,
            background_color='white',
            colormap='viridis',
            max_words=150,
        ).generate(text)

        # Serialise the rendered image to PNG bytes and inline it as a data URI.
        buf = io.BytesIO()
        wordcloud.to_image().save(buf, format='png')
        img_str = base64.b64encode(buf.getvalue()).decode('utf-8')

        # The column name comes from user data: escape it before it enters an attribute.
        alt_text = escape(f"Word Cloud for {text_col}", quote=True)
        return (
            '<div style="text-align:center;">'
            f'<img src="data:image/png;base64,{img_str}" alt="{alt_text}" style="border-radius: 8px;">'
            '</div>'
        )
    except Exception as e:
        # Top-level boundary for the UI: log the traceback and report the failure as HTML.
        logging.exception("Word cloud generation failed: %s", e)
        # Exception text can echo user-controlled values (e.g. a column name), so escape it.
        reason = escape(str(e), quote=True)
        return f"<p style='text-align:center; color:red; padding: 20px;'>❌ <b>Error:</b> Could not generate word cloud. Reason: {reason}</p>"
|
100 |
|
101 |
# --- Clustering Module ---
|
102 |
def perform_clustering(df: pd.DataFrame, numeric_cols: list, n_clusters: int = 4):
    """
    Performs K-Means clustering using best practices (scaling and PCA for visualization).

    Rows with a null in any selected column are dropped. The remaining data is
    standardised, clustered with K-Means, and projected onto two principal
    components for plotting.

    Args:
        df (pd.DataFrame): The input DataFrame. It is not modified.
        numeric_cols (list): A list of numeric columns to use for clustering.
            At least two are required.
        n_clusters (int): The number of clusters (k) to create. The value is
            converted with ``int()`` before use.

    Returns:
        tuple: A Plotly Figure and a Markdown string with analysis. When the
            input is insufficient or clustering fails, the figure is empty and
            the string explains why.
    """
    # `not numeric_cols` also covers None, which would make len() raise outside the try.
    if not numeric_cols or len(numeric_cols) < 2:
        return go.Figure(), "Clustering requires at least 2 numeric features. Please select a dataset with more numeric columns."

    try:
        # Normalise once so the guard, the model and the report all use the same k.
        k = int(n_clusters)
        logging.info("Performing K-Means clustering with k=%s on %s features.", k, len(numeric_cols))

        cluster_data = df[numeric_cols].dropna()
        if len(cluster_data) < k:
            return go.Figure(), f"Not enough data points ({len(cluster_data)}) for {k} clusters."

        # Step 1: Scale data - Crucial for distance-based algorithms like K-Means
        scaled_data = StandardScaler().fit_transform(cluster_data)

        # Step 2: Perform K-Means clustering
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(scaled_data)

        # Step 3: Use PCA to reduce dimensionality for a meaningful 2D visualization
        pca = PCA(n_components=2)
        components = pca.fit_transform(scaled_data)

        # Labels are cast to str so Plotly treats them as discrete categories.
        plot_data = cluster_data.assign(
            Cluster=kmeans.labels_.astype(str),
            PCA1=components[:, 0],
            PCA2=components[:, 1],
        )

        # Step 4: Create the plot using the principal components
        fig_cluster = px.scatter(
            plot_data,
            x='PCA1',
            y='PCA2',
            color='Cluster',
            title=f"<b>K-Means Clustering Visualization (k={k})</b>",
            labels={'PCA1': 'Principal Component 1', 'PCA2': 'Principal Component 2'},
            template="plotly_white",
            color_discrete_sequence=px.colors.qualitative.Vivid,
        )

        explained_variance = pca.explained_variance_ratio_.sum() * 100
        # Built line by line so that no source indentation ends up in the Markdown.
        cluster_md = (
            "\n"
            "### Clustering Summary & Methodology\n"
            f"- **Features Used:** `{len(numeric_cols)}` numeric features were scaled and used for clustering.\n"
            f"- **Number of Clusters (K):** `{k}`\n"
            "- **Visualization:** To visualize the high-dimensional clusters in 2D, "
            "Principal Component Analysis (PCA) was used.\n"
            f"- **Explained Variance:** The two components shown explain **{explained_variance:.2f}%** "
            "of the variance in the data.\n"
        )
        return fig_cluster, cluster_md
    except Exception as e:
        # Top-level boundary for the UI: log the traceback and report the failure as text.
        logging.exception("Clustering failed: %s", e)
        return go.Figure(), f"❌ **Error:** Could not perform clustering. \n`{e}`"
|