mgbam committed on
Commit cff0e3d · verified · 1 Parent(s): f9d0aef

Update analysis_modules.py

Files changed (1)
  1. analysis_modules.py +78 -34
analysis_modules.py CHANGED
@@ -8,6 +8,7 @@ import pandas as pd
 import plotly.express as px
 import plotly.graph_objects as go
 from sklearn.cluster import KMeans
+from sklearn.decomposition import PCA
 from sklearn.preprocessing import StandardScaler
 from statsmodels.tsa.seasonal import seasonal_decompose
 from statsmodels.tsa.stattools import adfuller
@@ -15,98 +16,141 @@ from wordcloud import WordCloud
 
 # --- Time-Series Module ---
 def analyze_time_series(df: pd.DataFrame, date_col: str, value_col: str):
-    """Performs time-series decomposition and stationarity testing."""
+    """
+    Performs time-series decomposition and stationarity testing with robust error handling.
+
+    Args:
+        df (pd.DataFrame): The input DataFrame.
+        date_col (str): The name of the column containing datetime information.
+        value_col (str): The name of the numeric column to analyze.
+
+    Returns:
+        tuple: A Plotly Figure and a Markdown string with analysis.
+    """
     if not date_col or not value_col:
-        return go.Figure(), "Please select both a date/time column and a value column."
+        return go.Figure(), "Please select both a date/time column and a value column to begin analysis."
 
     try:
-        # Prepare data
+        logging.info(f"Analyzing time-series for date='{date_col}' and value='{value_col}'")
         ts_df = df.copy()
         ts_df[date_col] = pd.to_datetime(ts_df[date_col])
         ts_df = ts_df.set_index(date_col).sort_index()
         ts_data = ts_df[value_col].dropna()
 
-        if len(ts_data) < 24: # Need at least 2 periods for decomposition
-            return go.Figure(), "Not enough data points (< 24) for time-series decomposition."
+        # A common period for decomposition is 12 (monthly), require at least 2 full periods.
+        period = 12
+        if len(ts_data) < 2 * period:
+            msg = f"Not enough data points ({len(ts_data)}) for a reliable time-series decomposition (requires at least {2*period})."
+            logging.warning(msg)
+            return go.Figure().update_layout(title=msg), ""
 
-        # Decomposition (assuming monthly data for period=12)
-        result = seasonal_decompose(ts_data, model='additive', period=12)
+        # Decomposition
+        result = seasonal_decompose(ts_data, model='additive', period=period)
         fig_decomp = px.line(
             pd.DataFrame({'Trend': result.trend, 'Seasonal': result.seasonal, 'Residual': result.resid}),
             title=f"<b>Time-Series Decomposition of '{value_col}'</b>",
-            labels={'value': 'Value', 'index': 'Date'},
-            template="plotly_white",
-        )
-        fig_decomp.update_layout(legend_title_text='Components')
+            labels={'value': 'Value', 'index': 'Date'}, template="plotly_white"
+        ).update_layout(legend_title_text='Components')
 
         # Stationarity Test (ADF)
         adf_result = adfuller(ts_data)
         conclusion = 'likely **stationary** (p < 0.05)' if adf_result[1] < 0.05 else 'likely **non-stationary** (p >= 0.05)'
         adf_md = f"""
-        ### Stationarity Analysis (ADF Test)
+        ### Stationarity Analysis (Augmented Dickey-Fuller Test)
         - **ADF Statistic:** `{adf_result[0]:.4f}`
         - **p-value:** `{adf_result[1]:.4f}`
-        - **Conclusion:** The time-series is {conclusion}. A non-stationary series may require differencing for forecasting models.
+        - **Conclusion:** The time-series is {conclusion}. Non-stationary series often require differencing before being used in forecasting models like ARIMA.
         """
         return fig_decomp, adf_md
     except Exception as e:
         logging.error(f"Time-series analysis failed: {e}", exc_info=True)
-        return go.Figure(), f"❌ **Error:** Could not perform time-series analysis. Reason: {e}"
+        return go.Figure(), f"❌ **Error:** Could not perform analysis. Please ensure the date column is a valid time format and the value column is numeric. \n`{e}`"
 
 # --- Text Analysis Module ---
 def generate_word_cloud(df: pd.DataFrame, text_col: str):
-    """Generates a word cloud from a text column and returns it as a data URI."""
+    """
+    Generates a word cloud from a text column and returns it as an HTML object.
+
+    Args:
+        df (pd.DataFrame): The input DataFrame.
+        text_col (str): The name of the column containing text data.
+
+    Returns:
+        str: An HTML string containing the word cloud image or an error message.
+    """
     if not text_col:
-        return None # Return None to hide the HTML component
+        return "<p style='text-align:center; padding: 20px;'>Select a text column to generate a word cloud.</p>"
 
     try:
+        logging.info(f"Generating word cloud for column '{text_col}'")
         text = ' '.join(df[text_col].dropna().astype(str))
-        if not text:
-            return "<p style='text-align:center;'>No text data available in this column to generate a cloud.</p>"
+        if not text.strip():
+            return "<p style='text-align:center; padding: 20px;'>No text data available in this column to generate a cloud.</p>"
 
-        wordcloud = WordCloud(width=800, height=400, background_color='white', colormap='viridis').generate(text)
+        wordcloud = WordCloud(width=800, height=400, background_color='white', colormap='viridis', max_words=150).generate(text)
 
-        # Convert matplotlib plot to a base64 encoded string for Gradio HTML
         buf = io.BytesIO()
        wordcloud.to_image().save(buf, format='png')
         img_str = base64.b64encode(buf.getvalue()).decode('utf-8')
-        html_content = f'<div style="text-align:center;"><img src="data:image/png;base64,{img_str}" alt="Word Cloud"></div>'
+        html_content = f'<div style="text-align:center;"><img src="data:image/png;base64,{img_str}" alt="Word Cloud for {text_col}" style="border-radius: 8px;"></div>'
         return html_content
     except Exception as e:
         logging.error(f"Word cloud generation failed: {e}", exc_info=True)
-        return f"❌ **Error:** Could not generate word cloud. Reason: {e}"
+        return f"<p style='text-align:center; color:red; padding: 20px;'>❌ **Error:** Could not generate word cloud. Reason: {e}</p>"
 
 # --- Clustering Module ---
 def perform_clustering(df: pd.DataFrame, numeric_cols: list, n_clusters: int = 4):
-    """Performs K-Means clustering and returns a scatter plot."""
+    """
+    Performs K-Means clustering using best practices (scaling and PCA for visualization).
+
+    Args:
+        df (pd.DataFrame): The input DataFrame.
+        numeric_cols (list): A list of numeric columns to use for clustering.
+        n_clusters (int): The number of clusters (k) to create.
+
+    Returns:
+        tuple: A Plotly Figure and a Markdown string with analysis.
+    """
     if len(numeric_cols) < 2:
-        return go.Figure(), "Clustering requires at least 2 numeric features."
+        return go.Figure(), "Clustering requires at least 2 numeric features. Please select a dataset with more numeric columns."
 
     try:
+        logging.info(f"Performing K-Means clustering with k={n_clusters} on {len(numeric_cols)} features.")
         cluster_data = df[numeric_cols].dropna()
         if len(cluster_data) < n_clusters:
             return go.Figure(), f"Not enough data points ({len(cluster_data)}) for {n_clusters} clusters."
 
-        # Scale data for better clustering performance
+        # Step 1: Scale data - Crucial for distance-based algorithms like K-Means
         scaler = StandardScaler()
         scaled_data = scaler.fit_transform(cluster_data)
 
-        kmeans = KMeans(n_clusters=int(n_clusters), random_state=42, n_init='auto').fit(scaled_data)
+        # Step 2: Perform K-Means clustering
+        kmeans = KMeans(n_clusters=int(n_clusters), random_state=42, n_init=10).fit(scaled_data)
         cluster_data['Cluster'] = kmeans.labels_.astype(str)
 
-        # Visualize using the first two principal components for a more holistic view
+        # Step 3: Use PCA to reduce dimensionality for a meaningful 2D visualization
+        pca = PCA(n_components=2)
+        components = pca.fit_transform(scaled_data)
+        cluster_data['PCA1'] = components[:, 0]
+        cluster_data['PCA2'] = components[:, 1]
+
+        # Step 4: Create the plot using the principal components
         fig_cluster = px.scatter(
-            cluster_data, x=numeric_cols[0], y=numeric_cols[1], color='Cluster',
-            title=f"<b>K-Means Clustering Result (k={int(n_clusters)})</b>",
+            cluster_data, x='PCA1', y='PCA2', color='Cluster',
+            title=f"<b>K-Means Clustering Visualization (k={int(n_clusters)})</b>",
+            labels={'PCA1': 'Principal Component 1', 'PCA2': 'Principal Component 2'},
             template="plotly_white", color_discrete_sequence=px.colors.qualitative.Vivid
         )
+
+        explained_variance = pca.explained_variance_ratio_.sum() * 100
         cluster_md = f"""
-        ### Clustering Summary
-        - **Features Used:** {', '.join(numeric_cols)}
-        - **Number of Clusters (K):** {int(n_clusters)}
-        - **Insight:** The plot shows the separation of data into {int(n_clusters)} distinct groups based on the selected features.
+        ### Clustering Summary & Methodology
+        - **Features Used:** `{len(numeric_cols)}` numeric features were scaled and used for clustering.
+        - **Number of Clusters (K):** `{int(n_clusters)}`
+        - **Visualization:** To visualize the high-dimensional clusters in 2D, Principal Component Analysis (PCA) was used.
+        - **Explained Variance:** The two components shown explain **{explained_variance:.2f}%** of the variance in the data.
         """
         return fig_cluster, cluster_md
     except Exception as e:
        logging.error(f"Clustering failed: {e}", exc_info=True)
-        return go.Figure(), f"❌ **Error:** Could not perform clustering. Reason: {e}"
+        return go.Figure(), f"❌ **Error:** Could not perform clustering. \n`{e}`"
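
Usage note: below is a minimal, hypothetical sketch for exercising the three updated functions locally. It is not part of this commit; the synthetic DataFrame and its column names (date, sales, review, height, weight) are illustrative assumptions, while the module name, function names, and signatures come from analysis_modules.py above.

# Hypothetical smoke test (not part of the commit): calls the three public
# functions on a small synthetic DataFrame that satisfies their minimum-size checks.
import numpy as np
import pandas as pd
from analysis_modules import analyze_time_series, generate_word_cloud, perform_clustering

rng = np.random.default_rng(42)
df = pd.DataFrame({
    "date": pd.date_range("2020-01-01", periods=36, freq="MS"),  # 36 monthly points >= 2 * period (12)
    "sales": np.linspace(100, 200, 36) + rng.normal(0, 5, 36),   # upward trend plus noise
    "review": ["great product fast shipping"] * 36,              # toy text column for the word cloud
    "height": rng.normal(170, 10, 36),
    "weight": rng.normal(70, 8, 36),
})

fig_decomp, adf_md = analyze_time_series(df, date_col="date", value_col="sales")
html_cloud = generate_word_cloud(df, text_col="review")
fig_cluster, cluster_md = perform_clustering(df, numeric_cols=["height", "weight"], n_clusters=3)

print(adf_md)       # Markdown summary of the ADF stationarity test
print(cluster_md)   # Markdown summary of the clustering run, incl. PCA explained variance
fig_decomp.show()
fig_cluster.show()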