mgbam committed on
Commit
7fa4e3f
·
verified ·
1 Parent(s): 640d10c

Update analysis_modules.py

Browse files
Files changed (1) hide show
  1. analysis_modules.py +93 -37
analysis_modules.py CHANGED
@@ -1,56 +1,112 @@
1
  # analysis_modules.py
2
 
 
 
 
 
3
  import pandas as pd
4
  import plotly.express as px
 
 
 
5
  from statsmodels.tsa.seasonal import seasonal_decompose
6
  from statsmodels.tsa.stattools import adfuller
7
- from sklearn.cluster import KMeans
8
  from wordcloud import WordCloud
9
- import matplotlib.pyplot as plt
10
- import io
11
- import base64
12
 
13
  # --- Time-Series Module ---
14
  def analyze_time_series(df: pd.DataFrame, date_col: str, value_col: str):
15
  """Performs time-series decomposition and stationarity testing."""
16
- df[date_col] = pd.to_datetime(df[date_col])
17
- ts_df = df.set_index(date_col)[value_col].dropna()
18
-
19
- # Decomposition
20
- decomposition = seasonal_decompose(ts_df, model='additive', period=12) # Assuming monthly data
21
- fig_decomp = px.line(pd.DataFrame({'trend': decomposition.trend, 'seasonal': decomposition.seasonal, 'residual': decomposition.resid}),
22
- title=f"Time-Series Decomposition of {value_col}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
- # Stationarity Test (ADF)
25
- adf_result = adfuller(ts_df)
26
- adf_md = f"""
27
- ### Stationarity Analysis (ADF Test)
28
- - **Test Statistic:** `{adf_result[0]:.4f}`
29
- - **p-value:** `{adf_result[1]:.4f}`
30
- - **Conclusion:** The series is likely **{'stationary' if adf_result[1] < 0.05 else 'non-stationary'}**.
31
- """
32
- return fig_decomp, adf_md
 
 
 
 
33
 
34
  # --- Text Analysis Module ---
35
  def generate_word_cloud(df: pd.DataFrame, text_col: str):
36
- """Generates a word cloud from a text column."""
37
- text = ' '.join(df[text_col].dropna().astype(str))
38
- wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
39
-
40
- # Convert matplotlib plot to a data URI for Gradio
41
- buf = io.BytesIO()
42
- wordcloud.to_image().save(buf, format='png')
43
- img_str = "data:image/png;base64," + base64.b64encode(buf.getvalue()).decode('utf-8')
44
- return img_str
 
 
 
 
 
 
 
 
 
 
 
45
 
46
  # --- Clustering Module ---
47
  def perform_clustering(df: pd.DataFrame, numeric_cols: list, n_clusters: int = 4):
48
  """Performs K-Means clustering and returns a scatter plot."""
49
- cluster_data = df[numeric_cols].dropna()
50
- kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto').fit(cluster_data)
51
- cluster_data['Cluster'] = kmeans.labels_.astype(str)
52
-
53
- # For visualization, we'll use the first two numeric columns
54
- fig_cluster = px.scatter(cluster_data, x=numeric_cols[0], y=numeric_cols[1], color='Cluster',
55
- title=f"K-Means Clustering (k={n_clusters})")
56
- return fig_cluster
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # analysis_modules.py
2
 
3
+ import base64
4
+ import io
5
+ import logging
6
+
7
  import pandas as pd
8
  import plotly.express as px
9
+ import plotly.graph_objects as go
10
+ from sklearn.cluster import KMeans
11
+ from sklearn.preprocessing import StandardScaler
12
  from statsmodels.tsa.seasonal import seasonal_decompose
13
  from statsmodels.tsa.stattools import adfuller
 
14
  from wordcloud import WordCloud
 
 
 
15
 
16
  # --- Time-Series Module ---
17
  def analyze_time_series(df: pd.DataFrame, date_col: str, value_col: str):
18
  """Performs time-series decomposition and stationarity testing."""
19
+ if not date_col or not value_col:
20
+ return go.Figure(), "Please select both a date/time column and a value column."
21
+
22
+ try:
23
+ # Prepare data
24
+ ts_df = df.copy()
25
+ ts_df[date_col] = pd.to_datetime(ts_df[date_col])
26
+ ts_df = ts_df.set_index(date_col).sort_index()
27
+ ts_data = ts_df[value_col].dropna()
28
+
29
+ if len(ts_data) < 24: # Need at least 2 periods for decomposition
30
+ return go.Figure(), "Not enough data points (< 24) for time-series decomposition."
31
+
32
+ # Decomposition (assuming monthly data for period=12)
33
+ result = seasonal_decompose(ts_data, model='additive', period=12)
34
+ fig_decomp = px.line(
35
+ pd.DataFrame({'Trend': result.trend, 'Seasonal': result.seasonal, 'Residual': result.resid}),
36
+ title=f"<b>Time-Series Decomposition of '{value_col}'</b>",
37
+ labels={'value': 'Value', 'index': 'Date'},
38
+ template="plotly_white",
39
+ )
40
+ fig_decomp.update_layout(legend_title_text='Components')
41
 
42
+ # Stationarity Test (ADF)
43
+ adf_result = adfuller(ts_data)
44
+ conclusion = 'likely **stationary** (p < 0.05)' if adf_result[1] < 0.05 else 'likely **non-stationary** (p >= 0.05)'
45
+ adf_md = f"""
46
+ ### Stationarity Analysis (ADF Test)
47
+ - **ADF Statistic:** `{adf_result[0]:.4f}`
48
+ - **p-value:** `{adf_result[1]:.4f}`
49
+ - **Conclusion:** The time-series is {conclusion}. A non-stationary series may require differencing for forecasting models.
50
+ """
51
+ return fig_decomp, adf_md
52
+ except Exception as e:
53
+ logging.error(f"Time-series analysis failed: {e}", exc_info=True)
54
+ return go.Figure(), f"❌ **Error:** Could not perform time-series analysis. Reason: {e}"
55
 
56
  # --- Text Analysis Module ---
57
  def generate_word_cloud(df: pd.DataFrame, text_col: str):
58
+ """Generates a word cloud from a text column and returns it as a data URI."""
59
+ if not text_col:
60
+ return None # Return None to hide the HTML component
61
+
62
+ try:
63
+ text = ' '.join(df[text_col].dropna().astype(str))
64
+ if not text:
65
+ return "<p style='text-align:center;'>No text data available in this column to generate a cloud.</p>"
66
+
67
+ wordcloud = WordCloud(width=800, height=400, background_color='white', colormap='viridis').generate(text)
68
+
69
+ # Convert matplotlib plot to a base64 encoded string for Gradio HTML
70
+ buf = io.BytesIO()
71
+ wordcloud.to_image().save(buf, format='png')
72
+ img_str = base64.b64encode(buf.getvalue()).decode('utf-8')
73
+ html_content = f'<div style="text-align:center;"><img src="data:image/png;base64,{img_str}" alt="Word Cloud"></div>'
74
+ return html_content
75
+ except Exception as e:
76
+ logging.error(f"Word cloud generation failed: {e}", exc_info=True)
77
+ return f"❌ **Error:** Could not generate word cloud. Reason: {e}"
78
 
79
  # --- Clustering Module ---
80
  def perform_clustering(df: pd.DataFrame, numeric_cols: list, n_clusters: int = 4):
81
  """Performs K-Means clustering and returns a scatter plot."""
82
+ if len(numeric_cols) < 2:
83
+ return go.Figure(), "Clustering requires at least 2 numeric features."
84
+
85
+ try:
86
+ cluster_data = df[numeric_cols].dropna()
87
+ if len(cluster_data) < n_clusters:
88
+ return go.Figure(), f"Not enough data points ({len(cluster_data)}) for {n_clusters} clusters."
89
+
90
+ # Scale data for better clustering performance
91
+ scaler = StandardScaler()
92
+ scaled_data = scaler.fit_transform(cluster_data)
93
+
94
+ kmeans = KMeans(n_clusters=int(n_clusters), random_state=42, n_init='auto').fit(scaled_data)
95
+ cluster_data['Cluster'] = kmeans.labels_.astype(str)
96
+
97
+ # Visualize using the first two principal components for a more holistic view
98
+ fig_cluster = px.scatter(
99
+ cluster_data, x=numeric_cols[0], y=numeric_cols[1], color='Cluster',
100
+ title=f"<b>K-Means Clustering Result (k={int(n_clusters)})</b>",
101
+ template="plotly_white", color_discrete_sequence=px.colors.qualitative.Vivid
102
+ )
103
+ cluster_md = f"""
104
+ ### Clustering Summary
105
+ - **Features Used:** {', '.join(numeric_cols)}
106
+ - **Number of Clusters (K):** {int(n_clusters)}
107
+ - **Insight:** The plot shows the separation of data into {int(n_clusters)} distinct groups based on the selected features.
108
+ """
109
+ return fig_cluster, cluster_md
110
+ except Exception as e:
111
+ logging.error(f"Clustering failed: {e}", exc_info=True)
112
+ return go.Figure(), f"❌ **Error:** Could not perform clustering. Reason: {e}"