Spaces:

mgbam
/

PhoenixUI

Running

App Files Community

mgbam committed 3 days ago

Commit

cff0e3d

verified ·

1 Parent(s): f9d0aef

Update analysis_modules.py

Browse files

Files changed (1) hide show

analysis_modules.py +78 -34

analysis_modules.py CHANGED Viewed

@@ -8,6 +8,7 @@ import pandas as pd
 import plotly.express as px
 import plotly.graph_objects as go
 from sklearn.cluster import KMeans
 from sklearn.preprocessing import StandardScaler
 from statsmodels.tsa.seasonal import seasonal_decompose
 from statsmodels.tsa.stattools import adfuller
@@ -15,98 +16,141 @@ from wordcloud import WordCloud
 # --- Time-Series Module ---
 def analyze_time_series(df: pd.DataFrame, date_col: str, value_col: str, period: int = 12):
     """
     Performs time-series decomposition and stationarity testing.
     Args:
         df (pd.DataFrame): The input DataFrame. It is copied; the caller's frame is not modified.
         date_col (str): The name of the column parsed with ``pd.to_datetime`` and used as the index.
         value_col (str): The name of the column to analyze; values are coerced to numeric.
         period (int): Seasonal period, in observations, used for the decomposition.
             Defaults to 12. At least ``2 * period`` usable points are required.
     Returns:
         tuple: A Plotly Figure and a Markdown string. When inputs are missing, data is
         insufficient, or any step raises, the figure is empty and the string explains why.
     """
     if not date_col or not value_col:
         return go.Figure(), "Please select both a date/time column and a value column."
     try:
         period = int(period)
         if period < 2:
             raise ValueError(f"period must be an integer >= 2, got {period}")

         # Prepare data
         ts_df = df.copy()
         ts_df[date_col] = pd.to_datetime(ts_df[date_col])
         # Rows without a timestamp cannot be ordered in time, so drop them before indexing.
         ts_df = ts_df.dropna(subset=[date_col]).set_index(date_col).sort_index()
         # Numeric text (e.g. from a CSV upload) is converted; unparseable cells become NaN and are dropped.
         ts_data = pd.to_numeric(ts_df[value_col], errors='coerce').dropna()

         min_points = 2 * period  # Need at least 2 full periods for decomposition
         if len(ts_data) < min_points:
             return go.Figure(), f"Not enough data points (< {min_points}) for time-series decomposition."

         # Decomposition
         result = seasonal_decompose(ts_data, model='additive', period=period)
         components = pd.DataFrame({'Trend': result.trend, 'Seasonal': result.seasonal, 'Residual': result.resid})
         fig_decomp = px.line(
             components,
             title=f"<b>Time-Series Decomposition of '{value_col}'</b>",
             # The index is named after date_col, so key the x-axis label on that name as well as 'index'.
             labels={'value': 'Value', 'index': 'Date', date_col: 'Date'},
             template="plotly_white",
         )
         fig_decomp.update_layout(legend_title_text='Components')

         # Stationarity Test (ADF)
         adf_result = adfuller(ts_data)
         conclusion = 'likely **stationary** (p < 0.05)' if adf_result[1] < 0.05 else 'likely **non-stationary** (p >= 0.05)'
         adf_md = f"""
         ### Stationarity Analysis (ADF Test)
         - **ADF Statistic:** `{adf_result[0]:.4f}`
         - **p-value:** `{adf_result[1]:.4f}`
         - **Conclusion:** The time-series is {conclusion}. A non-stationary series may require differencing for forecasting models.
         """
         return fig_decomp, adf_md
     except Exception as e:
         # UI boundary: report the failure to the user instead of propagating it.
         logging.error(f"Time-series analysis failed: {e}", exc_info=True)
         return go.Figure(), f"❌ **Error:** Could not perform time-series analysis. Reason: {e}"
 # --- Text Analysis Module ---
 def generate_word_cloud(df: pd.DataFrame, text_col: str):
     """
     Generates a word cloud from a text column and returns it as an HTML snippet.
     Args:
         df (pd.DataFrame): The input DataFrame.
         text_col (str): The name of the column containing text data.
     Returns:
         str | None: None when no column is selected (hides the HTML component);
         otherwise an HTML string holding the base64-embedded PNG, a "no data"
         notice, or an error message with the exception text HTML-escaped.
     """
     # Function-scope import keeps this block self-contained.
     from html import escape

     if not text_col:
         return None  # Return None to hide the HTML component
     try:
         text = ' '.join(df[text_col].dropna().astype(str))
         # strip() so that a column holding only whitespace is treated as empty too.
         if not text.strip():
             return "<p style='text-align:center;'>No text data available in this column to generate a cloud.</p>"

         wordcloud = WordCloud(width=800, height=400, background_color='white', colormap='viridis').generate(text)

         # Render the cloud to PNG in memory and embed it as a base64 data URI.
         buf = io.BytesIO()
         wordcloud.to_image().save(buf, format='png')
         img_str = base64.b64encode(buf.getvalue()).decode('utf-8')
         html_content = f'<div style="text-align:center;"><img src="data:image/png;base64,{img_str}" alt="Word Cloud"></div>'
         return html_content
     except Exception as e:
         logging.error(f"Word cloud generation failed: {e}", exc_info=True)
         # The result is rendered as HTML: use <b> rather than Markdown, and escape the
         # exception text because it can echo user-supplied column names.
         return f"<p style='text-align:center; color:red;'>❌ <b>Error:</b> Could not generate word cloud. Reason: {escape(str(e))}</p>"
 # --- Clustering Module ---
 def perform_clustering(df: pd.DataFrame, numeric_cols: list, n_clusters: int = 4):
     """
     Performs K-Means clustering and returns a scatter plot.
     Args:
         df (pd.DataFrame): The input DataFrame.
         numeric_cols (list): Numeric columns to cluster on; at least two are required.
         n_clusters (int): The number of clusters (k); coerced with ``int()``.
     Returns:
         tuple: A Plotly Figure and a Markdown string. When the inputs are unusable or
         any step raises, the figure is empty and the string explains why.
     """
     # `not numeric_cols` also covers None, which would make len() raise.
     if not numeric_cols or len(numeric_cols) < 2:
         return go.Figure(), "Clustering requires at least 2 numeric features."
     try:
         k = int(n_clusters)
         # Copy so the label column is added to our own frame, not a slice of the caller's.
         cluster_data = df[list(numeric_cols)].dropna().copy()
         if len(cluster_data) < k:
             return go.Figure(), f"Not enough data points ({len(cluster_data)}) for {k} clusters."

         # Scale data for better clustering performance
         scaled_data = StandardScaler().fit_transform(cluster_data)
         kmeans = KMeans(n_clusters=k, random_state=42, n_init='auto').fit(scaled_data)
         cluster_data['Cluster'] = kmeans.labels_.astype(str)

         # Visualize on the first two selected features (original, unscaled values).
         x_feature, y_feature = numeric_cols[0], numeric_cols[1]
         fig_cluster = px.scatter(
             cluster_data, x=x_feature, y=y_feature, color='Cluster',
             title=f"<b>K-Means Clustering Result (k={k})</b>",
             template="plotly_white", color_discrete_sequence=px.colors.qualitative.Vivid
         )
         cluster_md = f"""
         ### Clustering Summary
         - **Features Used:** {', '.join(str(col) for col in numeric_cols)}
         - **Number of Clusters (K):** {k}
         - **Insight:** The plot shows the separation of data into {k} distinct groups based on the selected features.
         """
         return fig_cluster, cluster_md
     except Exception as e:
         # UI boundary: report the failure to the user instead of propagating it.
         logging.error(f"Clustering failed: {e}", exc_info=True)
         return go.Figure(), f"❌ **Error:** Could not perform clustering. Reason: {e}"

 import plotly.express as px
 import plotly.graph_objects as go
 from sklearn.cluster import KMeans
+from sklearn.decomposition import PCA
 from sklearn.preprocessing import StandardScaler
 from statsmodels.tsa.seasonal import seasonal_decompose
 from statsmodels.tsa.stattools import adfuller
 # --- Time-Series Module ---
 def analyze_time_series(df: pd.DataFrame, date_col: str, value_col: str, period: int = 12):
     """
     Performs time-series decomposition and stationarity testing with robust error handling.
     Args:
         df (pd.DataFrame): The input DataFrame. It is copied; the caller's frame is not modified.
         date_col (str): The name of the column containing datetime information.
         value_col (str): The name of the column to analyze; values are coerced to numeric.
         period (int): Seasonal period, in observations, used for the decomposition.
             Defaults to 12. At least ``2 * period`` usable points are required.
     Returns:
         tuple: A Plotly Figure and a Markdown string with analysis. With too little data
         the figure carries the explanation as its title and the string is empty; when
         inputs are missing or a step raises, the figure is empty and the string explains why.
     """
     if not date_col or not value_col:
         return go.Figure(), "Please select both a date/time column and a value column to begin analysis."
     try:
         logging.info(f"Analyzing time-series for date='{date_col}' and value='{value_col}'")
         period = int(period)
         if period < 2:
             raise ValueError(f"period must be an integer >= 2, got {period}")

         ts_df = df.copy()
         ts_df[date_col] = pd.to_datetime(ts_df[date_col])
         # Rows without a timestamp cannot be ordered in time, so drop them before indexing.
         ts_df = ts_df.dropna(subset=[date_col]).set_index(date_col).sort_index()
         # Numeric text (e.g. from a CSV upload) is converted; unparseable cells become NaN and are dropped.
         ts_data = pd.to_numeric(ts_df[value_col], errors='coerce').dropna()

         # Require at least 2 full periods so a seasonal component can be estimated.
         min_points = 2 * period
         if len(ts_data) < min_points:
             msg = f"Not enough data points ({len(ts_data)}) for a reliable time-series decomposition (requires at least {min_points})."
             logging.warning(msg)
             return go.Figure().update_layout(title=msg), ""

         # Decomposition
         result = seasonal_decompose(ts_data, model='additive', period=period)
         components = pd.DataFrame({'Trend': result.trend, 'Seasonal': result.seasonal, 'Residual': result.resid})
         fig_decomp = px.line(
             components,
             title=f"<b>Time-Series Decomposition of '{value_col}'</b>",
             # The index is named after date_col, so key the x-axis label on that name as well as 'index'.
             labels={'value': 'Value', 'index': 'Date', date_col: 'Date'}, template="plotly_white"
         ).update_layout(legend_title_text='Components')

         # Stationarity Test (ADF)
         adf_result = adfuller(ts_data)
         conclusion = 'likely **stationary** (p < 0.05)' if adf_result[1] < 0.05 else 'likely **non-stationary** (p >= 0.05)'
         adf_md = f"""
         ### Stationarity Analysis (Augmented Dickey-Fuller Test)
         - **ADF Statistic:** `{adf_result[0]:.4f}`
         - **p-value:** `{adf_result[1]:.4f}`
         - **Conclusion:** The time-series is {conclusion}. Non-stationary series often require differencing before being used in forecasting models like ARIMA.
         """
         return fig_decomp, adf_md
     except Exception as e:
         # UI boundary: report the failure to the user instead of propagating it.
         logging.error(f"Time-series analysis failed: {e}", exc_info=True)
         return go.Figure(), f"❌ **Error:** Could not perform analysis. Please ensure the date column is a valid time format and the value column is numeric. \n`{e}`"
 # --- Text Analysis Module ---
 def generate_word_cloud(df: pd.DataFrame, text_col: str):
     """
     Generates a word cloud from a text column and returns it as an HTML object.
     Args:
         df (pd.DataFrame): The input DataFrame.
         text_col (str): The name of the column containing text data.
     Returns:
         str: An HTML string containing the word cloud image (base64-embedded PNG),
         a prompt/notice when there is nothing to draw, or an error message. The
         column name and exception text are HTML-escaped before being embedded.
     """
     # Function-scope import keeps this block self-contained.
     from html import escape

     if not text_col:
         return "<p style='text-align:center; padding: 20px;'>Select a text column to generate a word cloud.</p>"
     try:
         logging.info(f"Generating word cloud for column '{text_col}'")
         text = ' '.join(df[text_col].dropna().astype(str))
         if not text.strip():
             return "<p style='text-align:center; padding: 20px;'>No text data available in this column to generate a cloud.</p>"

         wordcloud = WordCloud(width=800, height=400, background_color='white', colormap='viridis', max_words=150).generate(text)

         # Render the cloud to PNG in memory and embed it as a base64 data URI.
         buf = io.BytesIO()
         wordcloud.to_image().save(buf, format='png')
         img_str = base64.b64encode(buf.getvalue()).decode('utf-8')

         # Column names come from user data; escape them so a quote or '<' cannot break out of the attribute.
         alt_text = escape(f"Word Cloud for {text_col}", quote=True)
         html_content = f'<div style="text-align:center;"><img src="data:image/png;base64,{img_str}" alt="{alt_text}" style="border-radius: 8px;"></div>'
         return html_content
     except Exception as e:
         logging.error(f"Word cloud generation failed: {e}", exc_info=True)
         # The result is rendered as HTML: use <b> rather than Markdown, and escape the
         # exception text because it can echo user-supplied column names.
         return f"<p style='text-align:center; color:red; padding: 20px;'>❌ <b>Error:</b> Could not generate word cloud. Reason: {escape(str(e))}</p>"
 # --- Clustering Module ---
 def perform_clustering(df: pd.DataFrame, numeric_cols: list, n_clusters: int = 4):
     """
     Performs K-Means clustering using best practices (scaling and PCA for visualization).
     Args:
         df (pd.DataFrame): The input DataFrame.
         numeric_cols (list): A list of numeric columns to use for clustering; at least two are required.
         n_clusters (int): The number of clusters (k) to create; coerced with ``int()``.
     Returns:
         tuple: A Plotly Figure and a Markdown string with analysis. When the inputs are
         unusable or any step raises, the figure is empty and the string explains why.
     """
     # `not numeric_cols` also covers None, which would make len() raise.
     if not numeric_cols or len(numeric_cols) < 2:
         return go.Figure(), "Clustering requires at least 2 numeric features. Please select a dataset with more numeric columns."
     try:
         k = int(n_clusters)
         logging.info(f"Performing K-Means clustering with k={k} on {len(numeric_cols)} features.")
         if k < 1:
             return go.Figure(), f"The number of clusters must be at least 1 (got {k})."

         # Copy so the derived columns are added to our own frame, not a slice of the caller's.
         cluster_data = df[list(numeric_cols)].dropna().copy()
         if len(cluster_data) < k:
             return go.Figure(), f"Not enough data points ({len(cluster_data)}) for {k} clusters."
         # The 2-component PCA below needs at least two samples.
         if len(cluster_data) < 2:
             return go.Figure(), f"Not enough data points ({len(cluster_data)}) for a 2D PCA visualization (requires at least 2)."

         # Step 1: Scale data - Crucial for distance-based algorithms like K-Means
         scaled_data = StandardScaler().fit_transform(cluster_data)

         # Step 2: Perform K-Means clustering
         kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(scaled_data)

         # Step 3: Use PCA to reduce dimensionality for a meaningful 2D visualization
         pca = PCA(n_components=2)
         components = pca.fit_transform(scaled_data)
         cluster_data['Cluster'] = kmeans.labels_.astype(str)
         cluster_data['PCA1'] = components[:, 0]
         cluster_data['PCA2'] = components[:, 1]

         # Step 4: Create the plot using the principal components
         fig_cluster = px.scatter(
             cluster_data, x='PCA1', y='PCA2', color='Cluster',
             title=f"<b>K-Means Clustering Visualization (k={k})</b>",
             labels={'PCA1': 'Principal Component 1', 'PCA2': 'Principal Component 2'},
             template="plotly_white", color_discrete_sequence=px.colors.qualitative.Vivid
         )

         explained_variance = pca.explained_variance_ratio_.sum() * 100
         cluster_md = f"""
         ### Clustering Summary & Methodology
         - **Features Used:** `{len(numeric_cols)}` numeric features were scaled and used for clustering.
         - **Number of Clusters (K):** `{k}`
         - **Visualization:** To visualize the high-dimensional clusters in 2D, Principal Component Analysis (PCA) was used.
         - **Explained Variance:** The two components shown explain **{explained_variance:.2f}%** of the variance in the data.
         """
         return fig_cluster, cluster_md
     except Exception as e:
         # UI boundary: report the failure to the user instead of propagating it.
         logging.error(f"Clustering failed: {e}", exc_info=True)
         return go.Figure(), f"❌ **Error:** Could not perform clustering. \n`{e}`"