File size: 4,702 Bytes
d426068
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# modules/timeseries.py

# -*- coding: utf-8 -*-
#
# PROJECT:      CognitiveEDA v5.0 - The QuantumLeap Intelligence Platform
#
# DESCRIPTION:  Specialized module for time-series analysis, including decomposition
#               and stationarity testing (Augmented Dickey-Fuller).

import logging
from typing import Tuple

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller

def analyze_time_series(
    df: pd.DataFrame,
    date_col: str,
    value_col: str,
    period: int = 12,
) -> Tuple[go.Figure, str]:
    """
    Performs and visualizes time-series decomposition and stationarity.

    The function parses ``date_col`` as datetimes, drops rows whose date could
    not be parsed or whose value is missing, sorts by date, and then:

    1. runs an additive seasonal decomposition with the given ``period`` and
       plots the observed / trend / seasonal / residual components, and
    2. runs an Augmented Dickey-Fuller test on the series and summarizes the
       result as Markdown.

    Failures are reported through the returned Markdown string rather than
    raised, so the function can be wired directly to a UI callback.

    Args:
        df: The input DataFrame.
        date_col: The name of the column containing datetime information.
        value_col: The name of the numeric column to analyze.
        period: Number of observations per seasonal cycle. Defaults to 12,
            i.e. monthly data with a yearly cycle. Must be at least 2.

    Returns:
        A tuple containing:
        - A Plotly Figure of the time-series decomposition (or the raw series
          when there are fewer than ``2 * period`` usable points, or an
          empty/titled figure on error).
        - A Markdown string summarizing the stationarity test results, or a
          warning/error message.
    """
    # 1. Input Validation
    if not date_col or not value_col:
        return go.Figure(), "Please select both a date/time column and a value column to begin analysis."

    # Report unknown columns explicitly instead of letting a KeyError fall
    # through to the generic "unexpected error" handler below.
    missing_cols = [col for col in (date_col, value_col) if col not in df.columns]
    if missing_cols:
        listed = ", ".join(f"'{col}'" for col in missing_cols)
        msg = f"Column(s) not found in the dataset: {listed}."
        return go.Figure().update_layout(title=msg), f"❌ **Error:** {msg}"

    if not pd.api.types.is_numeric_dtype(df[value_col]):
        msg = f"Value column '{value_col}' is not numeric. Please select a numeric column for analysis."
        return go.Figure().update_layout(title=msg), f"❌ **Error:** {msg}"

    if period < 2:
        msg = f"Seasonal period must be at least 2, got {period}."
        return go.Figure().update_layout(title=msg), f"❌ **Error:** {msg}"

    try:
        logging.info("Analyzing time-series for date='%s' and value='%s'", date_col, value_col)

        # 2. Data Preparation with robust error handling
        ts_df = df[[date_col, value_col]].copy()
        # Unparseable dates become NaT so they can be dropped with the NaNs.
        ts_df[date_col] = pd.to_datetime(ts_df[date_col], errors='coerce')

        # Drop rows where date conversion failed or value is missing
        ts_df = ts_df.dropna(subset=[date_col, value_col])

        if ts_df.empty:
            msg = f"No valid data remains after parsing dates in '{date_col}' and removing missing values."
            logging.warning(msg)
            return go.Figure().update_layout(title=msg), f"❌ **Error:** {msg}"

        ts_data = ts_df.set_index(date_col).sort_index()[value_col]

        # 3. Decomposition
        # Require at least 2 full seasonal cycles for decomposition.
        min_points = 2 * period
        if len(ts_data) < min_points:
            msg = f"Insufficient data for reliable seasonal decomposition. Found {len(ts_data)} points, require at least {min_points}."
            logging.warning(msg)
            # Still return the raw plot if decomposition isn't possible
            fig_raw = px.line(ts_data, title=f"<b>Raw Time-Series of '{value_col}'</b>")
            return fig_raw, f"⚠️ **Warning:** {msg}"

        result = seasonal_decompose(ts_data, model='additive', period=period)

        # reset_index() turns the datetime index back into a `date_col` column
        # so it can be used as the x-axis below.
        decomposition_data = pd.DataFrame({
            'Trend': result.trend,
            'Seasonal': result.seasonal,
            'Residual': result.resid,
            'Observed': result.observed,
        }).reset_index()

        fig_decomp = px.line(
            decomposition_data, x=date_col, y=['Observed', 'Trend', 'Seasonal', 'Residual'],
            title=f"<b>Time-Series Decomposition of '{value_col}'</b>",
            labels={'value': 'Value', date_col: 'Date'},
            facet_row='variable',
        ).update_yaxes(matches=None)  # Allow y-axes to have independent scales
        fig_decomp.update_layout(showlegend=False)

        # 4. Stationarity Test (Augmented Dickey-Fuller)
        adf_result = adfuller(ts_data.dropna())
        adf_stat, p_value = adf_result[0], adf_result[1]
        conclusion = 'likely **stationary** (p < 0.05)' if p_value < 0.05 else 'likely **non-stationary** (p >= 0.05)'

        # Build the Markdown line by line so no line carries source-code
        # indentation; lines indented by 4+ spaces would otherwise render as
        # a code block instead of a heading and bullet list.
        adf_md = "\n".join([
            "",
            "### Stationarity Analysis (Augmented Dickey-Fuller Test)",
            f"- **ADF Statistic:** `{adf_stat:.4f}`",
            f"- **p-value:** `{p_value:.4f}`",
            f"- **Conclusion:** The time-series is {conclusion}. Non-stationary series often require differencing before being used in forecasting models like ARIMA.",
            "",
        ])
        return fig_decomp, adf_md

    except Exception as e:
        # UI boundary: log the full traceback and surface a readable message
        # instead of propagating the exception to the caller.
        logging.exception("Time-series analysis failed: %s", e)
        error_msg = f"An unexpected error occurred during analysis. Please check column formats. Details: {e}"
        return go.Figure(), f"❌ **Error:** {error_msg}"