Upload 23 files
- .github/workflows/automated_ci.yaml +53 -0
- .github/workflows/python-publish.yaml +71 -0
- docs/analysis.md +108 -0
- docs/data_loader.md +95 -0
- docs/lag_correlation.md +84 -0
- docs/report_generator.md +0 -0
- docs/stationarity.md +82 -0
- docs/volatility_check.md +88 -0
- dynamicts/__init__.py +0 -0
- dynamicts/analysis.py +217 -0
- dynamicts/analysis_helpers.py +0 -0
- dynamicts/data_loader.py +136 -0
- dynamicts/dynamic_analysis.py +47 -0
- dynamicts/lag_correlation.py +168 -0
- dynamicts/report_generator.py +68 -0
- dynamicts/report_generator_sh.py +152 -0
- dynamicts/stationarity.py +281 -0
- dynamicts/volatility_check.py +195 -0
- tests/__init__.py +0 -0
- tests/integration/__init__.py +0 -0
- tests/integration/test_int.py +3 -0
- tests/unit/__init__.py +0 -0
- tests/unit/test_unit.py +117 -0
.github/workflows/automated_ci.yaml
ADDED
@@ -0,0 +1,53 @@
```yaml
name: Continuous Integration

on:
  push:
    branches: [ "main" ]
    paths-ignore:
      - 'README.md'
      - 'docs/**'
  pull_request:
    branches: [ "main" ]
    paths-ignore:
      - 'README.md'
      - 'docs/**'

permissions:
  contents: read

jobs:
  build:
    name: Test on ${{ matrix.os }} with Python ${{ matrix.python-version }}
    runs-on: ${{ matrix.os }}

    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest, windows-latest]
        python-version: ["3.8", "3.9", "3.10"]

    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v3
        with:
          python-version: ${{ matrix.python-version }}

      - name: Cache pip dependencies
        uses: actions/cache@v3
        with:
          path: ~/.cache/pip
          key: ${{ runner.os }}-pip-${{ hashFiles('requirements*.txt') }}
          restore-keys: |
            ${{ runner.os }}-pip-

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install flake8 pytest tox tox-gh-actions
          pip install -r requirements.txt

      - name: Test with tox
        run: tox
```
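For the matrix to select the right tox environment per interpreter, `tox-gh-actions` expects a `[gh-actions]` mapping in `tox.ini`. The commit does not show `tox.ini`, so the env names below are an illustrative assumption, not taken from the repository:

```ini
# Hypothetical tox.ini mapping (not part of this commit); tox-gh-actions
# picks the env matching the Python version installed by setup-python.
[tox]
envlist = py38, py39, py310

[gh-actions]
python =
    3.8: py38
    3.9: py39
    3.10: py310
```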
.github/workflows/python-publish.yaml
ADDED
@@ -0,0 +1,71 @@
```yaml
name: Python Package

on:
  release:
    types: [published]

permissions:
  contents: read
  id-token: write

jobs:
  test:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.8", "3.9", "3.10"]
    steps:
      - uses: actions/checkout@v4
      - name: Debug Python version
        run: 'echo "Using Python version: ${{ matrix.python-version }}"'
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install flake8 pytest
          pip install -r requirements.txt
          pip install -e .
      - name: Lint with flake8
        run: |
          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
      - name: Run tests
        run: pytest -v

  build:
    runs-on: ubuntu-latest
    needs: test
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python 3.10
        uses: actions/setup-python@v5
        with:
          python-version: "3.10"
      - name: Build distributions
        run: |
          python -m pip install --upgrade pip build
          python -m build
      - name: Upload distributions
        uses: actions/upload-artifact@v4
        with:
          name: release-dists
          path: dist/

  publish:
    runs-on: ubuntu-latest
    needs: build
    environment:
      name: pypi
    steps:
      - name: Download release artifacts
        uses: actions/download-artifact@v4
        with:
          name: release-dists
          path: dist/
      - name: Publish to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          packages-dir: dist/
```
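Note that the `publish` job relies on PyPI Trusted Publishing: `id-token: write` lets `pypa/gh-action-pypi-publish` authenticate via OIDC, so no API-token secret is needed, provided a trusted publisher is configured for the `pypi` environment on PyPI.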
docs/analysis.md
ADDED
@@ -0,0 +1,108 @@
# UnivariateAnalysis

> **Note:** The following examples assume a time series DataFrame similar to `complaints.csv`, with columns: `date` and `complaints`.

The `UnivariateAnalysis` class provides a suite of methods for exploratory and statistical analysis of univariate time series data. It helps you understand the distribution, missing values, and outliers in your time series before further modeling or forecasting.

## Features

- Visualizes the distribution and boxplot of the target time series.
- Computes skewness and kurtosis with interpretation.
- Checks for missing values and provides recommendations.
- Detects outliers using IQR and Z-score methods.
- Logs plots and messages to HTML reports.

## Class: `UnivariateAnalysis`

### Initialization

```python
UnivariateAnalysis(df: pd.DataFrame, target_col: str, index_col: str = "date", output_filepath: str = "output_filepath")
```

- **df**: The time series DataFrame (indexed by the time column).
- **target_col**: The column name of the univariate time series to analyze.
- **index_col**: The name of the time index column (default: "date").
- **output_filepath**: Path prefix for saving HTML reports and plots.

> **Note:** Your DataFrame should have a time-based index (e.g., "date", "timestamp").

### Methods

#### `plot_distribution()`

Plots the histogram and boxplot of the target time series column and logs the plot to the HTML report.

**Standalone Example:**
```python
from dynamicts.analysis import UnivariateAnalysis

analysis = UnivariateAnalysis(df, target_col="complaints", index_col="date", output_filepath="report")
fig = analysis.plot_distribution()
fig.show()
```

#### `check_distribution_stats()`

Computes skewness and kurtosis for the target column, interprets the results, and logs the summary to the HTML report.

**Standalone Example:**
```python
from dynamicts.analysis import UnivariateAnalysis

analysis = UnivariateAnalysis(df, target_col="complaints", index_col="date", output_filepath="report")
stats = analysis.check_distribution_stats()
print(stats["full_message"])
```

#### `check_missing_values()`

Checks for missing values in the target column, reports the count and percentage, and logs recommendations to the HTML report.

**Standalone Example:**
```python
from dynamicts.analysis import UnivariateAnalysis

analysis = UnivariateAnalysis(df, target_col="complaints", index_col="date", output_filepath="report")
missing = analysis.check_missing_values()
print(missing["message"])
```

#### `detect_outliers(method="both", plot=True)`

Detects outliers in the target column using IQR, Z-score, or both. Optionally plots and logs the results.

- **method**: "iqr", "zscore", or "both" (default: "both").
- **plot**: Whether to plot the outliers (default: True).

**Standalone Example:**
```python
from dynamicts.analysis import UnivariateAnalysis

analysis = UnivariateAnalysis(df, target_col="complaints", index_col="date", output_filepath="report")
outliers = analysis.detect_outliers(method="both", plot=True)
print(f"Outliers detected: {outliers['outliers_detected']}")
```

#### `run_univariate_analysis(df, output_filepath, target_col, index_col="date")` (static method)

Runs the full univariate analysis pipeline: distribution plot, stats, missing values, and outlier detection. Displays results in a notebook environment.

**Standalone Example:**
```python
from dynamicts.analysis import UnivariateAnalysis

results = UnivariateAnalysis.run_univariate_analysis(
    df=df,
    output_filepath="report",
    target_col="complaints",
    index_col="date"
)
```

### Notes

- All plots and messages are logged to HTML reports using the provided `output_filepath`.
- The DataFrame should be indexed by the time column for proper time series analysis.
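### End-to-end example

A minimal sketch combining `DataLoader` (see `docs/data_loader.md`) with the methods above, assuming `data/complaints.csv` exists locally:

```python
from dynamicts.data_loader import DataLoader
from dynamicts.analysis import UnivariateAnalysis

# Load and validate the series first; assumes data/complaints.csv exists locally.
loader = DataLoader(filepath="data/complaints.csv", index_col="date")
df = loader.run_pipeline()

if df is not None:
    analysis = UnivariateAnalysis(df, target_col="complaints", index_col="date", output_filepath="report")
    analysis.plot_distribution()
    analysis.check_distribution_stats()
    analysis.check_missing_values()
    analysis.detect_outliers(method="both", plot=True)
```

---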
docs/data_loader.md
ADDED
@@ -0,0 +1,95 @@
# DataLoader

> **Note:** We are using `complaints.csv` as an example here, with the following columns: `date`, `complaints`. The `date` column is used as the time index and `complaints` as the target variable.

The `DataLoader` class provides a unified interface for loading and managing time series data from CSV files in your Python projects. It is designed as the first step in your time series analysis workflow, ensuring your time series data is loaded, validated, and ready for further analysis.

## Features

- Loads time series data from CSV files into pandas DataFrames.
- Standardizes column names to lowercase.
- Checks if the time series index is regular (uniform intervals).
- Saves metadata (columns, dtypes, shape, index name) to a JSON file.

## Class: `DataLoader`

### Initialization

```python
DataLoader(filepath: str, index_col: Optional[Union[str, int]] = None, parse_dates: Union[bool, list] = True)
```

- **filepath**: Path to the CSV file containing your time series data.
- **index_col**: Name or position of the column to use as the time index (e.g., a timestamp or date column).
- **parse_dates**: Whether to parse dates in the index column.

> **Note:** Your CSV must contain a column representing the time axis (e.g., "date"). Set `index_col` to this column's name.

### Methods

#### `load() -> pd.DataFrame`

Loads the time series data from the specified CSV file, standardizes column names, and sets the index name to lowercase.

**Standalone Example:**
```python
from dynamicts.data_loader import DataLoader

loader = DataLoader(filepath="data/complaints.csv", index_col="date")
df = loader.load()
print(df.head())
```

#### `is_regular() -> bool`

Checks if the time series index is regular (i.e., intervals between timestamps are uniform). Returns `True` if regular, `False` otherwise.

**Standalone Example:**
```python
from dynamicts.data_loader import DataLoader

loader = DataLoader(filepath="data/complaints.csv", index_col="date")
loader.load()  # Must load data first
is_reg = loader.is_regular()
print("Is regular:", is_reg)
```

#### `save_metadata() -> None`

Saves metadata (columns, dtypes, shape, index name) of the loaded DataFrame to a JSON file in the `metadata/` directory.

**Standalone Example:**
```python
from dynamicts.data_loader import DataLoader

loader = DataLoader(filepath="data/complaints.csv", index_col="date")
loader.load()  # Must load data first
loader.save_metadata()
print("Metadata saved.")
```

#### `run_pipeline() -> Optional[pd.DataFrame]`

Runs the time series data loading pipeline:
- Loads the data.
- Checks for regularity.
- Saves metadata if data is regular.
- Returns the loaded DataFrame.

**Standalone Example:**
```python
from dynamicts.data_loader import DataLoader

loader = DataLoader(filepath="data/complaints.csv", index_col="date")
data = loader.run_pipeline()
if data is not None:
    print("Time series data loaded successfully!")
```

### Notes

- The loader logs all actions and errors to a log file in the `logs/` directory.
- If the time index is not regular, a warning is logged and the data is still returned for inspection.
- Metadata is saved only if the time series data is regular.
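### Example metadata file

For reference, the saved file (e.g., `metadata/complaints_meta.json`) contains the keys listed above; the values below are illustrative, not taken from a real run:

```json
{
    "columns": ["complaints"],
    "dtypes": {"complaints": "int64"},
    "shape": [365, 1],
    "index_name": "date"
}
```

---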
docs/lag_correlation.md
ADDED
@@ -0,0 +1,84 @@
# Correlation

> **Note:** We are using `complaints.csv` as an example here, with the following columns: `date`, `complaints`. The `date` column is used as the time index and `complaints` as the target variable.

The `Correlation` class provides methods for analyzing and visualizing autocorrelation (ACF) and partial autocorrelation (PACF) in univariate time series data. These tools are essential for understanding lag relationships and dependencies in time series analysis.

## Features

- Plots autocorrelation (ACF) for a given time series and number of lags.
- Plots partial autocorrelation (PACF) for a given time series and number of lags.
- Supports both instance-based and standalone usage.
- Optionally logs plots to HTML reports.

## Class: `Correlation`

### Initialization

```python
Correlation(df: pd.DataFrame = None, target_col: str = None, lags: int = 20, output_filepath: str = None)
```

- **df**: The time series DataFrame (indexed by the time column).
- **target_col**: The column name of the univariate time series to analyze.
- **lags**: Number of lags to use for correlation plots (default: 20).
- **output_filepath**: Path prefix for saving HTML reports and plots.

> **Note:** Your DataFrame should have a time-based index (e.g., "date", "timestamp").

### Methods

#### `acf_plot(data: pd.Series = None, lags: int = None, save: bool = True, output_filepath: str = None)`

Plots the autocorrelation function (ACF) for the specified time series and number of lags.

- **data**: Optional. A pandas Series to plot. If not provided, uses the instance's DataFrame and target column.
- **lags**: Optional. Number of lags to plot. Defaults to the instance's `lags`.
- **save**: Optional. Whether to save the plot to an HTML report.
- **output_filepath**: Optional. Path for saving the report.

**Standalone Example:**
```python
from dynamicts.lag_correlation import Correlation

# Instance-based usage
corr = Correlation(df, target_col="complaints", lags=30, output_filepath="report")
fig = corr.acf_plot()
fig.show()

# Standalone usage
corr = Correlation()
fig = corr.acf_plot(data=df["complaints"], lags=30, output_filepath="report")
fig.show()
```

#### `pacf_plot(data: pd.Series = None, lags: int = None, save: bool = True, output_filepath: str = None)`

Plots the partial autocorrelation function (PACF) for the specified time series and number of lags.

- **data**: Optional. A pandas Series to plot. If not provided, uses the instance's DataFrame and target column.
- **lags**: Optional. Number of lags to plot. Defaults to the instance's `lags`.
- **save**: Optional. Whether to save the plot to an HTML report.
- **output_filepath**: Optional. Path for saving the report.

**Standalone Example:**
```python
from dynamicts.lag_correlation import Correlation

# Instance-based usage
corr = Correlation(df, target_col="complaints", lags=30, output_filepath="report")
fig = corr.pacf_plot()
fig.show()

# Standalone usage
corr = Correlation()
fig = corr.pacf_plot(data=df["complaints"], lags=30, output_filepath="report")
fig.show()
```

### Notes

- The DataFrame should be indexed by the time column for proper time series analysis.
- Plots can be logged to HTML reports if `save=True` and `output_filepath` is provided.
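The class also exposes `run_lag_correlation` (see `dynamicts/lag_correlation.py`), which generates and logs both plots in one call and returns the figures. A minimal sketch:

```python
from dynamicts.lag_correlation import Correlation

corr = Correlation(df, target_col="complaints", lags=30, output_filepath="report")
figs = corr.run_lag_correlation()  # returns {"acf_fig": ..., "pacf_fig": ...}
figs["acf_fig"].show()
figs["pacf_fig"].show()
```

---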
docs/report_generator.md
ADDED
File without changes
docs/stationarity.md
ADDED
@@ -0,0 +1,82 @@
# Stationarity

> **Note:** We are using `complaints.csv` as an example here, with the following columns: `date`, `complaints`. The `date` column is used as the time index and `complaints` as the target variable.

The `Stationarity` class provides methods to test and visualize the stationarity of **univariate time series data**. Stationarity is a key assumption in many time series models, and these tools help you assess and transform your data accordingly.

## Features

- Performs the Augmented Dickey-Fuller (ADF) test for stationarity.
- Plots rolling mean and standard deviation for visual inspection.
- Supports both instance-based and standalone usage.
- Optionally logs results and plots to HTML reports.

## Class: `Stationarity`

### Initialization

```python
Stationarity(df: pd.DataFrame = None, target_col: str = None, window: int = 12, output_filepath: str = None)
```

- **df**: The time series DataFrame (indexed by the time column).
- **target_col**: The column name of the univariate time series to analyze.
- **window**: Window size for rolling statistics (default: 12).
- **output_filepath**: Path prefix for saving HTML reports and plots.

> **Note:** Your DataFrame should have a time-based index (e.g., "date", "timestamp").

### Methods

#### `adf_test(data: pd.Series = None, verbose: bool = True, save: bool = True, output_filepath: str = None)`

Performs the Augmented Dickey-Fuller test for stationarity on the specified time series.

- **data**: Optional. A pandas Series to test. If not provided, uses the instance's DataFrame and target column.
- **verbose**: Whether to print and log the test summary (default: True).
- **save**: Whether to save the results to an HTML report (default: True).
- **output_filepath**: Optional. Path for saving the report.

**Standalone Example:**
```python
from dynamicts.stationarity import Stationarity

# Instance-based usage
stat = Stationarity(df, target_col="complaints", window=12, output_filepath="report")
adf_result = stat.adf_test()

# Standalone usage
stat = Stationarity()
adf_result = stat.adf_test(data=df["complaints"], verbose=True, output_filepath="report")
```

#### `plot_rolling_stats(data: pd.Series = None, window: int = None, save: bool = True, output_filepath: str = None)`

Plots the rolling mean and standard deviation for the specified time series and window size.

- **data**: Optional. A pandas Series to plot. If not provided, uses the instance's DataFrame and target column.
- **window**: Optional. Window size for rolling statistics. Defaults to the instance's `window`.
- **save**: Whether to save the plot to an HTML report (default: True).
- **output_filepath**: Optional. Path for saving the report.

**Standalone Example:**
```python
from dynamicts.stationarity import Stationarity

# Instance-based usage
stat = Stationarity(df, target_col="complaints", window=12, output_filepath="report")
fig = stat.plot_rolling_stats()
fig.show()

# Standalone usage
stat = Stationarity()
fig = stat.plot_rolling_stats(data=df["complaints"], window=12, output_filepath="report")
fig.show()
```

### Notes

- The DataFrame should be indexed by the time column for proper time series analysis.
- Results and plots can be logged to HTML reports if `save=True` and `output_filepath` is provided.
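For interpretation: the ADF test's null hypothesis is that the series has a unit root (is non-stationary). A standalone sketch of the same test using `statsmodels` directly (the library ADF tests are typically built on; `stationarity.py`'s internals are not reproduced here):

```python
from statsmodels.tsa.stattools import adfuller

# Null hypothesis: the series has a unit root (non-stationary).
# A p-value below a chosen threshold (commonly 0.05) rejects the null,
# i.e., gives evidence that the series is stationary.
adf_stat, p_value, *_ = adfuller(df["complaints"].dropna())
print(f"ADF statistic: {adf_stat:.4f}, p-value: {p_value:.4f}")
print("Likely stationary" if p_value < 0.05 else "Likely non-stationary")
```

---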
docs/volatility_check.md
ADDED
@@ -0,0 +1,88 @@
# VolatilityChecker

> **Note:** We are using `complaints.csv` as an example here, with the following columns: `date`, `complaints`. The `date` column is used as the time index and `complaints` as the target variable.

The `VolatilityChecker` class provides methods for analyzing and visualizing volatility in univariate time series data using ARCH and GARCH models. These tools help you assess the presence and dynamics of volatility clustering in your time series.

## Features

- Computes and plots volatility using ARCH(1) and GARCH(1,1) models.
- Supports both instance-based and standalone usage.
- Optionally logs plots and summary statistics to HTML reports.

## Class: `VolatilityChecker`

### Initialization

```python
VolatilityChecker(df: pd.DataFrame = None, target_col: str = None, output_filepath: str = "Output")
```

- **df**: The time series DataFrame (indexed by the time column).
- **target_col**: The column name of the univariate time series to analyze.
- **output_filepath**: Path prefix for saving HTML reports and plots.

> **Note:** Your DataFrame should have a time-based index (e.g., "date", "timestamp").

### Methods

#### `arch_volatility(data: pd.Series = None, save: bool = True, output_filepath: str = None)`

Computes and plots volatility using an ARCH(1) model.

- **data**: Optional. A pandas Series to analyze. If not provided, uses the instance's DataFrame and target column.
- **save**: Whether to save the plot and summary to an HTML report (default: True).
- **output_filepath**: Optional. Path for saving the report.

**Standalone Example:**
```python
from dynamicts.data_loader import DataLoader
from dynamicts.volatility_check import VolatilityChecker

loader = DataLoader(filepath="data/complaints.csv", index_col="date")
df = loader.run_pipeline()

# Instance-based usage
vc = VolatilityChecker(df, target_col="complaints", output_filepath="report")
fig = vc.arch_volatility()
fig.show()

# Standalone usage
vc2 = VolatilityChecker()
fig2 = vc2.arch_volatility(data=df["complaints"], output_filepath="report")
fig2.show()
```

#### `garch_volatility(data: pd.Series = None, save: bool = True, output_filepath: str = None)`

Computes and plots volatility using a GARCH(1,1) model.

- **data**: Optional. A pandas Series to analyze. If not provided, uses the instance's DataFrame and target column.
- **save**: Whether to save the plot and summary to an HTML report (default: True).
- **output_filepath**: Optional. Path for saving the report.

**Standalone Example:**
```python
from dynamicts.data_loader import DataLoader
from dynamicts.volatility_check import VolatilityChecker

loader = DataLoader(filepath="data/complaints.csv", index_col="date")
df = loader.run_pipeline()

# Instance-based usage
vc = VolatilityChecker(df, target_col="complaints", output_filepath="report")
fig = vc.garch_volatility()
fig.show()

# Standalone usage
vc2 = VolatilityChecker()
fig2 = vc2.garch_volatility(data=df["complaints"], output_filepath="report")
fig2.show()
```

### Notes

- The DataFrame should be indexed by the time column for proper time series analysis.
- Plots and summary statistics can be logged to HTML reports if `save=True` and `output_filepath` is provided.
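For orientation, ARCH/GARCH fitting of this kind is commonly done with the `arch` package; a standalone GARCH(1,1) sketch is below. That `volatility_check.py` uses `arch`, and that fitting the raw series (rather than returns) is intended, are assumptions — the module's internals are not reproduced here:

```python
from arch import arch_model

# Fit a GARCH(1,1) model on the series; rescale=False keeps original units.
series = df["complaints"].dropna()
model = arch_model(series, vol="GARCH", p=1, q=1, rescale=False)
result = model.fit(disp="off")
print(result.summary())
volatility = result.conditional_volatility  # per-observation conditional volatility
```

---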
dynamicts/__init__.py
ADDED
File without changes
dynamicts/analysis.py
ADDED
@@ -0,0 +1,217 @@
```python
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import skew, kurtosis
from IPython.display import display, Markdown
from datetime import datetime

from dynamicts.report_generator import log_message_to_html_report, log_plot_to_html_report

class UnivariateAnalysis:
    def __init__(self, df: pd.DataFrame, target_col: str, index_col: str = "date"):
        self.df = df
        self.target_col = target_col

        column_map = {col.lower(): col for col in self.df.columns}
        target_col_lower = self.target_col.lower()

        if target_col_lower not in column_map:
            raise ValueError(f"Target column '{self.target_col}' not found in dataset columns: {self.df.columns.tolist()}")

        self.target_col = column_map[target_col_lower]
        self.date_col = self.df.index.name

        # Generate report path ONCE per instance
        root_dir = os.path.abspath(os.curdir)
        report_root = os.path.join(root_dir, "reports")
        os.makedirs(report_root, exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        report_name = f"analysis_report_{timestamp}.html"
        self.report_path = os.path.join(report_root, report_name)

    def plot_distribution(self) -> plt.Figure:
        y = self.df[self.target_col].dropna()
        fig, axes = plt.subplots(1, 2, figsize=(14, 5))

        sns.histplot(y, kde=True, ax=axes[0], bins=30, color='cornflowerblue')
        axes[0].set_title(f"Distribution of {self.target_col}")
        axes[0].set_xlabel(self.target_col)
        axes[0].set_ylabel("Frequency")

        sns.boxplot(x=y, ax=axes[1], color="lightcoral")
        axes[1].set_title(f'Boxplot of {self.target_col}')
        axes[1].set_xlabel(self.target_col)

        plt.tight_layout()
        log_plot_to_html_report(fig, title=f"Distribution of {self.target_col}", report_path=self.report_path)
        plt.close(fig)
        return fig

    def check_distribution_stats(self):
        y = self.df[self.target_col].dropna()
        skewness_val = skew(y)
        kurtosis_val = kurtosis(y)

        if abs(skewness_val) < 0.5:
            skew_msg = "approximately_symmetric"
        elif skewness_val > 0:
            skew_msg = "right_skewed"
        else:
            skew_msg = "left_skewed"

        if kurtosis_val < 0:
            kurt_msg = "light_tailed (platykurtic)"
        elif kurtosis_val > 0:
            kurt_msg = "heavy_tailed (leptokurtic)"
        else:
            kurt_msg = "normal_tailed (mesokurtic)"

        full_msg = (
            f"Skewness of '{self.target_col}': {skewness_val:.4f}\n"
            f"Kurtosis of '{self.target_col}': {kurtosis_val:.4f}\n"
            f"→ Distribution is {skew_msg} and {kurt_msg}."
        )

        log_message_to_html_report(message=full_msg, title=f"Distribution Stats: {self.target_col}", report_path=self.report_path)

        return {
            "skewness": skewness_val,
            "kurtosis": kurtosis_val,
            "skewness_interpretation": skew_msg,
            "kurtosis_interpretation": kurt_msg,
            "full_message": full_msg
        }

    def check_missing_values(self):
        series = self.df[self.target_col]
        total_points = len(series)
        missing_count = series.isna().sum()
        missing_percentage = (missing_count / total_points) * 100

        msg = f"""
Total Observations: {total_points}
Missing values in '{self.target_col}': {missing_count} ({missing_percentage:.2f}%)
"""

        if missing_count > 0:
            msg += "<b>Recommendation:</b> Consider forward/backward fill or interpolation if your model does not support missing values."

        log_message_to_html_report(message=msg, title=f"Missing Value Analysis for '{self.target_col}'", report_path=self.report_path)
        return {
            "total_observations": total_points,
            "missing_count": missing_count,
            "missing_percentage": missing_percentage,
            "message": msg.strip()
        }

    def detect_outliers(self, method="both", plot=True):
        y = self.df[self.target_col].dropna()

        Q1 = y.quantile(0.25)
        Q3 = y.quantile(0.75)
        IQR = Q3 - Q1
        iqr_outliers = y[(y < Q1 - 1.5 * IQR) | (y > Q3 + 1.5 * IQR)]

        z_scores = np.abs(stats.zscore(y))
        z_outliers = y[z_scores > 3]

        if method == "iqr":
            combined_outliers = iqr_outliers
            method_label = "IQR"
        elif method == "zscore":
            combined_outliers = z_outliers
            method_label = "Z-Score"
        else:
            combined_outliers = y[(y.index.isin(iqr_outliers.index)) | (y.index.isin(z_outliers.index))]
            method_label = "IQR + Z-Score"

        outlier_count = len(combined_outliers)
        total = len(y)
        percentage = (outlier_count / total) * 100

        msg = f"""
Outlier Detection using: {method_label}
Total Observations: {total}
Outliers Detected: {outlier_count} ({percentage:.2f}%)

<b>Recommendation:</b> Investigate these points manually before deciding to remove or treat them.
"""
        log_message_to_html_report(message=msg, title=f"Outlier Detection ({method_label})", report_path=self.report_path)

        fig = None
        if plot:
            fig, ax = plt.subplots(figsize=(12, 5))
            sns.lineplot(x=y.index, y=y, label="Original Data", ax=ax)
            sns.scatterplot(x=combined_outliers.index, y=combined_outliers, color='red', s=40, label="Outliers", ax=ax)
            ax.set_title(f"Outliers Detected using {method_label}")
            ax.set_ylabel(self.target_col)
            ax.set_xlabel("Date")
            plt.xticks(rotation=45)
            plt.tight_layout()
            log_plot_to_html_report(fig=fig, title=f"{method_label} Outlier Detection for {self.target_col}", report_path=self.report_path)
            plt.close(fig)

        return {
            "method": method_label,
            "total_observations": total,
            "outliers_detected": outlier_count,
            "percentage_outliers": percentage,
            "outlier_indices": combined_outliers.index.tolist(),
            "outlier_values": combined_outliers.tolist(),
            "fig": fig
        }

    def run_univariate_analysis(self, df: pd.DataFrame = None, target_col: str = None, index_col: str = None):
        """
        Run univariate analysis using instance attributes by default, or override with provided arguments.
        """
        df = df if df is not None else self.df
        target_col = target_col if target_col is not None else self.target_col
        index_col = index_col if index_col is not None else self.date_col

        if df is None or target_col is None:
            raise ValueError("DataFrame and target_col must be provided either as arguments or instance attributes.")

        try:
            print(f"\nRunning Univariate Time Series Analysis on '{target_col}'")
            analysis = UnivariateAnalysis(df=df, target_col=target_col, index_col=index_col)
            results = {}

            fig_dist = analysis.plot_distribution()
            display(fig_dist)
            results["distribution_plot"] = fig_dist

            dist_stats = analysis.check_distribution_stats()
            display(Markdown(f"### Distribution Stats\n{dist_stats['full_message']}"))
            results["distribution_stats"] = dist_stats

            missing = analysis.check_missing_values()
            display(Markdown(f"### Missing Value Info\n{missing['message']}"))
            results["missing_values"] = missing

            outliers = analysis.detect_outliers(method="both", plot=True)
            display(Markdown(f"### Outliers Detected: {outliers['outliers_detected']} ({outliers['percentage_outliers']:.2f}%)"))
            display(outliers["fig"])
            results["outliers"] = outliers

            display(Markdown("Univariate Time Series Analysis Completed."))
            return results

        except Exception as e:
            print(f"Error in univariate analysis: {e}")
            return None

if __name__ == "__main__":
    from dynamicts.data_loader import DataLoader

    # Load via DataLoader so the date column is parsed and set as the index.
    loader = DataLoader(filepath="data/complaints.csv", index_col="date")
    df = loader.run_pipeline()
    analysis = UnivariateAnalysis(
        df=df,
        target_col="complaints",
        index_col="date"
    )
    results = analysis.run_univariate_analysis()
```
dynamicts/analysis_helpers.py
ADDED
File without changes
dynamicts/data_loader.py
ADDED
@@ -0,0 +1,136 @@
```python
"""
data_loader.py
Module to load time series data and provide a shared interface for other modules.
"""
import logging
from typing import Optional, Union
import pandas as pd
import json
import os


class DataLoader:
    """
    DataLoader is a class for loading and managing time series data from a CSV file.

    Attributes:
        filepath (str): Path to the CSV file containing the data.
        index_col (str or int, optional): Column to use as the row labels of the DataFrame.
        parse_dates (bool or list, optional): Whether to parse dates in the index column.
        data (pd.DataFrame or None): Loaded data after calling `load()`.

    Methods:
        load():
            Loads the data from the specified CSV file and standardizes column names to lowercase.
            Returns the loaded DataFrame.
        is_regular():
            Checks if the time series index is regular (i.e., intervals between timestamps are uniform).
            Returns True if regular, False otherwise.
        save_metadata():
            Saves metadata (columns, dtypes, shape, index name) of the loaded DataFrame to a JSON file
            with the same name as the CSV file, suffixed with '_meta.json'.
        run_pipeline():
            Runs the data loading pipeline: loads data, checks regularity, and saves metadata if the
            data is regular. Returns the DataFrame either way so it can be inspected.
    """

    def __init__(self, filepath: str, index_col: Optional[Union[str, int]] = None, parse_dates: Union[bool, list] = True):
        self.filepath = filepath
        self.index_col = index_col
        self.parse_dates = parse_dates
        self.data = None

        # Base name of the data file, used for derived file names
        self.base_name = os.path.splitext(os.path.basename(self.filepath))[0]

        base_dir = os.getcwd()
        log_dir = os.path.join(base_dir, "logs")
        os.makedirs(log_dir, exist_ok=True)
        log_filename = os.path.splitext(os.path.basename(__file__))[0] + ".log"
        log_path = os.path.join(log_dir, log_filename)

        # Set up logging
        if not logging.getLogger().hasHandlers():
            logging.basicConfig(
                level=logging.INFO,
                format='%(asctime)s - %(levelname)s - %(message)s',
                handlers=[
                    logging.FileHandler(log_path),
                    logging.StreamHandler()
                ]
            )

    def load(self) -> pd.DataFrame:
        """Load the data from the specified CSV file."""
        try:
            self.data = pd.read_csv(self.filepath, index_col=self.index_col, parse_dates=self.parse_dates)
            self.data.columns = self.data.columns.str.lower()
            self.data.index.name = self.data.index.name.lower() if self.data.index.name else None
            return self.data
        except Exception as e:
            logging.error(f"Error loading data from {self.filepath}: {e}")
            raise ValueError(f"Failed to load data from {self.filepath}. Please check the file format and path.") from e

    def is_regular(self) -> bool:
        """Check if the time series data is regular."""
        if self.data is None:
            logging.warning("No data loaded. Call load() before is_regular().")
            return False

        if self.data.index.isnull().sum() > 0:
            logging.warning("Data contains null values in the index; cannot proceed with this data further.")
            return False

        # Ensure index is a DatetimeIndex
        if not isinstance(self.data.index, pd.DatetimeIndex):
            logging.warning("Index is not a DatetimeIndex. Cannot check regularity.")
            return False

        # Calculate differences between consecutive timestamps
        diffs = self.data.index.to_series().diff().dropna()
        if diffs.nunique() == 1:
            logging.info(f"Data is regular. Index differences are uniform: {diffs.iloc[0]}")
            return True
        else:
            logging.warning("Data is not regular. Index differences are not uniform.")
            logging.warning(f"Unique differences found: {diffs.unique()}")
            return False

    def save_metadata(self) -> None:
        """Save metadata of the DataFrame to a JSON file."""
        base_dir = os.getcwd()
        metadata_dir = os.path.join(base_dir, "metadata")
        os.makedirs(metadata_dir, exist_ok=True)
        meta_filename = self.base_name + "_meta.json"
        meta_path = os.path.join(metadata_dir, meta_filename)

        metadata = {
            "columns": list(self.data.columns),
            "dtypes": {col: str(dtype) for col, dtype in self.data.dtypes.items()},
            "shape": self.data.shape,
            "index_name": self.data.index.name,
        }
        try:
            with open(meta_path, "w") as f:
                json.dump(metadata, f, indent=4)
        except Exception as e:
            logging.error(f"Error saving metadata to {meta_path}: {e}")
            raise ValueError(f"Failed to save metadata to {meta_path}.") from e

    def run_pipeline(self) -> Optional[pd.DataFrame]:
        """Run the data loading pipeline."""
        logging.info("Loading data...")
        self.load()
        if not self.is_regular():
            logging.warning("Pipeline completed. Data is loaded but may not be regular.")
            return self.data  # Return the data anyway for inspection.
        logging.info("Data is regular. Saving metadata.")
        self.save_metadata()
        return self.data


# Usage
if __name__ == "__main__":
    loader = DataLoader(filepath="sample_data/date_count.csv", index_col="Date")
    result = loader.run_pipeline()
    if result is not None:
        logging.info("Data loaded successfully.")
```
dynamicts/dynamic_analysis.py
ADDED
@@ -0,0 +1,47 @@
```python
from dynamicts.analysis import UnivariateAnalysis
from dynamicts.data_loader import DataLoader
from dynamicts.lag_correlation import Correlation


class DynamicTSA:
    def __init__(self, filepath, target_col, index_col=None, parse_dates=True, lags=20):
        self.filepath = filepath
        self.target_col = target_col
        self.index_col = index_col
        self.parse_dates = parse_dates
        self.lags = lags
        self.data = None

    def run(self):
        # Loading data
        loader = DataLoader(filepath=self.filepath, index_col=self.index_col, parse_dates=self.parse_dates)
        df = loader.run_pipeline()

        if df is None:
            print("Data loading failed or data is not regular.")
            return

        # Univariate analysis (report paths are generated inside the class)
        ua = UnivariateAnalysis(df, target_col=self.target_col, index_col=self.index_col)
        ua.plot_distribution()
        ua.check_distribution_stats()
        ua.check_missing_values()
        ua.detect_outliers()

        # Lag correlation (ACF/PACF)
        corr = Correlation(df=df, target_col=self.target_col, lags=self.lags)
        corr.acf_plot()
        corr.pacf_plot()

        print("Dynamic time series analysis pipeline completed.")

# Example usage:
if __name__ == "__main__":
    tsa = DynamicTSA(
        filepath="data/bitcoin_price.csv",
        target_col="Close",
        index_col="Date",
        lags=30
    )
    tsa.run()
```
dynamicts/lag_correlation.py
ADDED
@@ -0,0 +1,168 @@
```python
"""
lag_correlation.py

This module provides the Correlation class for analyzing and visualizing autocorrelation (ACF)
and partial autocorrelation (PACF) in time series data. It supports logging results and plots to HTML reports.
"""

import pandas as pd
from datetime import datetime
import os
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import matplotlib.pyplot as plt
from dynamicts.report_generator import log_plot_to_html_report, log_message_to_html_report
from dynamicts.data_loader import DataLoader

class Correlation:
    """
    A class for computing and visualizing autocorrelation and partial autocorrelation
    for time series data.
    """
    def __init__(self, df: pd.DataFrame = None, target_col: str = None, lags: int = 20):
        """
        Initialize the Correlation class.

        Args:
            df (pd.DataFrame, optional): DataFrame containing the time series data.
            target_col (str, optional): Name of the column to analyze.
            lags (int, optional): Number of lags to use for correlation plots.

        Raises:
            TypeError: If df is not a pandas DataFrame.
            ValueError: If target_col is not provided or not found in df.
        """
        self.df = df
        self.target_col = target_col
        self.lags = lags
        self.date_col = None  # will be set if df is valid

        # If df is provided, validate it
        if self.df is not None:
            if not isinstance(self.df, pd.DataFrame):
                raise TypeError("Expected a pandas DataFrame for `df`.")

            # Save date_col if index name is present
            self.date_col = self.df.index.name or "index"

            # If target_col is missing
            if not self.target_col:
                raise ValueError("`target_col` must be provided when passing a DataFrame.")

            # Validate column name case-insensitively
            column_map = {col.lower(): col for col in self.df.columns}
            target_col_lower = self.target_col.lower()

            if target_col_lower not in column_map:
                raise ValueError(
                    f"Target column '{self.target_col}' not found in DataFrame. "
                    f"Available columns: {self.df.columns.tolist()}"
                )

            self.target_col = column_map[target_col_lower]

        # Generate report path ONCE per instance
        root_dir = os.path.abspath(os.curdir)
        report_root = os.path.join(root_dir, "reports")
        os.makedirs(report_root, exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        report_name = f"correlation_report_{timestamp}.html"
        self.report_path = os.path.join(report_root, report_name)

    def acf_plot(self, data: pd.Series = None, lags: int = None, save: bool = True):
        """
        Plot the autocorrelation function (ACF) for the given time series.

        Args:
            data (pd.Series, optional): Time series data to analyze. If None, uses initialized data.
            lags (int, optional): Number of lags to plot. Defaults to instance lags.
            save (bool, optional): Whether to save the plot to an HTML report.

        Returns:
            matplotlib.figure.Figure: The generated ACF plot.

        Raises:
            ValueError: If no data is provided.
        """
        series = data
        if series is None and self.df is not None:
            series = self.df[self.target_col]

        if series is None:
            raise ValueError("No data provided for ACF plot. Pass `data` or instantiate with DataFrame and target_col.")

        lags = lags if lags is not None else self.lags
        title = f"Autocorrelation Plot (lags={lags})"

        fig, ax = plt.subplots(figsize=(14, 8))
        plot_acf(series, lags=lags, ax=ax)
        ax.set_title(title)

        if save:
            log_plot_to_html_report(fig=fig, title=title, report_path=self.report_path)

        return fig

    def pacf_plot(self, data: pd.Series = None, lags: int = None, save: bool = True):
        """
        Plot the partial autocorrelation function (PACF) for the given time series.

        Args:
            data (pd.Series, optional): Time series data to analyze. If None, uses initialized data.
            lags (int, optional): Number of lags to plot. Defaults to instance lags.
            save (bool, optional): Whether to save the plot to an HTML report.

        Returns:
            matplotlib.figure.Figure: The generated PACF plot.

        Raises:
            ValueError: If no data is provided.
        """
        series = data
        if series is None and self.df is not None:
            series = self.df[self.target_col]

        if series is None:
            raise ValueError("No data provided for PACF plot. Pass `data` or instantiate with DataFrame and target_col.")
        lags = lags if lags is not None else self.lags
        title = f"Partial Autocorrelation Plot (lags={lags})"

        fig, ax = plt.subplots(figsize=(12, 6))
        plot_pacf(series, lags=lags, ax=ax)
        ax.set_title(title)

        if save:
            log_plot_to_html_report(fig=fig, title=title, report_path=self.report_path)

        return fig

    def run_lag_correlation(self, data: pd.Series = None, lags: int = None):
        """
        Run both ACF and PACF plots and log them to the report.

        Args:
            data (pd.Series, optional): Time series data to analyze. If None, uses initialized data.
            lags (int, optional): Number of lags to plot. Defaults to instance lags.

        Returns:
            dict: Dictionary with ACF and PACF figures.
        """
        acf_fig = self.acf_plot(data=data, lags=lags, save=True)
        pacf_fig = self.pacf_plot(data=data, lags=lags, save=True)
        return {
            "acf_fig": acf_fig,
            "pacf_fig": pacf_fig
        }

if __name__ == "__main__":
    # Load the data
    loader = DataLoader(filepath="data/complaints.csv", index_col="date")
    df = loader.run_pipeline()

    if df is not None:
        # ✅ Option 1: Instance-based usage (uses internal config)
        print("Using instance-based plotting...")
        corr_instance = Correlation(df=df, target_col="complaints", lags=30)
        corr_instance.acf_plot()
        corr_instance.pacf_plot()

        # ✅ Option 2: Standalone-style usage (no target_col needed, just a Series)
        print("Using standalone-style plotting...")
        corr_flex = Correlation()  # No args needed
        # corr_flex.acf_plot(data=df["revenue"], lags=30)
        # corr_flex.pacf_plot(data=df["revenue"], lags=30)
```
dynamicts/report_generator.py
ADDED
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import base64
import io
from datetime import datetime
import os

def log_plot_to_html_report(fig, title, report_path):
    """
    Logs a matplotlib figure to an HTML report file, saving in the root 'reports' directory.

    Parameters
    ----------
    fig : matplotlib.figure.Figure
        The matplotlib figure to be saved.
    title : str
        The title for the plot section in the report.
    report_path : str
        Path of the HTML report file to create or append to.
    """
    # Ensure the reports directory exists (guard against a bare filename with no dirname)
    os.makedirs(os.path.dirname(report_path) or ".", exist_ok=True)

    # Save figure to a BytesIO buffer
    buf = io.BytesIO()
    fig.savefig(buf, format='png', bbox_inches='tight')
    buf.seek(0)

    # Convert image to base64 string
    encoded = base64.b64encode(buf.read()).decode('utf-8')
    buf.close()

    # HTML content
    html_block = f"""
    <h2>{title}</h2>
    <img src="data:image/png;base64,{encoded}" style="max-width:100%; height:auto;">
    <p><em>Generated on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</em></p>
    <hr>
    """

    # Write or append to HTML file
    if not os.path.exists(report_path):
        with open(report_path, "w", encoding="utf-8") as f:
            f.write("<html><head><title>Time Series Analysis Report</title></head><body>")
            f.write(html_block)
    else:
        with open(report_path, "a", encoding="utf-8") as f:
            f.write(html_block)

def log_message_to_html_report(message, title, report_path):
    """
    Logs a text message to an HTML report file at the given report_path.
    """
    # Ensure the reports directory exists
    os.makedirs(os.path.dirname(report_path) or ".", exist_ok=True)

    html_block = f"""
    <h2>{title}</h2>
    <p style="font-family: monospace; white-space: pre-wrap;">{message}</p>
    <p><em>Logged on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</em></p>
    <hr>
    """

    if not os.path.exists(report_path):
        with open(report_path, "w", encoding="utf-8") as f:
            f.write("<html><head><title>Time Series Analysis Report</title></head><body>")
            f.write(html_block)
    else:
        with open(report_path, "a", encoding="utf-8") as f:
            f.write(html_block)
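A quick sanity check of these helpers, sketched under the assumption that any matplotlib figure and writable path will do; the toy data and the "reports/demo_report.html" path below are illustrative, not part of the package:

import matplotlib.pyplot as plt
from dynamicts.report_generator import log_plot_to_html_report, log_message_to_html_report

# Hypothetical demo: build a toy figure and log it plus a note to one report file
fig, ax = plt.subplots()
ax.plot([1, 2, 3], [2, 4, 8])
report_path = "reports/demo_report.html"  # illustrative path
log_plot_to_html_report(fig, title="Toy Series", report_path=report_path)
log_message_to_html_report("Three points, doubling each step.", title="Notes", report_path=report_path)
plt.close(fig)

Both calls append to the same file, so a single report accumulates plots and notes in the order they are logged.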
dynamicts/report_generator_sh.py
ADDED
@@ -0,0 +1,152 @@
import base64
import io
from datetime import datetime
import os


class Reports:
    def __init__(self, fig=None, title: str = None, message: str = None, data_filepath: str = None, report_name: str = "report.html", mod_name: str = "default_mod"):
        self.fig = fig
        self.title = title
        self.message = message
        self.mod_name = mod_name
        # Default data_filepath to current file's directory if not provided
        if data_filepath is None:
            self.data_filepath = os.path.dirname(__file__)
        else:
            self.data_filepath = data_filepath

        # Set up report directory and path
        base_name = os.path.splitext(os.path.basename(self.data_filepath))[0]
        root_dir = os.path.abspath(os.curdir)
        self.report_root = os.path.join(root_dir, "reports")
        os.makedirs(self.report_root, exist_ok=True)
        self.report_path = os.path.join(self.report_root, self.mod_name + report_name)

        # Check if the report file exists
        self.report_exists = os.path.exists(self.report_path)

        # Optionally, create the file if it doesn't exist
        if not self.report_exists:
            with open(self.report_path, "w", encoding="utf-8") as f:
                f.write("")  # Create an empty file

    def log_plot_to_md_report(self, report_name: str = "report.md") -> None:
        """
        Logs a matplotlib figure to a Markdown report file, saving in a directory based on the data file name.
        """
        # Check if the report file exists, create if not
        if not os.path.exists(self.report_path):
            with open(self.report_path, "w", encoding="utf-8") as f:
                f.write("")
            self.report_exists = True

        # Save plot to memory
        buf = io.BytesIO()
        self.fig.savefig(buf, format="png", bbox_inches="tight")
        buf.seek(0)

        # Convert to base64
        encoded = base64.b64encode(buf.read()).decode("utf-8")
        buf.close()

        # Markdown embed
        markdown_block = f"""
## {self.title}
![{self.title}](data:image/png;base64,{encoded})
<sub>Generated on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</sub>
---
"""

        # Append to report file
        with open(self.report_path, "a", encoding="utf-8") as f:
            f.write(markdown_block)

    def log_plot_to_html_report(self, report_name: str = "report.html") -> None:
        """
        Logs a matplotlib figure to an HTML report file, saving in a directory based on the data file name.
        """
        # Check if the report file exists, create if not
        if not os.path.exists(self.report_path):
            with open(self.report_path, "w", encoding="utf-8") as f:
                f.write("<html><head><title>Time Series Analysis Report</title></head><body>")
            self.report_exists = True

        # Save figure to a BytesIO buffer
        buf = io.BytesIO()
        self.fig.savefig(buf, format='png', bbox_inches='tight')
        buf.seek(0)

        # Convert image to base64 string
        encoded = base64.b64encode(buf.read()).decode('utf-8')
        buf.close()

        # HTML content
        html_block = f"""
        <h2>{self.title}</h2>
        <img src="data:image/png;base64,{encoded}" style="max-width:100%; height:auto;">
        <p><em>Generated on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</em></p>
        <hr>
        """

        # Write or append to HTML file
        with open(self.report_path, "a", encoding="utf-8") as f:
            f.write(html_block)

    def log_message_to_html_report(self, report_name: str = "report.html") -> None:
        """
        Logs a text message to an HTML report file, saving in a directory based on the data file name.
        """
        # Check if the report file exists, create if not
        if not os.path.exists(self.report_path):
            with open(self.report_path, "w", encoding="utf-8") as f:
                f.write("<html><head><title>Time Series Analysis Report</title></head><body>")
            self.report_exists = True

        html_block = f"""
        <h2>{self.title}</h2>
        <p style="font-family: monospace; white-space: pre-wrap;">{self.message}</p>
        <p><em>Logged on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</em></p>
        <hr>
        """

        with open(self.report_path, "a", encoding="utf-8") as f:
            f.write(html_block)

    @staticmethod
    def append_all_reports(combined_report: str = "combined_report.html") -> None:
        """
        Appends the contents of all HTML reports in the reports directory into one combined report.
        Only runs if the reports directory exists and is not empty.
        """
        report_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), "reports")
        combined_path = os.path.join(report_dir, combined_report)

        # Check if reports directory exists and is not empty
        if not os.path.isdir(report_dir):
            print(f"Reports directory '{report_dir}' does not exist.")
            return
        report_files = [
            f for f in os.listdir(report_dir)
            if f.endswith(".html") and f != combined_report
        ]
        if not report_files:
            print(f"No HTML reports found in '{report_dir}'.")
            return

        contents = []
        for report_file in report_files:
            path = os.path.join(report_dir, report_file)
            if os.path.exists(path):
                with open(path, "r", encoding="utf-8") as f:
                    content = f.read()
                # Remove opening and closing HTML/body tags to avoid nesting
                content = content.replace("<html>", "").replace("</html>", "")
                content = content.replace("<body>", "").replace("</body>", "")
                contents.append(f"<h1>{report_file}</h1>\n" + content)

        # Write combined content to new file
        with open(combined_path, "w", encoding="utf-8") as f:
            f.write("<html><head><title>Combined Report</title></head><body>\n")
            f.write("<hr>\n".join(contents))
            f.write("\n</body></html>")
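A hedged sketch of driving this class-based variant; the "demo_" prefix and the figure are made up for illustration. The mod_name prefix and report_name combine into a per-module file (here reports/demo_report.html), and append_all_reports then merges every per-module HTML file in reports/:

import matplotlib.pyplot as plt
from dynamicts.report_generator_sh import Reports

# Hypothetical usage: one Reports instance per module prefix
fig, ax = plt.subplots()
ax.plot(range(10))
rep = Reports(fig=fig, title="Trend", mod_name="demo_")  # writes reports/demo_report.html
rep.log_plot_to_html_report()
rep.message = "Linear ramp from 0 to 9."
rep.log_message_to_html_report()
plt.close(fig)

# Merge all per-module reports into reports/combined_report.html
Reports.append_all_reports()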
dynamicts/stationarity.py
ADDED
@@ -0,0 +1,281 @@
"""
stationarity.py
---------------
Provides tools for stationarity diagnostics and visualization for time series data,
including rolling statistics, Augmented Dickey-Fuller (ADF) and KPSS tests, and seasonal decomposition.
"""

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, Markdown
from datetime import datetime
from statsmodels.tsa.stattools import adfuller, kpss
from statsmodels.tsa.seasonal import seasonal_decompose
from dynamicts.report_generator import log_plot_to_html_report, log_message_to_html_report

class Stationaritychecker:
    def __init__(self, df: pd.DataFrame, target_col: str, window: int = 7) -> None:
        """
        Initialize the Stationaritychecker with a DataFrame, target column, and window size.

        Args:
            df (pd.DataFrame): The DataFrame containing the time series data.
            target_col (str): The column to analyze for stationarity.
            window (int): Window size for rolling calculations (default: 7).
        """
        # Clean the target column: remove % if present and convert to float
        series = df[target_col].replace('%', '', regex=True)
        series = pd.to_numeric(series, errors='coerce')
        self.series = series
        self.window = window

        # Generate report path ONCE per instance
        root_dir = os.path.abspath(os.curdir)
        report_root = os.path.join(root_dir, "reports")
        os.makedirs(report_root, exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        report_name = f"stationarity_report_{timestamp}.html"
        self.report_path = os.path.join(report_root, report_name)

    def rolling_statistics(self, window=None) -> dict:
        """
        Compute and plot rolling mean, standard deviation, and covariance for a time series.

        Args:
            window (int): Window size for rolling calculations (default: self.window).

        Returns:
            dict: Dictionary containing rolling statistics, the figure, and a summary message.
        """
        try:
            win = window if window is not None else self.window
            roll_mean = self.series.rolling(win).mean()
            roll_std = self.series.rolling(win).std()
            roll_cov = self.series.rolling(win).cov(self.series.shift(1))

            fig, ax = plt.subplots(figsize=(15, 6))
            ax.plot(self.series.index, self.series, label="Original", alpha=0.5)
            ax.plot(roll_mean.index, roll_mean, label="Rolling Mean", color='blue')
            ax.plot(roll_std.index, roll_std, label="Rolling Std Dev", color='green')
            ax.plot(roll_cov.index, roll_cov, label="Rolling Covariance", color='purple')
            ax.set_title(f"Rolling Statistics (Window={win})")
            ax.legend()
            plt.xticks(rotation=45)
            plt.tight_layout()

            # Use .name if Series, else fallback
            series_name = getattr(self.series, 'name', None)
            if not series_name:
                series_name = 'series'
            log_plot_to_html_report(fig, title=f"Rolling Statistics of {series_name}", report_path=self.report_path)
            plt.close(fig)

            msg = f"""
Rolling statistics over window = {win} computed for:
1. Mean
2. Standard Deviation
3. Covariance (with lagged series)

<b>Tip:</b> Rolling metrics help identify trends, volatility, and local stability.
"""
            log_message_to_html_report(message=msg.strip(), title="Rolling Statistics Summary", report_path=self.report_path)

            return {
                "window": win,
                "rolling_mean": roll_mean,
                "rolling_std": roll_std,
                "rolling_cov": roll_cov,
                "fig": fig,
                "message": msg.strip()
            }
        except Exception as e:
            log_message_to_html_report(f"Error in rolling_statistics: {e}", title="Rolling Statistics Error", report_path=self.report_path)
            print(f"Exception in rolling_statistics: {e}")
            return {}

    def adf_test(self, autolag='AIC'):
        """
        Perform Augmented Dickey-Fuller test and return results as a dictionary.

        Args:
            autolag (str): Method to use when automatically determining the lag (default: 'AIC').

        Returns:
            dict: Results of the ADF test.
        """
        try:
            result = adfuller(self.series.dropna(), autolag=autolag)
            output = {
                'ADF Statistic': result[0],
                'p-value': result[1],
                'Num Lags Used': result[2],
                'Num Observations Used': result[3],
                'Critical Values': result[4],
                'IC Best': result[5]
            }
            return output
        except Exception as e:
            log_message_to_html_report(f"Error in adf_test: {e}", title="ADF Test Error", report_path=self.report_path)
            print(f"Exception in adf_test: {e}")
            return {}

    def kpss_test(self, regression='c', lags='auto'):
        """
        Perform KPSS test and return results as a dictionary.

        Args:
            regression (str): Type of regression for the test ('c' or 'ct').
            lags (str or int): Number of lags to use (default: 'auto').

        Returns:
            dict: Results of the KPSS test.
        """
        try:
            # Recent statsmodels takes `nlags` rather than the deprecated `lags`
            result = kpss(self.series.dropna(), regression=regression, nlags=lags)
            output = {
                'KPSS Statistic': result[0],
                'p-value': result[1],
                'Num Lags Used': result[2],
                'Critical Values': result[3]
            }
            return output
        except Exception as e:
            log_message_to_html_report(f"Error in kpss_test: {e}", title="KPSS Test Error", report_path=self.report_path)
            print(f"Exception in kpss_test: {e}")
            return {}

    def print_adf_summary(self, autolag='AIC'):
        """
        Return a Markdown-formatted summary of the ADF test results.

        Args:
            autolag (str): Method for lag selection.

        Returns:
            str: Markdown-formatted summary.
        """
        try:
            result = self.adf_test(autolag)
            if not result:
                return "ADF test failed. See logs for details."
            summary_md = f"""
**Augmented Dickey-Fuller Test Results**

- **ADF Statistic:** `{result['ADF Statistic']:.4f}`
- **p-value:** `{result['p-value']:.4g}`
- **Num Lags Used:** `{result['Num Lags Used']}`
- **Num Observations Used:** `{result['Num Observations Used']}`

**Critical Values:**
"""
            for key, value in result['Critical Values'].items():
                summary_md += f"\n- `{key}`: `{value:.4f}`"
            summary_md += f"\n- **IC Best:** `{result['IC Best']:.4f}`"

            if result["p-value"] <= 0.05:
                summary_md += "\n\n**-->** ✅ **Reject the null hypothesis:** The time series is **stationary**."
            else:
                summary_md += "\n\n**-->** ⚠️ **Fail to reject the null hypothesis:** The time series is **non-stationary**."
            log_message_to_html_report(message=summary_md.strip(), title="ADF Test Summary", report_path=self.report_path)
            return summary_md
        except Exception as e:
            log_message_to_html_report(f"Error in print_adf_summary: {e}", title="ADF Summary Error", report_path=self.report_path)
            print(f"Exception in print_adf_summary: {e}")
            return "ADF summary failed. See logs for details."

    def plot_seasonal_decompose(
        self,
        model: str = 'additive',
        period: int = 12,
        title: str = 'Seasonal Decomposition'
    ) -> plt.Figure:
        """
        Plot and log the seasonal decomposition of a time series.

        Args:
            model (str): 'additive' or 'multiplicative'.
            period (int): The period for decomposition (e.g., 12 for monthly data).
            title (str): Plot title.

        Returns:
            plt.Figure: The matplotlib figure object.
        """
        try:
            decomposition = seasonal_decompose(self.series.dropna(), model=model, period=period)
            fig = decomposition.plot()
            fig.set_size_inches(10, 8)
            plt.suptitle(title)
            plt.tight_layout()
            log_plot_to_html_report(fig, title=title, report_path=self.report_path)
            plt.close(fig)
            return fig
        except Exception as e:
            log_message_to_html_report(f"Error in plot_seasonal_decompose: {e}", title="Seasonal Decompose Error", report_path=self.report_path)
            print(f"Exception in plot_seasonal_decompose: {e}")
            return None

    def run_stationarity_pipeline(self, window: int = 12, adf_autolag: str = 'AIC', decompose_model: str = 'additive', decompose_period: int = 12):
        """
        Run a pipeline: rolling average visual, ADF summary, and seasonal decomposition.
        Args:
            window: int, window for rolling statistics.
            adf_autolag: str, autolag parameter for ADF test.
            decompose_model: str, 'additive' or 'multiplicative' for seasonal decomposition.
            decompose_period: int, period for seasonal decomposition (default: 12).
        """
        try:
            print("="*60)
            print("**1. Augmented Dickey-Fuller Test**".center(60))
            print("="*60)
            display(Markdown("### 1. Augmented Dickey-Fuller Test"))
            display(Markdown(self.print_adf_summary(autolag=adf_autolag)))

            print("\n" + "="*60)
            print("**2. Seasonal Decomposition Visual**".center(60))
            print("="*60)
            display(Markdown("### 2. Seasonal Decomposition Visual"))
            display(self.plot_seasonal_decompose(model=decompose_model, period=decompose_period))

            print("\n" + "="*60)
            print("**3. Rolling Statistics Visual**".center(60))
            print("="*60)
            display(Markdown("### 3. Rolling Statistics Visual"))
            rolling_stats = self.rolling_statistics(window=window)
            display(Markdown(rolling_stats.get('message', '')))
            display(rolling_stats.get('fig', None))
        except Exception as e:
            log_message_to_html_report(f"Error in stationarity_pipeline: {e}", title="Stationarity Pipeline Error", report_path=self.report_path)
            print(f"Exception in stationarity_pipeline: {e}")

if __name__ == "__main__":
    # Example: Load a time series from CSV
    # Replace with your actual file and column names
    df = pd.read_csv("data/complaints.csv", parse_dates=["date"], index_col="date")
    # Clean the column if needed (remove % and convert to float)
    df["complaints"] = pd.to_numeric(df["complaints"].replace('%', '', regex=True), errors='coerce')

    # Create an instance of Stationaritychecker (it expects the DataFrame plus target column)
    checker = Stationaritychecker(df=df, target_col="complaints", window=12)

    # Run rolling statistics
    rolling_stats = checker.rolling_statistics()
    print("Rolling statistics computed and logged.")

    # Run ADF test and print summary
    adf_summary = checker.print_adf_summary()
    print(adf_summary)

    # Run KPSS test
    kpss_result = checker.kpss_test()
    print("KPSS test result:", kpss_result)

    # Plot and log seasonal decomposition
    checker.plot_seasonal_decompose(model='additive', period=12)

    # Or run the full pipeline
    # checker.run_stationarity_pipeline(window=12, adf_autolag='AIC', decompose_model='additive', decompose_period=12)
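Because ADF and KPSS test opposite null hypotheses (ADF's null is a unit root, KPSS's null is stationarity), reading them jointly is more informative than either alone. A sketch of that joint reading, assuming a `checker` instance built as in the `__main__` block above and the conventional 0.05 threshold:

# Illustrative joint ADF/KPSS interpretation (not part of the uploaded module)
adf = checker.adf_test()
kp = checker.kpss_test()

adf_stationary = adf.get("p-value", 1.0) <= 0.05   # ADF rejects a unit root
kpss_stationary = kp.get("p-value", 0.0) > 0.05    # KPSS fails to reject stationarity

if adf_stationary and kpss_stationary:
    print("Both tests agree: series looks stationary.")
elif not adf_stationary and not kpss_stationary:
    print("Both tests agree: series looks non-stationary; consider differencing.")
else:
    print("Tests disagree: possible trend- or difference-stationarity; inspect further.")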
dynamicts/volatility_check.py
ADDED
@@ -0,0 +1,195 @@
"""
volatility_check.py

This module provides the VolatilityChecker class for analyzing and visualizing time series volatility
using ARCH and GARCH models. It supports logging results and plots to HTML reports.
"""

import os
from datetime import datetime
import pandas as pd
import matplotlib.pyplot as plt
from arch import arch_model
from dynamicts.report_generator import log_plot_to_html_report, log_message_to_html_report

class VolatilityChecker:
    """
    A class for checking and visualizing volatility in time series data using ARCH and GARCH models.
    """

    def __init__(self, df: pd.DataFrame = None, target_col: str = None):
        """
        Initialize the VolatilityChecker.

        Args:
            df (pd.DataFrame, optional): DataFrame containing the time series data.
            target_col (str, optional): Name of the column to analyze for volatility.

        Raises:
            TypeError: If df is not a pandas DataFrame.
            ValueError: If target_col is not provided or not found in df.
            RuntimeError: For any other initialization errors.
        """
        self.df = None
        self.series = None

        # Generate report path ONCE per instance
        root_dir = os.path.abspath(os.curdir)
        report_root = os.path.join(root_dir, "reports")
        os.makedirs(report_root, exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        report_name = f"volatility_report_{timestamp}.html"
        self.report_path = os.path.join(report_root, report_name)

        try:
            if df is not None:
                if not isinstance(df, pd.DataFrame):
                    raise TypeError("Expected a pandas DataFrame for `df`.")

                if not target_col:
                    raise ValueError("`target_col` must be provided when passing a DataFrame.")

                column_map = {col.lower(): col for col in df.columns}
                target_col_lower = target_col.lower()

                if target_col_lower not in column_map:
                    raise ValueError(f"Column '{target_col}' not found. Available: {list(df.columns)}")

                self.df = df
                self.series = df[column_map[target_col_lower]].pct_change().dropna() * 100
        except Exception as e:
            raise RuntimeError(f"Initialization failed in VolatilityChecker: {str(e)}") from e

    def arch_volatility(self, data: pd.Series = None, save: bool = True):
        """
        Compute and plot volatility using an ARCH(1) model.

        Args:
            data (pd.Series, optional): Time series data to analyze. If None, uses initialized data.
            save (bool, optional): Whether to save the plot and summary to an HTML report.

        Returns:
            matplotlib.figure.Figure: The generated volatility plot.

        Raises:
            RuntimeError: If computation fails.
        """
        try:
            series = data.pct_change().dropna() * 100 if data is not None else self.series
            if series is None:
                raise ValueError("No time series data provided. Use `data=` or initialize the class with a DataFrame.")

            model = arch_model(series, vol="ARCH", p=1)
            res = model.fit(disp='off')
            cond_vol = res.conditional_volatility

            # Plot
            fig, ax = plt.subplots(figsize=(14, 6))
            ax.plot(cond_vol, label="ARCH(1) Volatility", color='orange')
            ax.set_title("ARCH(1) Estimated Volatility")
            ax.set_ylabel("Volatility")
            ax.legend()

            if save:
                log_plot_to_html_report(fig, title="ARCH Volatility", report_path=self.report_path)

                message = f"""ARCH(1) Volatility Report:
- Mean: {cond_vol.mean():.4f}%
- Max: {cond_vol.max():.4f}%
- Min: {cond_vol.min():.4f}%
- Std Dev: {cond_vol.std():.4f}%
- Spikes (>1.5×mean): {(cond_vol > 1.5 * cond_vol.mean()).sum()}
"""
                log_message_to_html_report(message, title="ARCH Volatility Summary", report_path=self.report_path)

            return fig

        except Exception as e:
            raise RuntimeError(f"ARCH volatility computation failed: {str(e)}") from e

    def garch_volatility(self, data: pd.Series = None, save: bool = True):
        """
        Compute and plot volatility using a GARCH(1,1) model.

        Args:
            data (pd.Series, optional): Time series data to analyze. If None, uses initialized data.
            save (bool, optional): Whether to save the plot and summary to an HTML report.

        Returns:
            matplotlib.figure.Figure: The generated volatility plot.

        Raises:
            RuntimeError: If computation fails.
        """
        try:
            series = data.pct_change().dropna() * 100 if data is not None else self.series
            if series is None:
                raise ValueError("No time series data provided. Use `data=` or initialize the class with a DataFrame.")

            model = arch_model(series, vol="GARCH", p=1, q=1)
            res = model.fit(disp='off')
            cond_vol = res.conditional_volatility

            # Plot
            fig, ax = plt.subplots(figsize=(14, 6))
            ax.plot(cond_vol, label="GARCH(1,1) Volatility", color='purple')
            ax.set_title("GARCH(1,1) Estimated Volatility")
            ax.set_ylabel("Volatility")
            ax.legend()

            if save:
                log_plot_to_html_report(fig, title="GARCH Volatility", report_path=self.report_path)

                message = f"""GARCH(1,1) Volatility Report:
- Mean: {cond_vol.mean():.4f}%
- Max: {cond_vol.max():.4f}%
- Min: {cond_vol.min():.4f}%
- Std Dev: {cond_vol.std():.4f}%
- Spikes (>1.5×mean): {(cond_vol > 1.5 * cond_vol.mean()).sum()}
"""
                log_message_to_html_report(message, title="GARCH Volatility Summary", report_path=self.report_path)

            return fig

        except Exception as e:
            raise RuntimeError(f"GARCH volatility computation failed: {str(e)}") from e

    def run_volatility_pipeline(self, data: pd.Series = None):
        """
        Run both ARCH and GARCH volatility analysis and log results to the report.

        Args:
            data (pd.Series, optional): Time series data to analyze. If None, uses initialized data.

        Returns:
            dict: Dictionary with ARCH and GARCH volatility figures.
        """
        print("="*60)
        print("**1. ARCH(1) Volatility Analysis**".center(60))
        print("="*60)
        arch_fig = self.arch_volatility(data=data, save=True)

        print("\n" + "="*60)
        print("**2. GARCH(1,1) Volatility Analysis**".center(60))
        print("="*60)
        garch_fig = self.garch_volatility(data=data, save=True)

        return {
            "arch_fig": arch_fig,
            "garch_fig": garch_fig
        }

if __name__ == "__main__":
    from dynamicts.data_loader import DataLoader

    loader = DataLoader(filepath="data/complaints.csv", index_col="date")
    df = loader.run_pipeline()

    # Option 1: Use instance-based approach
    vc = VolatilityChecker(df=df, target_col="complaints")
    vc.arch_volatility()
    vc.garch_volatility()

    # Option 2: Use flexible function-style
    vc2 = VolatilityChecker()
    # vc2.arch_volatility(data=df["revenue"])
    # vc2.garch_volatility(data=df["revenue"])
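For a self-contained smoke test that does not depend on `data/complaints.csv`, the checker can also be fed a raw price series directly; the random-walk series below is purely illustrative:

import numpy as np
import pandas as pd
from dynamicts.volatility_check import VolatilityChecker

# Hypothetical data: a 500-day random-walk "price" series
rng = np.random.default_rng(0)
prices = pd.Series(100 + np.cumsum(rng.normal(0, 1, 500)),
                   index=pd.date_range("2020-01-01", periods=500, freq="D"))

vc = VolatilityChecker()
vc.arch_volatility(data=prices)    # ARCH(1) on percentage returns
vc.garch_volatility(data=prices)   # GARCH(1,1) on percentage returns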
tests/__init__.py
ADDED
File without changes
tests/integration/__init__.py
ADDED
File without changes
tests/integration/test_int.py
ADDED
@@ -0,0 +1,3 @@
# Dummy test cases
def test_null():
    assert True
tests/unit/__init__.py
ADDED
File without changes
tests/unit/test_unit.py
ADDED
@@ -0,0 +1,117 @@
# Import the necessary libraries
import os
import json
import pytest
import pandas as pd
from dynamicts.data_loader import DataLoader
from dynamicts.analysis import UnivariateAnalysis

# Dummy test cases
def test_null():
    assert True

# Constants
data_url = "https://raw.githubusercontent.com/Chinar-Quantum-AI-Ltd/public_datasets/main/bitcoin_price.csv"

loader = DataLoader(filepath=data_url, index_col="Date")

# Tests for DataLoader
def test_load_success():
    df = loader.load()
    # check dataframe loaded
    assert isinstance(df, pd.DataFrame)
    # columns are lower case
    assert all(col == col.lower() for col in df.columns)
    # index is lower case
    assert df.index.name == "date"

def test_load_failure():
    url = "https://raw.githubusercontent.com/Chinar-Quantum-AI-Ltd/public_datasets/main/price.csv"  # invalid url for testing
    loader = DataLoader(
        filepath=url, index_col="Date"
    )
    with pytest.raises(ValueError):
        loader.load()

def test_is_regular():
    loader.load()
    assert loader.is_regular() is True

def test_is_regular_false(tmp_path):
    # Create irregular CSV
    irregular = tmp_path / "irregular.csv"
    # create dummy irregular data
    dts = pd.to_datetime(["2021-01-01", "2021-01-02", "2021-01-04", "2021-01-07"])
    df_irreg = pd.DataFrame({"date": dts, "y": [1, 2, 3, 4]}).set_index("date")
    df_irreg.to_csv(irregular)
    loader = DataLoader(filepath=str(irregular), index_col="date")
    loader.load()
    assert loader.is_regular() is False

def test_save_metadata(tmp_path, monkeypatch):
    # Monkey-patch working dir to temp path for clean metadata
    monkeypatch.chdir(tmp_path)
    df = loader.load()
    # Save metadata (writes to ./metadata/<filename>_meta.json)
    loader.save_metadata()

    # Verify expected file exists
    expected_filename = os.path.splitext(os.path.basename(data_url))[0] + "_meta.json"
    meta_path = tmp_path / "metadata" / expected_filename

    assert meta_path.exists()

    # Check metadata content
    with open(meta_path) as f:
        meta = json.load(f)

    assert meta["columns"] == list(df.columns)
    # JSON stores the shape as a list, so compare against the DataFrame's tuple explicitly
    assert tuple(meta["shape"]) == df.shape
    assert meta["index_name"] == df.index.name

# # Some test cases for analysis.py script

# # Tests for univariate analysis module
# def test_distribution_stats_and_missing(monkeypatch, tmp_path):
#     analysis = UnivariateAnalysis(
#         filepath=data_url,
#         target_col="Close",
#         index_col="Date"
#     )
#     # test distribution stats
#     stats = analysis.check_distribution_stats()
#     assert "skewness" in stats
#     assert "kurtosis" in stats
#     assert isinstance(stats["skewness"], float)
#     assert isinstance(stats["kurtosis"], float)

#     # # test missing values
#     # missing = analysis.check_missing_values()
#     # assert "missing_count" in missing
#     # assert "missing_percentage" in missing
#     # assert isinstance(missing["missing_percentage"], float)

# def test_outlier_detection_and_rolling():
#     analysis = UnivariateAnalysis(
#         filepath=data_url,
#         target_col="Close",
#         index_col="Date"
#     )
#     # tests for outlier detection
#     outliers = analysis.detect_outliers(method="both", plot=False)
#     assert "outliers_detected" in outliers
#     assert outliers["outliers_detected"] >= 0

#     # test for rolling stat
#     rolling = analysis.measure_rolling_statistics(window=7)
#     assert "rolling_mean" in rolling
#     assert isinstance(rolling["rolling_mean"], pd.Series)
#     assert rolling["rolling_mean"].shape == analysis.df["close"].shape
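One test that could round out this suite, sketched here rather than part of the upload: exercising `log_message_to_html_report` against pytest's `tmp_path` fixture so no real `reports/` directory is touched.

from dynamicts.report_generator import log_message_to_html_report

# Hypothetical additional test, not included in the uploaded file
def test_log_message_creates_report(tmp_path):
    report = tmp_path / "reports" / "unit_report.html"
    log_message_to_html_report("hello", title="Greeting", report_path=str(report))
    assert report.exists()
    assert "hello" in report.read_text(encoding="utf-8")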