# utils/helper.py — created by eagle0504 (commit 382f27a, verified)
import pandas as pd
import plotly.graph_objects as go
import streamlit as st
import yfinance as yf
from plotly.subplots import make_subplots
from scipy.stats import norm
import numpy as np
def calculate_macd(
    data: pd.DataFrame,
    short_window: int = 12,
    long_window: int = 26,
    signal_window: int = 9,
) -> pd.DataFrame:
    """
    Add MACD and signal-line columns to a price DataFrame.

    The MACD is the difference between a short-span and a long-span exponential
    moving average (EMA) of the 'Close' column; the signal line is an EMA of the
    MACD itself. All EMAs use the recursive form (``adjust=False``).

    Parameters:
        data (pd.DataFrame): Price data; must expose a 'Close' column.
        short_window (int): Span of the short EMA. Default is 12.
        long_window (int): Span of the long EMA. Default is 26.
        signal_window (int): Span of the signal-line EMA. Default is 9.

    Returns:
        pd.DataFrame: The same DataFrame object that was passed in, modified in
        place with the columns 'MACD' and 'Signal_Line' added (or overwritten).
    """

    def _ema(series: pd.Series, span: int) -> pd.Series:
        # Recursive EMA so each value depends only on the previous EMA value.
        return series.ewm(span=span, adjust=False).mean()

    close = data.Close

    # MACD = fast EMA minus slow EMA of the closing price.
    data["MACD"] = _ema(close, short_window) - _ema(close, long_window)
    # The signal line smooths the MACD with its own EMA.
    data["Signal_Line"] = _ema(data["MACD"], signal_window)
    return data
def calculate_normalized_macd(
    data: pd.DataFrame,
    short_window: int = 12,
    long_window: int = 26,
    signal_window: int = 9,
) -> pd.DataFrame:
    """
    Add z-score-normalized MACD and signal-line columns to a price DataFrame.

    The raw MACD is the short-span EMA of 'Close' minus the long-span EMA, and
    the raw signal line is an EMA of that raw MACD. Each of the two series is
    then standardized independently: its own mean is subtracted and the result
    is divided by its own sample standard deviation.

    Parameters:
        data (pd.DataFrame): Price data; must expose a 'Close' column.
        short_window (int): Span of the short EMA. Default is 12.
        long_window (int): Span of the long EMA. Default is 26.
        signal_window (int): Span of the signal-line EMA. Default is 9.

    Returns:
        pd.DataFrame: The same DataFrame object that was passed in, modified in
        place with the columns 'MACD' and 'Signal_Line' holding the z-scores.
    """
    closing = data.Close

    # Raw indicator values (recursive EMAs, adjust=False).
    fast_ema = closing.ewm(span=short_window, adjust=False).mean()
    slow_ema = closing.ewm(span=long_window, adjust=False).mean()
    raw_macd = fast_ema - slow_ema
    # The signal line is derived from the *raw* MACD, before any normalization.
    raw_signal = raw_macd.ewm(span=signal_window, adjust=False).mean()

    # Standardize each series against its own mean and standard deviation.
    data["MACD"] = (raw_macd - raw_macd.mean()) / raw_macd.std()
    data["Signal_Line"] = (raw_signal - raw_signal.mean()) / raw_signal.std()
    return data
def calculate_percentile_macd(
    data: pd.DataFrame,
    short_window: int = 12,
    long_window: int = 26,
    signal_window: int = 9,
) -> pd.DataFrame:
    """
    Add percentile-scaled MACD and signal-line columns to a price DataFrame.

    The raw MACD (short-span EMA of 'Close' minus long-span EMA) and its signal
    line (EMA of the raw MACD) are each z-score normalized, passed through the
    standard normal CDF, and linearly rescaled from [0, 1] to [-100, +100].

    Parameters:
        data (pd.DataFrame): Price data; must expose a 'Close' column.
        short_window (int): Span of the short EMA. Default is 12.
        long_window (int): Span of the long EMA. Default is 26.
        signal_window (int): Span of the signal-line EMA. Default is 9.

    Returns:
        pd.DataFrame: The same DataFrame object that was passed in, modified in
        place with the columns 'MACD' and 'Signal_Line' holding the rescaled
        percentile values.
    """
    closing = data.Close

    # Raw indicator values (recursive EMAs, adjust=False).
    raw_macd = (
        closing.ewm(span=short_window, adjust=False).mean()
        - closing.ewm(span=long_window, adjust=False).mean()
    )
    raw_signal = raw_macd.ewm(span=signal_window, adjust=False).mean()

    for column, raw in (("MACD", raw_macd), ("Signal_Line", raw_signal)):
        # z-score against the series' own mean / standard deviation ...
        z_scores = (raw - raw.mean()) / raw.std()
        # ... then map through the normal CDF (0..1) and stretch to -100..+100
        # so the value reads as a signed position within the distribution.
        data[column] = norm.cdf(z_scores) * 200 - 100
    return data
def find_crossovers(
    df: pd.DataFrame, bullish_threshold: float, bearish_threshold: float
) -> pd.DataFrame:
    """
    Flag bullish and bearish MACD / signal-line crossovers in a 'Crossover' column.

    A row is bullish (+1) when the MACD is above the signal line, was strictly
    below it on the previous row, and the signal line is below
    ``bullish_threshold``. A row is bearish (-1) when the MACD is below the
    signal line, was strictly above it on the previous row, and the signal line
    is above ``bearish_threshold``. Every other row (including the first, which
    has no previous row) is 0.

    Parameters:
        df (pd.DataFrame): Data containing the columns 'MACD' and 'Signal_Line'.
        bullish_threshold (float): A bullish crossover is only marked while the
            signal line is below this value.
        bearish_threshold (float): A bearish crossover is only marked while the
            signal line is above this value.

    Returns:
        pd.DataFrame: The same DataFrame object that was passed in, modified in
        place with a 'Crossover' column of +1 / -1 / 0.
    """
    macd = df["MACD"]
    signal = df["Signal_Line"]
    prev_macd = macd.shift()
    prev_signal = signal.shift()

    # Default: no crossover on any row.
    df["Crossover"] = 0

    # MACD moved from below to above the signal line while the signal is still low.
    crossed_up = (
        (macd > signal) & (prev_macd < prev_signal) & (signal < bullish_threshold)
    )
    df.loc[df.index[crossed_up], "Crossover"] = 1

    # MACD moved from above to below the signal line while the signal is still high.
    crossed_down = (
        (macd < signal) & (prev_macd > prev_signal) & (signal > bearish_threshold)
    )
    df.loc[df.index[crossed_down], "Crossover"] = -1

    return df
def get_fundamentals(ticker: str):
    """
    Retrieve the fundamental financial statements for a stock ticker.

    The statements are read from a yfinance ``Ticker`` object, which obtains
    its data from Yahoo Finance.

    Parameters:
        ticker (str): The stock symbol to query.

    Returns:
        tuple: A 3-tuple ``(income_statement, balance_sheet, cash_flow)`` taken
        from the Ticker's ``income_stmt``, ``balance_sheet`` and ``cashflow``
        attributes, in that order.
    """
    stock = yf.Ticker(ticker)

    # Read each statement in turn; every attribute access may trigger a fetch.
    income_statement = stock.income_stmt
    balance_sheet = stock.balance_sheet
    cash_flow = stock.cashflow

    return income_statement, balance_sheet, cash_flow
def create_fig(data: pd.DataFrame, ticker: str) -> go.Figure:
    """
    Build a two-row Plotly figure: price candlesticks on top, MACD underneath.

    The top subplot shows the candlestick chart, four simple moving averages of
    the close (12, 26, 50 and 200 rows) and crossover markers placed at the
    closing price. The bottom subplot shows the MACD, its signal line and the
    same crossover markers placed at the MACD value.

    Parameters:
        data (pandas.DataFrame): Price data with 'Open', 'High', 'Low', 'Close'
            columns plus 'MACD', 'Signal_Line' and 'Crossover' columns computed
            beforehand. The columns 'MA12', 'MA26', 'MA50' and 'MA200' are added
            to this DataFrame in place.
        ticker (str): Stock symbol shown in the title of the top subplot.

    Returns:
        plotly.graph_objs._figure.Figure: The assembled figure.
    """
    # Moving-average column name -> line color (dict order fixes the trace order).
    ma_colors = {
        "MA12": "magenta",
        "MA26": "cyan",
        "MA50": "yellow",
        "MA200": "black",
    }
    for window in (12, 26, 50, 200):
        data[f"MA{window}"] = data["Close"].rolling(window=window).mean()

    # row_width is given bottom-to-top: small MACD panel, tall price panel.
    fig = make_subplots(
        rows=2,
        cols=1,
        shared_xaxes=True,
        vertical_spacing=0.02,
        subplot_titles=(f"{ticker} Candlestick", "MACD"),
        row_width=[0.2, 0.7],
    )

    # --- Top panel: candlesticks and moving averages -------------------------
    candles = go.Candlestick(
        x=data.index,
        open=data["Open"],
        high=data["High"],
        low=data["Low"],
        close=data["Close"],
        name="Candlestick",
    )
    fig.add_trace(candles, row=1, col=1)

    for ma_column, line_color in ma_colors.items():
        ma_line = go.Scatter(
            x=data.index,
            y=data[ma_column],
            line=dict(color=line_color, width=1.5),
            name=f"{ma_column} days MA",
        )
        fig.add_trace(ma_line, row=1, col=1)

    # --- Bottom panel: MACD and signal line ----------------------------------
    for column, line_color, label in (
        ("MACD", "blue", "MACD"),
        ("Signal_Line", "orange", "Signal Line"),
    ):
        indicator_line = go.Scatter(
            x=data.index,
            y=data[column],
            line=dict(color=line_color, width=2),
            name=label,
        )
        fig.add_trace(indicator_line, row=2, col=1)

    # --- Crossover markers (MACD panel first, then price panel) --------------
    bullish_rows = data[data["Crossover"] == 1]
    bearish_rows = data[data["Crossover"] == -1]
    # (rows, y-column, symbol, color, size, legend label, subplot row)
    marker_specs = (
        (bullish_rows, "MACD", "triangle-up", "green", 20,
         "Bullish Crossover (MACD) ✅", 2),
        (bearish_rows, "MACD", "triangle-down", "red", 20,
         "Bearish Crossover (MACD) 🈲", 2),
        (bullish_rows, "Close", "triangle-up", "green", 25,
         "Bullish Crossover (Close) ✅", 1),
        (bearish_rows, "Close", "triangle-down", "red", 25,
         "Bearish Crossover (Close) 🈲", 1),
    )
    for rows, y_column, symbol, color, size, label, subplot_row in marker_specs:
        markers = go.Scatter(
            mode="markers",
            x=rows.index,
            y=rows[y_column],
            marker_symbol=symbol,
            marker_color=color,
            marker_size=size,
            name=label,
        )
        fig.add_trace(markers, row=subplot_row, col=1)

    # Hide the candlestick range slider and fix the overall figure height.
    fig.update_layout(
        xaxis_rangeslider_visible=False,
        height=800,
    )
    return fig
def generate_simulated_data(data: pd.DataFrame, num_days: int) -> pd.DataFrame:
    """
    Extend historical data with simulated future values.

    For every column, the period-over-period percentage returns of the
    historical data are computed, and their mean and standard deviation are
    used to draw ``num_days`` normally distributed random returns. The
    cumulative product of ``1 + return`` is then applied to the last observed
    value of the column, so the simulated path continues from where the history
    ends. Columns are simulated independently of each other.

    Random numbers come from NumPy's global generator (``np.random.normal``),
    so results are reproducible after ``np.random.seed(...)``.

    Parameters:
        data (pandas.DataFrame): Historical numeric data on which the simulation
            is based. The index must be date-based.
        num_days (int): Number of future rows to simulate; they are dated on
            consecutive calendar days following the last index entry.

    Returns:
        pandas.DataFrame: The original historical data followed by the
        simulated future rows.

    Raises:
        IndexError: If ``data`` has no rows.
    """
    # Estimate the return distribution from *returns*, not from price levels.
    # fill_method=None leaves gaps as NaN instead of forward-filling them;
    # infinite returns (division by a zero level) are treated as missing.
    returns = data.pct_change(fill_method=None).replace([np.inf, -np.inf], np.nan)

    # With too little history the statistics are undefined (NaN); fall back to
    # zero drift / zero volatility so the column is projected flat.
    means = returns.mean().fillna(0.0)
    stds = returns.std().fillna(0.0)

    # One column of random returns per data column (loc/scale broadcast by column).
    random_returns = np.random.normal(
        loc=means.to_numpy(dtype=float),
        scale=stds.to_numpy(dtype=float),
        size=(num_days, len(data.columns)),
    )

    # Cumulative growth factors relative to the last observed value.
    factors = np.cumprod(1.0 + random_returns, axis=0)

    # Anchor the simulated path at the most recent non-missing value per column.
    last_date = data.index[-1]
    last_values = data.ffill().iloc[-1].to_numpy(dtype=float)

    future_dates = pd.date_range(
        start=last_date + pd.DateOffset(days=1), periods=num_days
    )
    future_data = pd.DataFrame(
        factors * last_values, index=future_dates, columns=data.columns
    )

    return pd.concat([data, future_data])