import pandas as pd
import numpy as np
import yfinance as yf
import ast
from scipy import stats
from datetime import datetime, timedelta
import pytz
import pandas_market_calendars as mcal
import alphalens as al
import matplotlib.pyplot as plt


def sentiment_to_numerical(sentiment):
    # Map FinBERT class labels to a signed numerical score.
    mapping = {'Negative': -1, 'Positive': 1, 'Neutral': 0}
    return sentiment.map(mapping)
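
# Example (illustration): sentiment_to_numerical(pd.Series(['Positive', 'Neutral', 'Negative']))
# yields the numerical scores [1, 0, -1].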


def process_sentiment_data(sentiment_data='finbert_sentiment.csv', sector_ticker='sector_ticker.csv', prices='prices.csv'):
    columns_to_load = ['Ticker', 'pubDate', 'finbert_output']
    df = pd.read_csv(sentiment_data, usecols=columns_to_load)
    df.rename(columns={'Publication Date': 'pubDate', 'finbert_output': 'Sentiment'}, inplace=True)

    # Adjust the dates of news articles: anything published at or after the 9:30 ET open is
    # attributed to the next trading day's open; pre-open news keeps its own day if it is a
    # trading day and otherwise also rolls forward to the next trading day.
    nyse = mcal.get_calendar('NYSE')
    trading_start_hour = 9
    trading_start_minute = 30
    trading_end_hour = 16
    trading_end_minute = 0

    def adjust_date(pub_date):
        if pd.isnull(pub_date) or not isinstance(pub_date, pd.Timestamp):
            return pub_date
        # Compare against the session open in exchange time (assumes the timestamps are
        # timezone-aware, which the pd.to_datetime call below guarantees).
        local_time = pub_date.tz_convert('America/New_York') if pub_date.tzinfo is not None else pub_date
        trading_start_time = pd.Timestamp(f'{local_time.date()} {trading_start_hour}:{trading_start_minute}', tz=local_time.tz)
        if local_time >= trading_start_time:
            next_trading_day = nyse.schedule(start_date=local_time.date() + pd.DateOffset(days=1),
                                             end_date=local_time.date() + pd.DateOffset(days=10)).iloc[0]['market_open']
            return next_trading_day
        else:
            valid_days = nyse.valid_days(start_date=local_time.date(), end_date=local_time.date())
            if not valid_days.empty and local_time.date() == valid_days[0].date():
                return pub_date
            else:
                next_trading_day = nyse.schedule(start_date=local_time.date() + pd.DateOffset(days=1),
                                                 end_date=local_time.date() + pd.DateOffset(days=10)).iloc[0]['market_open']
                return next_trading_day

    # Parse the publication timestamps before adjusting them; otherwise adjust_date receives
    # raw strings and returns them unchanged.
    df['pubDate'] = pd.to_datetime(df['pubDate'], utc=True, format='ISO8601')
    df['pubDate'] = df['pubDate'].apply(adjust_date)

    # Convert the three class probabilities into a single score in [-1, 1].
    def convert_sentiment_to_score(sentiment):
        predicted_sentiment_probabilities = {}
        components = sentiment.split(', ')
        for component in components:
            key_value = component.split(':')
            if len(key_value) == 2:
                key, value = key_value
                key = key.strip(" '{}").capitalize()
                try:
                    # Strip quotes and braces as well so the final "...}" entry parses.
                    value = float(value.strip(" '{}"))
                except ValueError:
                    continue
                predicted_sentiment_probabilities[key] = value
        positive = predicted_sentiment_probabilities.get('Positive', 0)
        negative = predicted_sentiment_probabilities.get('Negative', 0)
        neutral = predicted_sentiment_probabilities.get('Neutral', 0)
        # Net positive-minus-negative probability, damped by the neutral probability.
        sentiment_score = (positive - negative) / (1 + neutral)
        return sentiment_score
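
    # Worked example (illustration): for a finbert_output string like
    # "{'positive': 0.7, 'negative': 0.1, 'neutral': 0.2}" the score is
    # (0.7 - 0.1) / (1 + 0.2) = 0.5; a large neutral probability pulls the score toward 0.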
    df['Sentiment_Score_2'] = df['Sentiment'].apply(convert_sentiment_to_score)

    # Reduce the adjusted timestamps to calendar dates (the column already holds
    # timezone-aware timestamps at this point, so no format string is needed).
    df['pubDate'] = pd.to_datetime(df['pubDate'], utc=True)
    df['pubDate'] = df['pubDate'].dt.date

    # Map tickers that are no longer valid to replacement symbols, and drop SBNY.
    replacements = {
        'ATVI': 'ATVIX',
        'ABC': 'ABG',
        'FBHS': 'FBIN',
        'FISV': 'FI',
        'FRC': 'FRCB',
        'NLOK': 'SYM.MU',
        'PKI': 'PKN.SG',
        'RE': 'EG',
        'SIVB': 'SIVBQ',
    }
    df['Ticker'] = df['Ticker'].replace(replacements)
    df = df[df['Ticker'] != 'SBNY']
    # Average the sentiment scores per ticker per day and align them with the price data.
    aggregated_data = df.groupby(['Ticker', 'pubDate'])['Sentiment_Score_2'].mean().reset_index()
    aggregated_data['pubDate'] = pd.to_datetime(aggregated_data['pubDate']).dt.tz_localize('UTC')
    aggregated_data.set_index(['pubDate', 'Ticker'], inplace=True)
    # prices.csv is expected to carry a UTC-aware DatetimeIndex so that its dates line up
    # with the UTC-localized pubDate index above.
    prices = pd.read_csv(prices, index_col=0, parse_dates=True)

    # Equal-weighted benchmark: the cross-sectional mean of next-day returns, plus the mean
    # 5/10/20-day forward returns scaled to a per-day figure for the longer horizons.
    equal_weighted_benchmark = prices.pct_change(periods=1).shift(periods=-1).mean(axis=1)
    equal_weighted_benchmark_df = equal_weighted_benchmark.reset_index()
    equal_weighted_benchmark_df.columns = ['date', 'equal_weighted_benchmark']
    returns_5d = prices.pct_change(periods=5).shift(periods=-5) / 5
    returns_10d = prices.pct_change(periods=10).shift(periods=-10) / 10
    returns_20d = prices.pct_change(periods=20).shift(periods=-20) / 20
    mean_5d = returns_5d.mean(axis=1).reset_index()
    mean_10d = returns_10d.mean(axis=1).reset_index()
    mean_20d = returns_20d.mean(axis=1).reset_index()
    mean_5d.columns = ['date', '5d_mean_return']
    mean_10d.columns = ['date', '10d_mean_return']
    mean_20d.columns = ['date', '20d_mean_return']
    equal_weighted_benchmark_df = equal_weighted_benchmark_df.merge(mean_5d, on='date', how='left')
    equal_weighted_benchmark_df = equal_weighted_benchmark_df.merge(mean_10d, on='date', how='left')
    equal_weighted_benchmark_df = equal_weighted_benchmark_df.merge(mean_20d, on='date', how='left')
    # Restrict the benchmark to the dates covered by the sentiment data.
    cut_date_min = aggregated_data.index.get_level_values('pubDate').min()
    cut_date_max = aggregated_data.index.get_level_values('pubDate').max()
    equal_weighted_benchmark_df = equal_weighted_benchmark_df[equal_weighted_benchmark_df.date >= cut_date_min]
    equal_weighted_benchmark_df = equal_weighted_benchmark_df[equal_weighted_benchmark_df.date <= cut_date_max]
    # Build a complete (Ticker, Date) panel over the price calendar, then attach the
    # sentiment scores and each ticker's sector.
    tickers = aggregated_data.index.get_level_values('Ticker').unique()
    start_date = aggregated_data.index.get_level_values('pubDate').min() - pd.Timedelta(days=30)
    end_date = aggregated_data.index.get_level_values('pubDate').max() + pd.Timedelta(days=30)
    all_dates = prices.loc[cut_date_min:cut_date_max].index
    all_tickers_dates = pd.MultiIndex.from_product([tickers, all_dates], names=['Ticker', 'Date'])
    all_tickers_dates_df = pd.DataFrame(index=all_tickers_dates).reset_index()
    aggregated_data_reset = aggregated_data.reset_index()
    merged_data = pd.merge(all_tickers_dates_df, aggregated_data_reset, how='left', left_on=['Ticker', 'Date'], right_on=['Ticker', 'pubDate'])
    sector_data = pd.read_excel('scraping.xlsx', usecols=['Ticker', 'Sector'])
    merged_data = merged_data.reset_index()
    merged_data = pd.merge(merged_data, sector_data, how='left', left_on='Ticker', right_on='Ticker')
    # Carry the last observed sentiment score forward on days without news, shrinking it by
    # a constant decay factor per trading day.
    decay_factor = 0.7
    for ticker in tickers:
        ticker_data = merged_data[merged_data['Ticker'] == ticker].copy()
        original_nans = ticker_data['Sentiment_Score_2'].isna()
        ticker_data['Sentiment_Score_2'] = ticker_data['Sentiment_Score_2'].ffill()
        for i in range(1, len(ticker_data)):
            if original_nans.iloc[i]:
                ticker_data.iloc[i, ticker_data.columns.get_loc('Sentiment_Score_2')] = ticker_data.iloc[i - 1, ticker_data.columns.get_loc('Sentiment_Score_2')] * decay_factor
        merged_data.loc[merged_data['Ticker'] == ticker, 'Sentiment_Score_2'] = ticker_data['Sentiment_Score_2']
    # Days before a ticker's first article keep a neutral score of 0.
    merged_data['Sentiment_Score_2'] = merged_data['Sentiment_Score_2'].fillna(0)
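    # Illustration of the decay: a score of 0.5 observed on day t is carried forward as
    # 0.5 * 0.7 = 0.35 on t+1, 0.35 * 0.7 = 0.245 on t+2, and so on until new articles
    # arrive for that ticker.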
    merged_data.drop(columns=['pubDate'], inplace=True)
    merged_data.set_index(['Date', 'Ticker'], inplace=True)
    return merged_data, prices, equal_weighted_benchmark_df


# Alphalens
def alphalens_analysis(merged_data, prices):
    # Bucket the sentiment factor into quintiles, compute 1/5/10/20-day forward returns,
    # and render the Alphalens returns tear sheet for a long-short, non-group-neutral factor.
    factor_data = al.utils.get_clean_factor_and_forward_returns(
        factor=merged_data['Sentiment_Score_2'],
        prices=prices,
        binning_by_group=False,
        bins=None,
        quantiles=5,
        periods=(1, 5, 10, 20),
        groupby=merged_data['Sector'],
    )
    al.tears.create_returns_tear_sheet(factor_data, long_short=True, group_neutral=False)
    return factor_data


def alphalens_analysis_by_sector(factor_data):
    # Mean period returns by factor quantile within each sector.
    mean_return_by_qt, std_err_by_qt = al.performance.mean_return_by_quantile(factor_data, by_group=True)
    al.plotting.plot_quantile_returns_bar(mean_return_by_qt, by_group=True)


def calculate_information_ratio(factor_data, equal_weighted_benchmark_df):
    # Merge the factor data with the benchmark data; reset the (date, asset) MultiIndex
    # first so that 'date' is available both as a merge key and as a column below.
    factor_data = factor_data.reset_index().merge(equal_weighted_benchmark_df, on='date', how='left')
    # Calculate excess returns over the equal-weighted benchmark for each holding period
    factor_data['excess_return_1D'] = factor_data['1D'] - factor_data['equal_weighted_benchmark']
    factor_data['excess_return_5D'] = factor_data['5D'] - factor_data['5d_mean_return']
    factor_data['excess_return_10D'] = factor_data['10D'] - factor_data['10d_mean_return']
    factor_data['excess_return_20D'] = factor_data['20D'] - factor_data['20d_mean_return']
    # Initialize a DataFrame to store the information-ratio results
    results = pd.DataFrame(index=range(1, 6), columns=['IR 1D', 'IR 5D', 'IR 10D', 'IR 20D'])
    # Annualized IR for each quantile and holding period: mean daily excess return divided
    # by its standard deviation, scaled by sqrt(252)
    for quantile in range(1, 6):
        for period in [1, 5, 10, 20]:
            column_name = f'excess_return_{period}D'
            tmp = factor_data[factor_data.factor_quantile == quantile][['date', column_name]].groupby('date').mean()
            ir = np.mean(tmp) / np.std(tmp) * np.sqrt(252)
            results.at[quantile, f'IR {period}D'] = ir.values[0]
    from IPython.display import display
    display(results.style.format("{:.3f}"))
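

# ---------------------------------------------------------------------------
# Usage sketch (illustrative driver, not part of the original pipeline); it assumes
# 'finbert_sentiment.csv', 'prices.csv', and 'scraping.xlsx' are present in the
# working directory:
#
#     merged_data, prices_df, benchmark_df = process_sentiment_data()
#     factor_data = alphalens_analysis(merged_data, prices_df)
#     alphalens_analysis_by_sector(factor_data)
#     calculate_information_ratio(factor_data, benchmark_df)
# ---------------------------------------------------------------------------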