# Spaces:
# Sleeping
# Sleeping
# import streamlit as st | |
# import pandas as pd | |
# import numpy as np | |
# import os | |
# import ast | |
# import openai | |
# from openai import OpenAI | |
# import json | |
# from getpass import getpass | |
# from scipy.spatial.distance import cosine | |
# from tqdm import tqdm | |
# import matplotlib.pyplot as plt | |
# import financial_analysis as fa | |
# from financial_analysis import alphalens_analysis, alphalens_analysis_by_sector, calculate_information_ratio, process_sentiment_data | |
# def get_sentiment_gpt(company, SASB, news, max_retries=5, model = 'gpt-4-turbo-2024-04-09'): | |
# system_prompt = """ | |
# As a specialist in ESG analytics, | |
# You possess a deep understanding of evaluating environmental, social, and governance factors in the context of corporate news. | |
# Your expertise lies in discerning the underlying sentiment of news segments that pertain to a company's ESG practices, | |
# determining whether the coverage reflects a positive, negative, or neutral stance. | |
# """ | |
# allowed_sentiments = ['Negative', 'Positive', 'Neutral'] | |
# attempt = 0 | |
# while attempt < max_retries: | |
# main_prompt = f""" | |
# Classify the sentiment (Only options: Positive, Negative, Neutral) of the following news: {news} | | |
# The sentiment classification should be about the sections of the news talking about the company {company}. | | |
# The ESG part of the news should be around topics within the following SASB topics {SASB} | |
# The output should be a structured JSON object with the key: "sentiment". | |
# Here is the format I expect for the JSON object: | |
# {{ | |
# "sentiment": "Enter 'Positive', 'Neutral', or 'Negative'", | |
# }} | |
# Do not return any additional text or information outside of this JSON structure. | |
# """ | |
# messages = [ | |
# {"role": "system", "content": system_prompt}, | |
# {"role": "user", "content": main_prompt} | |
# ] | |
# response = openai.chat.completions.create( | |
# model=model, | |
# messages=messages, | |
# response_format={"type": "json_object"} # Enable JSON mode | |
# ) | |
# response_json = json.loads(response.choices[0].message.content) | |
# json_sentiment = response_json.get('sentiment') | |
# if json_sentiment in allowed_sentiments: | |
# return json_sentiment | |
# attempt += 1 | |
# # After max retries, if no valid sentiment is found, handle as needed (e.g., return a default sentiment) | |
# print("Failed to obtain a valid sentiment after maximum retries. Defaulting to 'Neutral'.") | |
# return 'Neutral' # Default return value if no valid sentiment is obtained | |
# def update_dataset_with_gpt_sentiment(df, model, column_name='GPT_based_sentiment'): | |
# # Initialize the new column to store GPT-based sentiment | |
# df['GPT_based_sentiment'] = None | |
# # Use tqdm to show a progress bar for the operation | |
# for index, row in tqdm(df.iterrows(), total=len(df), desc="Processing rows"): | |
# # Extract necessary information for each row | |
# company = row['Company'] # Make sure this matches your DataFrame's column name | |
# SASB = row['SASB'] # Make sure this matches your DataFrame's column name | |
# news = row['title & content'] # Make sure this matches your DataFrame's column name | |
# # Call the function to get the sentiment | |
# sentiment = get_sentiment_gpt(company, SASB, news, model=model) | |
# # Update the DataFrame with the obtained sentiment | |
# df.at[index, column_name] = sentiment # Now correctly assigns the sentiment | |
# return df | |
# # Function to get embeddings, provided by you | |
# def get_embedding(text, model="text-embedding-3-small"): | |
# client = OpenAI() | |
# text = text.replace("\n", " ") | |
# return client.embeddings.create(input=[text], model=model).data[0].embedding | |
# # Function to calculate cosine similarity | |
# def cosine_similarity(v1, v2): | |
# return 1 - cosine(v1, v2) | |
# def calculate_sasb_embeddings(sasb_str): | |
# # Safely convert the string representation of a dictionary into an actual dictionary | |
# try: | |
# sasb_dict = ast.literal_eval(sasb_str) | |
# if not isinstance(sasb_dict, dict): | |
# raise ValueError("SASB column does not contain a valid dictionary.") | |
# except ValueError as e: | |
# print(f"Error converting SASB column to dictionary: {e}") | |
# return {} | |
# sasb_embeddings = {} | |
# for topic, content in sasb_dict.items(): | |
# # Join the list of keywords into a single string | |
# combined_content = ' '.join(content) | |
# sasb_embeddings[topic] = get_embedding(combined_content) | |
# return sasb_embeddings | |
# # Function to process ESG classification | |
# def classify_esg(data): | |
# # Calculate embeddings for the news | |
# data['news_embeddings'] = data['title & content'].apply(get_embedding) | |
# # Calculate embeddings for SASB topics (you need to have your SASB topics defined) | |
# data['sasb_embeddings'] = data['SASB'].apply(calculate_sasb_embeddings) | |
# # Compute cosine similarities | |
# data['cosine_similarities'] = data.apply( | |
# lambda row: {topic: cosine_similarity(row['news_embeddings'], emb) | |
# for topic, emb in row['sasb_embeddings'].items()}, | |
# axis=1 | |
# ) | |
# # Extract max cosine similarity | |
# data['max_cosine_similarity'] = data['cosine_similarities'].apply(lambda x: max(x.values())) | |
# # Mark the top 10% of news by max_cosine_similarity within each 'Sector' as 'Yes' | |
# sector_thresholds = data.groupby('Sector')['max_cosine_similarity'].quantile(0.9).to_dict() | |
# data['ESG_relevance'] = data.apply( | |
# lambda row: 'Yes' if row['max_cosine_similarity'] >= sector_thresholds[row['Sector']] else 'No', | |
# axis=1 | |
# ) | |
# return data | |
# def main(): | |
# st.set_page_config(page_title="NLP ESG Project", page_icon="π") | |
# # Custom styles | |
# st.markdown( | |
# """ | |
# <style> | |
# .streamlit-container { | |
# background-color: #F5F5F5; | |
# } | |
# .stButton>button { | |
# width: 100%; | |
# border-radius: 10px; | |
# border: none; | |
# margin: 10px 0; | |
# padding: 15px 20px; | |
# background-color: #79AEC8; | |
# color: white; | |
# font-size: 18px; | |
# } | |
# .stButton>button:hover { | |
# background-color: #6699CC; | |
# } | |
# </style> | |
# """, | |
# unsafe_allow_html=True, | |
# ) | |
# # Header section | |
# st.write("# NLP Project: ESG News Analysis and Financial Impact") | |
# st.sidebar.write("## Configuration") | |
# # API Key input | |
# openai_api_key = st.sidebar.text_input("Enter your OpenAI API key", type="password") | |
# openai_api_key = os.getenv('OPENAI_API_KEY') | |
# os.environ["OPENAI_API_KEY"] = openai_api_key | |
# openai.api_key = openai_api_key | |
# # File Upload | |
# st.sidebar.write("## Upload Data") | |
# uploaded_file = st.sidebar.file_uploader("", type="csv") | |
# # Investment Strategy Slider | |
# st.sidebar.markdown("### Investment Strategy") | |
# investment_strategy = st.sidebar.slider( | |
# "Investment Strategy", | |
# min_value=0.0, max_value=1.0, value=0.5, step=0.01, | |
# format="", | |
# help="0 is Conservative, 1 is Aggressive", | |
# label_visibility="collapsed" | |
# ) | |
# st.sidebar.text(f"Current Strategy: {'Conservative' if investment_strategy <= 0.5 else 'Aggressive'}") | |
# # Main container | |
# st.sidebar.write("## Upload Data") | |
# uploaded_file = st.sidebar.file_uploader("Please upload a CSV file", type="csv", label_visibility="collapsed") | |
# if uploaded_file: | |
# # Displaying the file | |
# data = pd.read_csv(uploaded_file) | |
# st.session_state.classified_data = classify_esg(data) | |
# st.write("### Uploaded News Data:") | |
# st.dataframe(data, use_container_width=True) | |
# if st.button("π Classify ESG"): | |
# st.write("Classifying ESG-related news...") | |
# try: | |
# with st.spinner("Calculating embeddings and similarities..."): | |
# st.session_state.classified_data = classify_esg(st.session_state.classified_data) | |
# st.write("Classified News Data:") | |
# st.dataframe(st.session_state.classified_data, use_container_width=True) | |
# except Exception as e: | |
# st.error(f"An error occurred: {e}") | |
# if st.button("π Determine Sentiment"): | |
# st.write("Determining sentiment using GPT...") | |
# # Run sentiment analysis with GPT | |
# try: | |
# with st.spinner("Analyzing sentiment..."): | |
# # Assume you have your API key set and a function defined to handle sentiment analysis | |
# st.session_state.updated_data = update_dataset_with_gpt_sentiment(st.session_state.classified_data, model='gpt-4-turbo-2024-04-09') | |
# st.write("News with GPT-based Sentiment Analysis:") | |
# st.dataframe(st.session_state.updated_data, use_container_width=True) | |
# except Exception as e: | |
# st.error(f"An error occurred: {e}") | |
# if st.button("π Alphalens Analysis"): | |
# # process_sentiment_data(sentiment_data = 'finbert_sentiment.csv', sector_ticker = 'sector_ticker.csv', prices = 'prices.csv') | |
# prices = pd.read_csv('prices.csv') | |
# factor_data = pd.read_csv('factor_data.csv') | |
# merged_data = pd.read_csv('merged_data.csv') | |
# alphalens_analysis(merged_data, prices) | |
# # Expander for advanced settings | |
# with st.expander("Advanced Settings"): | |
# st.write("Any advanced settings and configurations will go here.") | |
# if __name__ == "__main__": | |
# main() | |
import ast
import json
import os
from getpass import getpass

import matplotlib.pyplot as plt
import numpy as np
import openai
import pandas as pd
import streamlit as st
from openai import OpenAI
from scipy.spatial.distance import cosine
from tqdm import tqdm
def get_sentiment_gpt(company, SASB, news, max_retries=5, model='gpt-4-turbo-2024-04-09'):
    """Ask a chat model to classify the ESG sentiment of a news item.

    Args:
        company: Company name interpolated into the prompt.
        SASB: SASB topics interpolated into the prompt as-is.
        news: News text to classify.
        max_retries: Maximum number of requests to send. Zero or a negative
            value sends none and yields the default.
        model: Chat model name passed to the API.

    Returns:
        'Positive', 'Negative' or 'Neutral'. Falls back to 'Neutral' when no
        attempt produced one of those values.

    Errors raised by the API call itself are not caught here.
    """
    system_prompt = """
    As a specialist in ESG analytics,
    You possess a deep understanding of evaluating environmental, social, and governance factors in the context of corporate news.
    Your expertise lies in discerning the underlying sentiment of news segments that pertain to a company's ESG practices,
    determining whether the coverage reflects a positive, negative, or neutral stance.
    """
    # The prompt does not change between attempts, so build it once.
    main_prompt = f"""
    Classify the sentiment (Only options: Positive, Negative, Neutral) of the following news: {news} |
    The sentiment classification should be about the sections of the news talking about the company {company}. |
    The ESG part of the news should be around topics within the following SASB topics {SASB}
    The output should be a structured JSON object with the key: "sentiment".
    Here is the format I expect for the JSON object:
    {{
    "sentiment": "Enter 'Positive', 'Neutral', or 'Negative'",
    }}
    Do not return any additional text or information outside of this JSON structure.
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": main_prompt},
    ]
    allowed_sentiments = ('Negative', 'Positive', 'Neutral')

    for _ in range(max_retries):
        response = openai.chat.completions.create(
            model=model,
            messages=messages,
            response_format={"type": "json_object"},  # Enable JSON mode
        )
        try:
            response_json = json.loads(response.choices[0].message.content)
        except (TypeError, ValueError):
            # Empty (None) or malformed content: spend a retry instead of
            # aborting the whole classification run.
            continue
        if not isinstance(response_json, dict):
            # Valid JSON but not an object, so there is no "sentiment" key to read.
            continue
        json_sentiment = response_json.get('sentiment')
        if json_sentiment in allowed_sentiments:
            return json_sentiment

    # No attempt produced a valid label; fall back to a neutral default.
    print("Failed to obtain a valid sentiment after maximum retries. Defaulting to 'Neutral'.")
    return 'Neutral'
def update_dataset_with_gpt_sentiment(df, model, column_name='GPT_based_sentiment'):
    """Add a GPT-derived sentiment column to ``df`` and return it.

    Args:
        df: DataFrame with 'Company', 'SASB' and 'title & content' columns.
            It is modified in place.
        model: Chat model name forwarded to ``get_sentiment_gpt``.
        column_name: Name of the column that receives the sentiments.

    Returns:
        The same DataFrame object, with ``column_name`` filled in.

    The column is written only after every row has been processed, so a
    failure part-way through leaves ``df`` without a half-filled column.
    """
    rows = zip(df['Company'], df['SASB'], df['title & content'])
    # tqdm shows a progress bar; total is given because zip has no length.
    sentiments = [
        get_sentiment_gpt(company, sasb, news, model=model)
        for company, sasb, news in tqdm(rows, total=len(df), desc="Processing rows")
    ]
    # Positional list assignment honours column_name and stays correct when
    # the index has duplicate labels (label-based .at would hit every match).
    df[column_name] = sentiments
    return df
def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding vector of ``text`` from the OpenAI embeddings API.

    Args:
        text: String to embed. Newlines are replaced by spaces before sending.
        model: Embedding model name.

    Returns:
        The ``embedding`` of the first item in the API response.

    Raises:
        TypeError: If ``text`` is not a string (for example a missing CSV cell).
    """
    if not isinstance(text, str):
        # Fail with a clear message instead of an AttributeError on .replace().
        raise TypeError(
            f"get_embedding expects a string, got {type(text).__name__}"
        )
    client = OpenAI()
    single_line = text.replace("\n", " ")
    response = client.embeddings.create(input=[single_line], model=model)
    return response.data[0].embedding
def cosine_similarity(v1, v2):
    """Return the cosine similarity of two vectors.

    Computed as one minus SciPy's cosine distance.
    """
    distance = cosine(v1, v2)
    return 1 - distance
def calculate_sasb_embeddings(sasb_str):
    """Embed every topic of a SASB dictionary given as a string literal.

    Args:
        sasb_str: String representation of a dict mapping each topic to its
            keywords (a list of keywords or a single string).

    Returns:
        A dict mapping each topic to the embedding of its keywords. An empty
        dict is returned, after printing a message, when ``sasb_str`` cannot
        be parsed into a dict.
    """
    try:
        sasb_dict = ast.literal_eval(sasb_str)
        if not isinstance(sasb_dict, dict):
            raise ValueError("SASB column does not contain a valid dictionary.")
    except (ValueError, SyntaxError) as e:
        # literal_eval raises SyntaxError, not ValueError, for text that is
        # not valid Python syntax.
        print(f"Error converting SASB column to dictionary: {e}")
        return {}

    sasb_embeddings = {}
    for topic, content in sasb_dict.items():
        if isinstance(content, str):
            # A bare string is already the text to embed; joining it would
            # put a space between every character.
            combined_content = content
        else:
            # Join the keywords into one string, tolerating non-string items.
            combined_content = ' '.join(str(keyword) for keyword in content)
        sasb_embeddings[topic] = get_embedding(combined_content)
    return sasb_embeddings
def classify_esg(data):
    """Score each news row against its SASB topics and flag ESG relevance.

    Adds these columns to ``data`` (modified in place) and returns it:
    'news_embeddings', 'sasb_embeddings', 'cosine_similarities',
    'max_cosine_similarity' and 'ESG_relevance'.

    Args:
        data: DataFrame with 'title & content', 'SASB' and 'Sector' columns.

    Returns:
        The same DataFrame. 'ESG_relevance' is 'Yes' for rows whose best
        topic similarity reaches the 0.9 quantile of their sector, else 'No'.
    """
    # Calculate embeddings for the news
    data['news_embeddings'] = data['title & content'].apply(get_embedding)

    # Identical SASB strings give identical topic embeddings, so embed each
    # distinct string once. Rows with the same string share one dict object.
    sasb_cache = {}

    def _sasb_embeddings_for(raw):
        if not isinstance(raw, str):
            # Non-strings (e.g. NaN) may not be hashable keys; do not cache.
            return calculate_sasb_embeddings(raw)
        if raw not in sasb_cache:
            sasb_cache[raw] = calculate_sasb_embeddings(raw)
        return sasb_cache[raw]

    data['sasb_embeddings'] = data['SASB'].apply(_sasb_embeddings_for)

    # Compute the cosine similarity between the news and every SASB topic
    data['cosine_similarities'] = [
        {topic: cosine_similarity(news_emb, emb) for topic, emb in topic_embs.items()}
        for news_emb, topic_embs in zip(data['news_embeddings'], data['sasb_embeddings'])
    ]

    # Best similarity per row; NaN when the row has no topics (SASB parse
    # failed), so one bad row no longer aborts the whole run.
    best = [
        max(sims.values(), default=float('nan'))
        for sims in data['cosine_similarities']
    ]
    data['max_cosine_similarity'] = pd.Series(best, index=data.index, dtype=float)

    # Mark the top 10% of news by max_cosine_similarity within each 'Sector'.
    # transform() aligns the sector threshold to each row; rows with a
    # missing sector get NaN, which compares False and yields 'No'.
    thresholds = data.groupby('Sector')['max_cosine_similarity'].transform(
        lambda scores: scores.quantile(0.9)
    )
    data['ESG_relevance'] = np.where(
        data['max_cosine_similarity'] >= thresholds, 'Yes', 'No'
    )
    return data
def main():
    """Render the Streamlit page for ESG news classification and sentiment.

    The sidebar collects the OpenAI API key, the CSV upload and a strategy
    slider. The main area shows the uploaded data and three action buttons.
    Results are kept in ``st.session_state`` so they survive the script
    rerun that Streamlit performs on every widget interaction.
    """
    # NOTE(review): "π" here and in the button labels looks like a
    # mis-decoded emoji; left unchanged — confirm the intended icons.
    st.set_page_config(page_title="NLP ESG Project", page_icon="π")

    # Custom styles
    st.markdown(
        """
        <style>
        .streamlit-container {
        background-color: #F5F5F5;
        }
        .stButton>button {
        width: 100%;
        border-radius: 10px;
        border: none;
        margin: 10px 0;
        padding: 15px 20px;
        background-color: #79AEC8;
        color: white;
        font-size: 18px;
        }
        .stButton>button:hover {
        background-color: #6699CC;
        }
        </style>
        """,
        unsafe_allow_html=True,
    )

    # Header section
    st.write("# NLP Project: ESG News Analysis and Financial Impact")
    st.sidebar.write("## Configuration")

    # API Key input. Only override the environment when a key was typed, so
    # an empty field does not blank out a key configured on the host.
    openai_api_key = st.sidebar.text_input("Enter your OpenAI API key", type="password")
    if openai_api_key:
        os.environ["OPENAI_API_KEY"] = openai_api_key
        openai.api_key = openai_api_key

    # File Upload (non-empty label, hidden visually, instead of "")
    st.sidebar.write("## Upload Data")
    uploaded_file = st.sidebar.file_uploader(
        "Please upload a CSV file", type="csv", label_visibility="collapsed"
    )

    # Investment Strategy Slider
    st.sidebar.markdown("### Investment Strategy")
    investment_strategy = st.sidebar.slider(
        "Investment Strategy",
        min_value=0.0, max_value=1.0, value=0.5, step=0.01,
        format="",
        help="0 is Conservative, 1 is Aggressive",
        label_visibility="collapsed"
    )
    st.sidebar.text(f"Current Strategy: {'Conservative' if investment_strategy <= 0.5 else 'Aggressive'}")

    # Main container
    if uploaded_file:
        # Parse the CSV once per distinct upload. The script reruns on every
        # interaction, and results from an earlier file must not linger.
        signature = (uploaded_file.name, uploaded_file.size)
        if st.session_state.get("uploaded_signature") != signature:
            st.session_state.uploaded_signature = signature
            st.session_state.uploaded_data = pd.read_csv(uploaded_file)
            for stale_key in ("classified_data", "updated_data"):
                if stale_key in st.session_state:
                    del st.session_state[stale_key]
        st.write("### Uploaded News Data:")
        st.dataframe(st.session_state.uploaded_data, use_container_width=True)

    if st.button("π Classify ESG"):
        if "uploaded_data" not in st.session_state:
            st.warning("Please upload a CSV file before classifying.")
        else:
            st.write("Classifying ESG-related news...")
            try:
                with st.spinner("Calculating embeddings and similarities..."):
                    # Classification calls the embeddings API, so it runs
                    # only on click, and on a copy to keep the upload pristine.
                    st.session_state.classified_data = classify_esg(
                        st.session_state.uploaded_data.copy()
                    )
                st.write("Classified News Data:")
                st.dataframe(st.session_state.classified_data, use_container_width=True)
            except Exception as e:
                st.error(f"An error occurred: {e}")

    if st.button("π Determine Sentiment"):
        # Sentiment needs only the raw columns, so fall back to the upload
        # when classification has not been run yet.
        source = st.session_state.get(
            "classified_data", st.session_state.get("uploaded_data")
        )
        if source is None:
            st.warning("Please upload a CSV file before determining sentiment.")
        else:
            st.write("Determining sentiment using GPT...")
            # Run sentiment analysis with GPT
            try:
                with st.spinner("Analyzing sentiment..."):
                    st.session_state.updated_data = update_dataset_with_gpt_sentiment(
                        source.copy(), model='gpt-4-turbo-2024-04-09'
                    )
                st.write("News with GPT-based Sentiment Analysis:")
                st.dataframe(st.session_state.updated_data, use_container_width=True)
            except Exception as e:
                st.error(f"An error occurred: {e}")

    if st.button("π Alphalens Analysis"):
        st.write("Alphalens analysis will be here")  # placeholder

    # Expander for advanced settings
    with st.expander("Advanced Settings"):
        st.write("Any advanced settings and configurations will go here.")
# Run the app only when executed as a script, not when imported.
if __name__ == "__main__":
    main()