import streamlit as st
import re
import requests
from newspaper import Article
from newspaper import Config
import preprocessor as p
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import numpy as np
import torch.nn.functional as F
from goose3 import Goose
from goose3.configuration import Configuration
from bs4 import BeautifulSoup

st.write("""
# ESG Prediction App

This is a Proof of Concept for a company ESG (Environmental, Social, and Governance) risk prediction application.
""")

company = st.text_input("Company", placeholder="PT Adaro Minerals Indonesia Tbk")
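
# Pipeline: search Google Custom Search for news about the company, scrape each article's
# text, classify its ESG category and sentiment with transformer models, then aggregate
# the per-category sentiment counts into a 0-60 ESG risk score and a risk band.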

GOOGLE = 'https://www.google.com/search'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36'}

API_KEY = 'AIzaSyDCfIltnvAQ3lvpovRXydRMhGQ-VxkboQ4'
SEARCH_ENGINE_ID = 'e586ee8a6c7e64d7b'
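# Hard-coded credentials are only for this proof of concept; in a deployed Streamlit app
# they would typically come from st.secrets, e.g. (assuming the keys are defined in
# .streamlit/secrets.toml):
# API_KEY = st.secrets["GOOGLE_API_KEY"]
# SEARCH_ENGINE_ID = st.secrets["SEARCH_ENGINE_ID"]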

from googleapiclient.discovery import build
import math


def google_search(search_term, api_key, cse_id, **kwargs):
    """Query the Google Custom Search API, paginating 10 results per call."""
    service = build("customsearch", "v1", developerKey=api_key)

    num_search_results = kwargs['num']
    if num_search_results > 100:
        raise NotImplementedError('Google Custom Search API supports a maximum of 100 results')
    elif num_search_results > 10:
        kwargs['num'] = 10  # the API returns at most 10 items per call
        calls_to_make = math.ceil(num_search_results / 10)
    else:
        calls_to_make = 1

    kwargs['start'] = start_item = 1
    items_to_return = []
    while calls_to_make > 0:
        res = service.cse().list(q=search_term, cx=cse_id, **kwargs).execute()
        items_to_return.extend(res.get('items', []))
        calls_to_make -= 1
        start_item += 10
        kwargs['start'] = start_item
        leftover = num_search_results - start_item + 1
        if 0 < leftover < 10:
            kwargs['num'] = leftover

    return items_to_return
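
# For example, google_search(company, API_KEY, SEARCH_ENGINE_ID, num=50) issues five
# paginated calls (start=1, 11, 21, 31, 41) of 10 results each and returns up to 50 items.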

if company:
    links = []
    news_text = []

    query = f'{company}'
    response = google_search(query, API_KEY, SEARCH_ENGINE_ID, num=50)

    url_collection = [item['link'] for item in response]

    import os
    os.environ['ST_USER_AGENT'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'

    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'
    config = Config()
    config.browser_user_agent = user_agent
    config.request_timeout = 60
    config.fetch_images = False
    config.memoize_articles = True
    config.language = 'id'
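
    # This Config mirrors newspaper3k usage; with newspaper it would typically be passed
    # as Article(url, config=config). The scraping loop below fetches pages with requests
    # and extracts body text with Goose.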

    def cleaner(text):
        """Strip @mentions, hashtag/underscore characters, and newlines from scraped text."""
        text = re.sub("@[A-Za-z0-9]+", "", text)
        text = text.replace("#", "").replace("_", "")
        text = text.strip().replace("\n", "")
        return text
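
    # Example: cleaner("@user12 mining permit update\n") -> "mining permit update"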

    for url in url_collection:
        if "http" not in url:
            continue

        # Default to Indonesian; a few outlets in the results publish in English.
        lang = "id"
        if any(domain in url for domain in ("eco-business.com", "thejakartapost.com", "marketforces.org.au", "jakartaglobe.id")):
            lang = "en"

        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
        try:
            response = requests.get(url, headers=headers, timeout=30)
        except requests.RequestException:
            continue  # skip URLs that cannot be fetched

        soup = BeautifulSoup(response.content, 'html.parser')

        g = Goose()
        article = g.extract(raw_html=str(soup))

        # Fallback container classes used by several Indonesian news sites, plus
        # boilerplate strings (comment disclaimers, notification prompts) to discard.
        possible_class = ['detail', 'body-content', 'article-content', 'detail-konten', 'DetailBlock']
        excluded_sentence = ['Komentar menjadi tanggung-jawab Anda sesuai UU ITE', 'Dapatkan berita terbaru dari kami Ikuti langkah ini untuk mendapatkan notifikasi:']

        if not article.cleaned_text:
            # Goose found no body text; fall back to a direct div lookup.
            article_content = soup.find('div', class_=possible_class)
            if article_content and article_content.get_text() not in excluded_sentence:
                news_text.append(article_content.get_text())
                news_empty = False
        else:
            if article.cleaned_text not in excluded_sentence:
                news_text.append(article.cleaned_text)
                news_empty = False

    # Collect the scraped article texts into a dataframe for classification.
    df = pd.DataFrame({
        'news': news_text
    })

    # IndoBERT-based ESG category classifier and a DistilBERT sentiment classifier.
    tokenizer_esg = AutoTokenizer.from_pretrained("didev007/ESG-indobert-model")
    model_esg = AutoModelForSequenceClassification.from_pretrained("didev007/ESG-indobert-model")

    tokenizer_sentiment = AutoTokenizer.from_pretrained("adhityaprimandhika/distillbert_sentiment_analysis")
    model_sentiment = AutoModelForSequenceClassification.from_pretrained("adhityaprimandhika/distillbert_sentiment_analysis")
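
    # Loading the checkpoints on every rerun is slow; in Streamlit this would usually be
    # wrapped in a cached loader (a sketch, assuming the same model names):
    # @st.cache_resource
    # def load_models():
    #     return (AutoTokenizer.from_pretrained("didev007/ESG-indobert-model"),
    #             AutoModelForSequenceClassification.from_pretrained("didev007/ESG-indobert-model"))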

    def get_chunk_weights(num_chunks):
        """Gaussian weights centred on the middle chunk, normalised to sum to 1."""
        center = num_chunks / 2
        sigma = num_chunks / 4
        weights = [np.exp(-0.5 * ((i - center) / sigma) ** 2) for i in range(num_chunks)]
        weights = np.array(weights)
        return weights / weights.sum()
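
    # For example, with 4 chunks the normalised weights are roughly
    # [0.06, 0.26, 0.43, 0.26], so chunks near the middle of an article count most.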

    def tokenize_and_chunk(text, tokenizer, chunk_size=512):
        """Tokenize the full text (no truncation) and split the ids into fixed-size chunks."""
        inputs = tokenizer(text, return_tensors="pt", truncation=False, padding=True)
        input_ids = inputs['input_ids'][0]
        chunks = [input_ids[i:i+chunk_size] for i in range(0, len(input_ids), chunk_size)]
        return chunks
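
    # A ~1,300-token article, for instance, yields three chunks of 512, 512, and 276 ids,
    # each short enough for the 512-token BERT context window.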

    def esg_category(chunks, model):
        """Classify each chunk and return the weighted-average ESG label for the article."""
        num_chunks = len(chunks)
        weights = get_chunk_weights(num_chunks)

        esg_scores = np.zeros(4)
        labels = ["none", "E", "S", "G"]

        for i, chunk in enumerate(chunks):
            inputs = {'input_ids': chunk.unsqueeze(0)}
            outputs = model(**inputs)
            logits = outputs.logits
            probs = F.softmax(logits, dim=1).detach().numpy()[0]
            esg_scores += weights[i] * probs

        predicted_class = esg_scores.argmax()
        aggregated_esg = labels[predicted_class]
        return aggregated_esg
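
    # In effect the article-level score is score_c = sum_i w_i * softmax(logits_i)_c for each
    # class c in {none, E, S, G}, and the argmax over c is reported.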

    def sentiment_analysis(text, tokenizer, model):
        """Return the sentiment label for a text, truncated to the model's maximum length."""
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class = torch.argmax(logits, dim=1).item()
        labels = ["positive", "neutral", "negative"]
        predicted_sentiment = labels[predicted_class]
        return predicted_sentiment

    def apply_model_to_dataframe(df, tokenizer_esg, model_esg, tokenizer_sentiment, model_sentiment, text_column='news'):
        """Add 'aggregated_esg' and 'sentiment' columns to df, one prediction per article."""
        esg_categories = []
        sentiments = []
        for text in df[text_column]:
            if isinstance(text, str):
                chunks = tokenize_and_chunk(text, tokenizer_esg)
                esg = esg_category(chunks, model_esg)
                sentiment = sentiment_analysis(text, tokenizer_sentiment, model_sentiment)
                esg_categories.append(esg)
                sentiments.append(sentiment)
            else:
                esg_categories.append("none")
                sentiments.append("neutral")

        df['aggregated_esg'] = esg_categories
        df['sentiment'] = sentiments
        return df
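
    # Each article row then carries its text, an ESG class ("none", "E", "S", or "G"),
    # and a sentiment label, e.g. ("...coal spill report...", "E", "negative").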

    result_data = apply_model_to_dataframe(df, tokenizer_esg, model_esg, tokenizer_sentiment, model_sentiment)

    # Count articles per (ESG category, sentiment) pair and pivot sentiments into columns.
    grouped_counts = result_data.groupby(['aggregated_esg', 'sentiment']).size().reset_index(name='count')
    data = grouped_counts.pivot(index='aggregated_esg', columns='sentiment', values='count')
    required_columns_sentiment = ['negative', 'positive', 'neutral']
    for col in required_columns_sentiment:
        if col not in data.columns:
            data[col] = 0

    data['negative'] = data['negative'].fillna(0)
    data['positive'] = data['positive'].fillna(0)
    data['neutral'] = data['neutral'].fillna(0)

    # Per-category score: share of negative coverage minus a small credit for positive coverage.
    data['count'] = (data['negative'] + data['positive'] + data['neutral'])
    data['total'] = data['negative'] / data['count'] + data['positive'] * (-0.2) / data['count']
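
    # For example, a category with 10 negative, 5 positive, and 5 neutral articles gets
    # total = 10/20 + 5 * (-0.2)/20 = 0.5 - 0.05 = 0.45.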

    # Articles with no ESG signal do not contribute to the risk score.
    if 'none' in data.index:
        data = data.drop('none')

    total = data['total'].sum()

    # Rescale the summed score from its assumed range [-1, 2] onto a 0-60 ESG risk scale.
    min_esg = -1
    max_esg = 2
    min_score = 0
    max_score = 60

    ESG_score = ((total - min_esg) / (max_esg - min_esg)) * (max_score - min_score) + min_score
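
    # With these bounds, total = 0.5 maps to ((0.5 + 1) / 3) * 60 = 30, i.e. the top of the 'Medium' band below.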

    def esg_risk_categorization(esg_score):
        """Map the 0-60 score onto the five ESG risk bands."""
        if esg_score <= 10:
            return 'Negligible'
        elif 10 < esg_score <= 20:
            return 'Low'
        elif 20 < esg_score <= 30:
            return 'Medium'
        elif 30 < esg_score <= 40:
            return 'High'
        else:
            return 'Severe'

    risk = esg_risk_categorization(ESG_score)

    st.write(company)
    log_detail = """
    Company: {}
    ESG Score Prediction: {}
    ESG Category Risk Prediction: {}
    """.format(company, ESG_score, risk)
    print(log_detail)

    st.write(f'ESG Category Risk Prediction: {risk}')