|
--- |
|
license: apache-2.0 |
|
tags: |
|
- ESG |
|
- finance |
|
language: |
|
- en |
|
pipeline_tag: text-classification |
|
--- |
|
## Main information |
|
We introduce a model for multi-label ESG risk classification. It follows a methodology of 47 classes with granular risk definitions. |
|
|
|
## Usage |
|
```python |
|
from collections import OrderedDict |
|
from transformers import MPNetPreTrainedModel, MPNetModel, AutoTokenizer |
|
import torch |
|
#Mean Pooling - Take attention mask into account for correct averaging |
|
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the sequence axis, ignoring masked tokens.

    Args:
        model_output: Tensor of token embeddings. It is used directly as the
            embeddings (the caller passes ``last_hidden_state``), expected as
            (batch, sequence, hidden).
        attention_mask: Tensor of shape (batch, sequence); positions with 0
            contribute nothing to the average.

    Returns:
        Tensor of shape (batch, hidden) with the mask-weighted mean embedding
        of each sequence.
    """
    token_embeddings = model_output
    # Broadcast the mask across the hidden dimension so it can weight each
    # embedding component.
    input_mask_expanded = (
        attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    )
    summed = torch.sum(token_embeddings * input_mask_expanded, 1)
    # Clamp the token count so a fully masked row divides by 1e-9, not by 0.
    token_counts = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return summed / token_counts
|
|
|
# ESGify is defined as a custom class because it needs a sentence-transformers-style mean pooling function and a custom classifier head |
|
class ESGify(MPNetPreTrainedModel):
    """Model for classification of ESG risks from text.

    An MPNet encoder whose token embeddings are mean-pooled (respecting the
    attention mask) and fed to a small MLP head that emits one sigmoid score
    per class.
    """

    def __init__(self, config):
        """Build the encoder and the classification head.

        Args:
            config: MPNet configuration object. Its ``id2label`` and
                ``label2id`` mappings are copied onto the model.
        """
        # NOTE(review): the original comment here read "tuning only the head",
        # but nothing in this class freezes the encoder -- confirm against the
        # training code.
        super().__init__(config)
        # Encoder without its built-in pooler; pooling is done in forward().
        self.mpnet = MPNetModel(config, add_pooling_layer=False)
        self.id2label = config.id2label
        self.label2id = config.label2id
        # The layer names below become state-dict keys, and the sizes must
        # match the saved weights, so neither should be changed.
        # 768 is the width of the pooled embedding fed to the head; 47 is the
        # number of output scores (one per class).
        self.classifier = torch.nn.Sequential(OrderedDict([
            ('norm', torch.nn.BatchNorm1d(768)),
            ('linear', torch.nn.Linear(768, 512)),
            ('act', torch.nn.ReLU()),
            ('batch_n', torch.nn.BatchNorm1d(512)),
            ('drop_class', torch.nn.Dropout(0.2)),
            ('class_l', torch.nn.Linear(512, 47)),
        ]))

    def forward(self, input_ids, attention_mask):
        """Return per-class scores in [0, 1] for a batch of tokenized texts.

        Args:
            input_ids: Token ids passed to the MPNet encoder.
            attention_mask: Mask passed to the encoder and reused to exclude
                masked positions from the mean pooling.

        Returns:
            Tensor with 47 sigmoid-activated scores per input example.
        """
        # Feed input to the MPNet encoder.
        outputs = self.mpnet(input_ids=input_ids,
                             attention_mask=attention_mask)

        # Mean-pool the token embeddings and feed them to the classifier to
        # compute logits.
        pooled = mean_pooling(outputs['last_hidden_state'], attention_mask)
        logits = self.classifier(pooled)

        # torch.sigmoid replaces the hand-written 1 / (1 + exp(-x)).
        return torch.sigmoid(logits)
|
|
|
# Load the fine-tuned weights and the matching tokenizer.
model = ESGify.from_pretrained('ai-lab/ESGify')
# Make inference mode explicit: BatchNorm uses running statistics and Dropout
# is disabled.
model.eval()
tokenizer = AutoTokenizer.from_pretrained('ai-lab/ESGify')
texts = ['text1', 'text2']
# Call the tokenizer directly; batch_encode_plus is deprecated in favour of
# __call__, which accepts the same arguments.
to_model = tokenizer(
    texts,
    add_special_tokens=True,
    max_length=512,
    return_token_type_ids=False,
    padding="max_length",
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)
# Inference only, so skip gradient tracking.
with torch.no_grad():
    results = model(**to_model)
|
|
|
|
|
# We also recommend preprocessing the texts with a FLAIR NER model |
|
|
|
from flair.data import Sentence |
|
from flair.nn import Classifier |
|
from torch.utils.data import DataLoader |
|
from nltk.corpus import stopwords |
|
from nltk.tokenize import word_tokenize |
|
|
|
# stopwords.words() and word_tokenize() need the corresponding NLTK data
# packages to be installed (see nltk.download).
stop_words = set(stopwords.words('english'))
tagger = Classifier.load('ner-ontonotes-large')
# Entity types that are replaced by a "<TYPE>" placeholder.
tag_list = ['FAC', 'LOC', 'ORG', 'PERSON']
texts_with_masks = []
for example_sent in texts:
    # Drop English stop words. The comparison is case-insensitive, but the
    # kept words retain their original casing.
    filtered_sentence = [
        w for w in word_tokenize(example_sent) if w.lower() not in stop_words
    ]
    sent = ' '.join(filtered_sentence)

    # Run NER over the filtered sentence.
    sentence = Sentence(sent)
    tagger.predict(sentence)

    # Rebuild the text, replacing each confident entity span with its tag.
    pieces = []
    start_t = 0  # index in `sent` of the first character not yet copied
    for label in sentence.get_labels():
        info = label.to_dict()
        val = info['value']
        if info['confidence'] > 0.8 and val in tag_list:
            span = label.data_point
            # Copy the plain text between the previous entity and this one.
            if span.start_position > start_t:
                pieces.append(sent[start_t:span.start_position])
            start_t = span.end_position
            pieces.append(f'<{val}>')
    # Copy the rest of the text after the last entity. The original slice
    # `sent[start_t:-1]` always dropped the final character.
    pieces.append(sent[start_t:])
    new_string = ''.join(pieces)
    texts_with_masks.append(new_string)
|
|
|
# Tokenize the entity-masked texts. The tokenizer is called directly because
# batch_encode_plus is deprecated in favour of __call__.
to_model = tokenizer(
    texts_with_masks,
    add_special_tokens=True,
    max_length=512,
    return_token_type_ids=False,
    padding="max_length",
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)
# Inference only, so skip gradient tracking.
with torch.no_grad():
    results = model(**to_model)
|
``` |
|
|
|
------ |
|
|
|
## Background |
|
|
|
The project aims to develop an ESG risk classification model based on a custom methodology for defining ESG risks. |
|
|
|
|
|
## Training procedure |
|
|
|
### Pre-training |
|
|
|
We use the pretrained [`microsoft/mpnet-base`](https://huggingface.co/microsoft/mpnet-base) model. |
|
Next, we perform domain adaptation through masked language modeling (MLM) pretraining on texts from ESG reports. |
|
|
|
|
|
#### Training data |
|
|
|
We use an ESG news dataset of 2,000 texts manually annotated by ESG specialists. |