File size: 5,156 Bytes
29bfbfa 917903a 5dad04b 1129006 29bfbfa 917903a 9fb80bf 84367c2 917903a 3448016 917903a 9fb80bf 917903a 9fb80bf 917903a 5dad04b 917903a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 |
---
license: apache-2.0
tags:
- ESG
- finance
language:
- en
pipeline_tag: text-classification
---
## Main information
We introduce a model for multilabel ESG risk classification. The underlying methodology defines 47 classes with granular risk definitions.
## Usage
```python
from collections import OrderedDict
from transformers import MPNetPreTrainedModel, MPNetModel, AutoTokenizer
import torch
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average token embeddings along dim 1, ignoring masked-out positions.

    Args:
        model_output: Tensor of token embeddings. Despite the name, this is
            used directly as the embeddings (the caller in this file passes
            the encoder's ``last_hidden_state``). Assumed to be
            (batch, seq_len, hidden) -- TODO confirm against other callers.
        attention_mask: Mask tensor that is unsqueezed on its last axis and
            expanded to the size of ``model_output``; positions with value 0
            do not contribute to the average.

    Returns:
        Tensor with dim 1 reduced: the mask-weighted sum of the embeddings
        divided by the mask sum (clamped to at least 1e-9).
    """
    token_embeddings = model_output
    input_mask_expanded = (
        attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    )
    summed = torch.sum(token_embeddings * input_mask_expanded, 1)
    # Clamp so a fully masked row divides by 1e-9 instead of 0.
    counts = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return summed / counts
# ESGify is defined as a custom class because it uses a sentence-transformers-style mean pooling function and a custom classifier head
class ESGify(MPNetPreTrainedModel):
    """MPNet encoder with mean pooling and a classifier head for ESG risks.

    ``forward`` returns per-class sigmoid scores (multilabel), not raw logits.
    """

    def __init__(self, config):
        """Build the encoder and the classifier head.

        Args:
            config: Model configuration. ``id2label``, ``label2id`` and
                ``hidden_size`` are read from it.
        """
        super().__init__(config)
        # Encoder without its built-in pooler: pooling is done by
        # mean_pooling() in forward().
        self.mpnet = MPNetModel(config, add_pooling_layer=False)
        self.id2label = config.id2label
        self.label2id = config.label2id
        # Width of the encoder output; the original example hard-coded 768.
        hidden_size = config.hidden_size
        # Layer names are kept unchanged: they are the state-dict keys.
        self.classifier = torch.nn.Sequential(OrderedDict([
            ('norm', torch.nn.BatchNorm1d(hidden_size)),
            ('linear', torch.nn.Linear(hidden_size, 512)),
            ('act', torch.nn.ReLU()),
            ('batch_n', torch.nn.BatchNorm1d(512)),
            ('drop_class', torch.nn.Dropout(0.2)),
            # 47 output units, one per class of the methodology.
            ('class_l', torch.nn.Linear(512, 47)),
        ]))

    def forward(self, input_ids, attention_mask):
        """Return sigmoid scores for each class.

        Args:
            input_ids: Token ids passed to the MPNet encoder.
            attention_mask: Attention mask passed to the encoder and reused
                to weight the mean pooling.

        Returns:
            Tensor of classifier outputs squashed element-wise by a sigmoid.
        """
        # Feed input to the MPNet encoder.
        outputs = self.mpnet(input_ids=input_ids,
                             attention_mask=attention_mask)
        # Mean-pool the token embeddings and feed them to the classifier.
        pooled = mean_pooling(outputs['last_hidden_state'], attention_mask)
        logits = self.classifier(pooled)
        # torch.sigmoid replaces the hand-written 1 / (1 + exp(-x)), which
        # overflows in exp() for large negative inputs.
        return torch.sigmoid(logits)
# Load the fine-tuned model and its tokenizer.
model = ESGify.from_pretrained('ai-lab/ESGify')
tokenizer = AutoTokenizer.from_pretrained('ai-lab/ESGify')
texts = ['text1', 'text2']
# Call the tokenizer directly; batch_encode_plus is deprecated in favour of
# __call__, which accepts the same arguments.
to_model = tokenizer(
    texts,
    add_special_tokens=True,
    max_length=512,
    return_token_type_ids=False,
    padding="max_length",
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)
# Inference only: make sure BatchNorm/Dropout are in eval mode and skip
# autograd bookkeeping.
model.eval()
with torch.no_grad():
    results = model(**to_model)
# We also recommend preprocessing the texts with a FLAIR NER model
from flair.data import Sentence
from flair.nn import Classifier
from torch.utils.data import DataLoader
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Replace high-confidence named entities with <TAG> placeholders and drop
# stop words before classification.
stop_words = set(stopwords.words('english'))
tagger = Classifier.load('ner-ontonotes-large')
tag_list = ['FAC', 'LOC', 'ORG', 'PERSON']
texts_with_masks = []
for example_sent in texts:
    word_tokens = word_tokenize(example_sent)
    # Keep only tokens whose lower-cased form is not a stop word.
    filtered_sentence = [w for w in word_tokens if w.lower() not in stop_words]
    sent = ' '.join(filtered_sentence)
    # Run NER over the filtered sentence.
    sentence = Sentence(sent)
    tagger.predict(sentence)
    parts = []
    start_t = 0  # offset in `sent` up to which text has been emitted
    # NOTE(review): assumes labels come back in text order -- confirm.
    for label in sentence.get_labels():
        info = label.to_dict()
        val = info['value']
        if info['confidence'] > 0.8 and val in tag_list:
            span = label.data_point
            # Copy the plain text between the previous entity and this one.
            if span.start_position > start_t:
                parts.append(sent[start_t:span.start_position])
            start_t = span.end_position
            parts.append(f'<{val}>')
    # Append the remaining tail. The original sliced with [start_t:-1],
    # which silently dropped the last character of every text.
    parts.append(sent[start_t:])
    texts_with_masks.append(''.join(parts))
# Call the tokenizer directly; batch_encode_plus is deprecated in favour of
# __call__, which accepts the same arguments.
to_model = tokenizer(
    texts_with_masks,
    add_special_tokens=True,
    max_length=512,
    return_token_type_ids=False,
    padding="max_length",
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)
# Inference only: eval mode and no autograd bookkeeping.
model.eval()
with torch.no_grad():
    results = model(**to_model)
```
------
## Background
The project aims to develop an ESG risk classification model based on a custom methodology for defining ESG risks.
## Training procedure
### Pre-training
We use the pretrained [`microsoft/mpnet-base`](https://huggingface.co/microsoft/mpnet-base) model.
Next, we perform domain adaptation via masked language modeling (MLM) pretraining on texts from ESG reports.
#### Training data
We use an ESG news dataset of 2,000 texts manually annotated by ESG specialists. |