File size: 1,885 Bytes
b396e94
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import torch
import scipy.special
import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the sentiment-classification model and its matching tokenizer.
# NOTE(review): the `finbert_*` names are kept for compatibility, but judging
# by its name the checkpoint below is a Twitter RoBERTa sentiment model rather
# than a FinBERT one -- confirm which model is actually intended.
finbert_ckpt = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(finbert_ckpt)
# Move the model to the GPU when CUDA is available, otherwise keep it on the CPU.
model_finbert = AutoModelForSequenceClassification.from_pretrained(finbert_ckpt).to("cuda" if torch.cuda.is_available() else "cpu")

def analyze_sentiment(text_list):
    """Classify the sentiment of each text with the module-level model.

    Every text is tokenized with the module-level ``tokenizer`` (truncated to
    512 tokens) and scored by ``model_finbert``; the logits are turned into
    probabilities with a softmax and the most probable class is reported.

    Note: despite the ``finbert`` naming, the model used is whatever checkpoint
    ``finbert_ckpt`` points at.

    Args:
        text_list: An iterable of strings. A single bare string is treated as
            a one-element list, and one-shot iterators (e.g. generators) are
            materialized so they can be both scored and reported.

    Returns:
        A ``pandas.DataFrame`` with one row per input text and the columns
        ``"Text"``, ``"Predicted Sentiment"`` (``"positive"``, ``"negative"``
        or ``"neutral"``) and ``"Probability"`` (softmax probability of the
        predicted class).
    """
    if isinstance(text_list, str):
        # A bare string would otherwise be iterated character by character.
        text_list = [text_list]
    elif iter(text_list) is text_list:
        # One-shot iterators would be exhausted by the loop below and could
        # not be reused for the "Text" column; re-iterable inputs (lists,
        # Series, ...) are left untouched so their index/identity is preserved.
        text_list = list(text_list)

    # Generic class names exposed by the checkpoint config -> readable names.
    label_names = {
        "LABEL_0": "negative",
        "LABEL_1": "neutral",
        "LABEL_2": "positive",
    }
    readable_labels = set(label_names.values())

    tokenizer_kwargs = {"padding": True, "truncation": True, "max_length": 512}
    id2label = model_finbert.config.id2label
    device = model_finbert.device

    preds = []
    preds_proba = []

    # Inference only: disable gradient tracking once for the whole loop.
    with torch.no_grad():
        for text in text_list:
            encoded = tokenizer(text, return_tensors="pt", **tokenizer_kwargs).to(device)
            # Batch of one -> take the single row of class logits.
            logits = model_finbert(**encoded).logits[0].cpu().numpy()
            probabilities = scipy.special.softmax(logits)

            # Look the label up by class index instead of relying on the
            # iteration order of ``id2label.values()``.
            best_index = int(probabilities.argmax())
            raw_label = id2label[best_index]

            sentiment = label_names.get(raw_label)
            if sentiment is None:
                # Configs that already carry readable labels are passed
                # through; anything unrecognized falls back to "neutral",
                # matching the previous behavior.
                lowered = str(raw_label).lower()
                sentiment = lowered if lowered in readable_labels else "neutral"

            preds.append(sentiment)
            preds_proba.append(probabilities[best_index])

    # Return a DataFrame with results
    return pd.DataFrame({
        "Text": text_list,
        "Predicted Sentiment": preds,
        "Probability": preds_proba,
    })