File size: 2,771 Bytes
2293f58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.stats import zscore

class SentimentAnalyzer:
    """Score news items with a pretrained sequence-classification model.

    Two checkpoints and their tokenizers are loaded when the object is
    constructed; ``analyze`` uses only the ``'financial_sentiment'`` pair.
    """

    # Hugging Face checkpoint ids, keyed by the names used in
    # ``self.models`` and ``self.tokenizers``.
    _CHECKPOINTS = {
        'finbert': "yiyanghkust/finbert-tone",
        'financial_sentiment': "ahmedrachid/FinancialBERT-Sentiment-Analysis",
    }
    # Key of the model/tokenizer pair that ``analyze`` runs.
    _ACTIVE_MODEL = 'financial_sentiment'
    # Result returned when there is nothing to score.
    _NO_SIGNAL = {'negative': 0.33, 'neutral': 0.33, 'positive': 0.33}
    # An item is dropped when any class probability is at least this many
    # standard deviations away from the mean over all scored items.
    _OUTLIER_Z = 2.0

    def __init__(self):
        """Load every checkpoint in ``_CHECKPOINTS`` and its tokenizer."""
        self.models = {
            name: AutoModelForSequenceClassification.from_pretrained(checkpoint)
            for name, checkpoint in self._CHECKPOINTS.items()
        }
        self.tokenizers = {
            name: AutoTokenizer.from_pretrained(checkpoint)
            for name, checkpoint in self._CHECKPOINTS.items()
        }
        self.max_length = 512  # Model's sequence-length limit

    def chunk_text(self, text, tokenizer):
        """Split *text* into lists of token ids that fit the model window.

        The ids contain no special tokens, and each chunk leaves room for
        the special tokens the tokenizer adds when the chunk is encoded
        again, so re-encoding a chunk does not truncate any content.

        Args:
            text: The string to tokenize.
            tokenizer: Tokenizer providing ``encode`` and
                ``num_special_tokens_to_add``.

        Returns:
            A list of token-id lists; empty when *text* yields no tokens.
        """
        token_ids = tokenizer.encode(
            text, add_special_tokens=False, truncation=False
        )
        # Reserve space for the special tokens added at re-encoding time.
        budget = max(1, self.max_length - tokenizer.num_special_tokens_to_add())
        return [
            token_ids[start:start + budget]
            for start in range(0, len(token_ids), budget)
        ]

    def preprocess_text(self, item):
        """Join an item's ``'title'`` and ``'content'`` into one string.

        Missing or ``None`` fields count as empty, so they never show up
        as the literal text ``"None"``.

        Args:
            item: Mapping that may contain ``'title'`` and ``'content'``.

        Returns:
            The stripped combined text, or ``None`` when it is empty.
        """
        parts = []
        for key in ('title', 'content'):
            value = item.get(key)
            parts.append('' if value is None else str(value).strip())
        text = ' '.join(parts).strip()
        return text if text else None

    def _score_text(self, text, tokenizer, model):
        """Return the mean class probabilities over the chunks of *text*.

        Returns ``None`` when *text* produces no chunks.
        """
        chunk_scores = []
        for chunk in self.chunk_text(text, tokenizer):
            chunk_str = tokenizer.decode(chunk, skip_special_tokens=True)
            inputs = tokenizer(
                chunk_str,
                return_tensors="pt",
                truncation=True,
                max_length=self.max_length,
            )
            # Inference only: skip building the autograd graph.
            with torch.no_grad():
                outputs = model(**inputs)
            probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
            chunk_scores.append(probabilities.detach().cpu().numpy()[0])

        if not chunk_scores:
            return None
        return np.mean(chunk_scores, axis=0)

    def _drop_outliers(self, scores):
        """Return *scores* as a 2-D array without outlier rows.

        Z-scores are computed per class across items. Constant columns
        give NaN z-scores, which are treated as 0 (not an outlier). If
        every row would be dropped, all rows are kept.
        """
        matrix = np.asarray(scores, dtype=float)
        if matrix.shape[0] < 2:
            return matrix

        with np.errstate(divide='ignore', invalid='ignore'):
            z = zscore(matrix, axis=0)
        z = np.nan_to_num(z, nan=0.0, posinf=0.0, neginf=0.0)

        keep = np.abs(z).max(axis=1) < self._OUTLIER_Z
        return matrix[keep] if keep.any() else matrix

    def analyze(self, news):
        """Average the sentiment of a collection of news items.

        Entries that are not dicts, or whose combined text is empty, are
        skipped. Each remaining item is scored chunk by chunk, outlier
        items are removed, and the rest are averaged.

        Args:
            news: Iterable of dicts with optional ``'title'`` and
                ``'content'`` entries.

        Returns:
            A dict with float values under ``'negative'``, ``'neutral'``
            and ``'positive'``. When nothing can be scored, every value
            is 0.33.
        """
        if not news:
            return dict(self._NO_SIGNAL)

        tokenizer = self.tokenizers[self._ACTIVE_MODEL]
        model = self.models[self._ACTIVE_MODEL]

        sentiment_scores = []
        for item in news:
            if not isinstance(item, dict):
                continue

            text = self.preprocess_text(item)
            if not text:
                continue

            item_score = self._score_text(text, tokenizer, model)
            if item_score is not None:
                sentiment_scores.append(item_score)

        if not sentiment_scores:
            return dict(self._NO_SIGNAL)

        # Outlier filter
        avg_sentiment = self._drop_outliers(sentiment_scores).mean(axis=0)

        # NOTE(review): assumes the active model orders its logits as
        # negative, neutral, positive - confirm against config.id2label.
        return {
            'negative': float(avg_sentiment[0]),
            'neutral': float(avg_sentiment[1]),
            'positive': float(avg_sentiment[2]),
        }