increasing performance in AI mode by implementing singletons
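The core of this commit is a ModelRegistry singleton (added in mediaunmasked/analyzers/scoring.py below): the heavyweight Hugging Face pipelines are created once and the shared instances are handed to every analyzer, instead of each analyzer constructing its own pipelines. A minimal sketch of the singleton pattern used there, with a stand-in payload instead of real model loads (the class name SharedResource is illustrative only):

# Minimal sketch of the __new__/_initialized singleton pattern the commit applies to ModelRegistry.
# The "payload" stands in for the expensive transformers pipeline loads in the real class.
class SharedResource:
    _instance = None       # the single shared instance
    _initialized = False   # guard so the heavy setup in __init__ runs only once

    def __new__(cls):
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self):
        if not self._initialized:
            self.payload = object()      # imagine an expensive model load here
            self._initialized = True     # persists because __new__ always returns the same instance

a = SharedResource()
b = SharedResource()
assert a is b  # both callers see the same object, so the payload is built exactly once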
mediaunmasked/analyzers/bias_analyzer.py
CHANGED
@@ -1,21 +1,25 @@
 import logging
 import os
-from typing import Dict, Any, List
+from typing import Dict, Any, List, Optional
 from transformers import pipeline
 import numpy as np
+import nltk
+from nltk.tokenize import sent_tokenize
 
 logger = logging.getLogger(__name__)
 
 class BiasAnalyzer:
-    def __init__(self, use_ai: bool = True):
+    def __init__(self, use_ai: bool = True, model_registry: Optional[Any] = None):
         """
         Initialize bias analyzer with both LLM and traditional approaches.
         
         Args:
             use_ai: Boolean indicating whether to use AI-powered analysis (True) or traditional analysis (False)
+            model_registry: Optional shared model registry for better performance
         """
         self.use_ai = use_ai
         self.llm_available = False
+        self.model_registry = model_registry
         
         # Load traditional keywords
         self.resources_dir = os.path.join(os.path.dirname(__file__), '..', 'resources')
@@ -24,14 +28,20 @@ class BiasAnalyzer:
         
         if use_ai:
             try:
+                if model_registry and model_registry.is_available:
+                    self.classifier = model_registry.zero_shot
+                    self.llm_available = True
+                    logger.info("Using shared model pipeline for bias analysis")
+                else:
+                    # Initialize own pipeline if no shared registry
+                    self.classifier = pipeline(
+                        "zero-shot-classification",
+                        model="facebook/bart-large-mnli",
+                        device=-1,
+                        batch_size=8
+                    )
+                    self.llm_available = True
+                    logger.info("Initialized dedicated model pipeline for bias analysis")
             except Exception as e:
                 logger.warning(f"Failed to initialize LLM pipeline: {str(e)}")
                 self.llm_available = False
@@ -128,25 +138,30 @@
         }
     
     def _analyze_with_llm(self, text: str) -> Dict[str, Any]:
-        """Analyze bias using LLM zero-shot classification."""
+        """Analyze bias using LLM zero-shot classification with batch processing."""
         try:
             # Define bias categories
             bias_categories = [
                 "left-wing bias",
                 "right-wing bias",
                 "neutral/balanced perspective"
             ]
             
+            # Clean and prepare text
+            cleaned_text = text.replace('$!/$', '').replace('##', '').replace('#', '')
+            cleaned_text = '\n'.join(line for line in cleaned_text.split('\n')
+                                     if not line.startswith('[') and not line.startswith('More on'))
+            
+            # Split into larger chunks (4000 chars) for fewer API calls
+            chunks = [cleaned_text[i:i+4000] for i in range(0, len(cleaned_text), 4000)]
             
+            # Process chunks in batches
             chunk_scores = []
             flagged_phrases = []
             
             for chunk in chunks:
+                # Analyze chunk as a whole first
+                chunk_result = self.classifier(
                     chunk,
                     bias_categories,
                     multi_label=True
@@ -154,21 +169,43 @@
                 
                 chunk_scores.append({
                     label: score
+                    for label, score in zip(chunk_result['labels'], chunk_result['scores'])
                 })
                 
+                # Only analyze individual sentences if chunk shows strong bias
+                max_chunk_score = max(chunk_result['scores'])
+                if max_chunk_score > 0.6:
+                    sentences = sent_tokenize(chunk)
+                    # Filter sentences for analysis (longer, potentially more meaningful ones)
+                    relevant_sentences = [s.strip() for s in sentences if len(s.strip()) > 20]
+                    
+                    # Process sentences in batches of 8
+                    for i in range(0, len(relevant_sentences), 8):
+                        batch = relevant_sentences[i:i+8]
+                        try:
+                            batch_results = self.classifier(
+                                batch,
+                                bias_categories,
+                                multi_label=False
+                            )
+                            
+                            # Handle single or multiple results
+                            if not isinstance(batch_results, list):
+                                batch_results = [batch_results]
+                            
+                            for sentence, result in zip(batch, batch_results):
+                                max_score = max(result['scores'])
+                                if max_score > 0.8 and result['labels'][0] != "neutral/balanced perspective":
+                                    flagged_phrases.append({
+                                        "text": sentence,
+                                        "type": result['labels'][0],
+                                        "score": max_score,
+                                        "highlight": f"[{result['labels'][0].upper()}] (Score: {round(max_score * 100, 1)}%) \"{sentence}\""
+                                    })
+                            
+                        except Exception as batch_error:
+                            logger.warning(f"Batch processing error: {str(batch_error)}")
+                            continue
             
             # Aggregate scores across chunks
             aggregated_scores = {
@@ -184,7 +221,7 @@
             right_score = aggregated_scores["right-wing bias"]
             neutral_score = aggregated_scores["neutral/balanced perspective"]
             
+            # Calculate bias score (-1 to 1)
             bias_score = (right_score - left_score) / max(right_score + left_score, 0.0001)
             
             # Determine bias label
@@ -206,11 +243,23 @@
             # Calculate bias percentage (0-100)
             bias_percentage = min(100, abs(bias_score * 100))
             
+            # Sort and limit flagged phrases
+            sorted_phrases = sorted(flagged_phrases, key=lambda x: x['score'], reverse=True)
+            unique_phrases = []
+            seen = set()
+            
+            for phrase in sorted_phrases:
+                if phrase['text'] not in seen:
+                    unique_phrases.append(phrase)
+                    seen.add(phrase['text'])
+                if len(unique_phrases) >= 5:
+                    break
+            
             return {
                 "bias": bias,
                 "bias_score": round(bias_score, 2),
                 "bias_percentage": round(bias_percentage, 1),
+                "flagged_phrases": unique_phrases,
                 "detailed_scores": {
                     "left_bias": round(left_score * 100, 1),
                     "right_bias": round(right_score * 100, 1),
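For reference, the new _analyze_with_llm maps the aggregated zero-shot scores onto a -1 (left-leaning) to +1 (right-leaning) scale before converting to a percentage. A short worked example with assumed aggregated scores:

# Worked example of the bias-score formula from _analyze_with_llm (input scores are assumed).
left_score, right_score = 0.2, 0.6
bias_score = (right_score - left_score) / max(right_score + left_score, 0.0001)
bias_percentage = min(100, abs(bias_score * 100))
print(round(bias_score, 2), round(bias_percentage, 1))  # 0.5 50.0 -> leans right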
mediaunmasked/analyzers/evidence_analyzer.py
CHANGED
@@ -1,5 +1,5 @@
 import logging
-from typing import Dict, Any, List
+from typing import Dict, Any, List, Optional
 from transformers import pipeline
 import numpy as np
 import nltk
@@ -8,26 +8,35 @@ from nltk.tokenize import sent_tokenize
 logger = logging.getLogger(__name__)
 
 class EvidenceAnalyzer:
-    def __init__(self, use_ai: bool = True):
+    def __init__(self, use_ai: bool = True, model_registry: Optional[Any] = None):
         """
         Initialize evidence analyzer with LLM and traditional approaches.
         
         Args:
             use_ai: Boolean indicating whether to use AI-powered analysis (True) or traditional analysis (False)
+            model_registry: Optional shared model registry for better performance
         """
         self.use_ai = use_ai
         self.llm_available = False
+        self.model_registry = model_registry
         
         if use_ai:
             try:
+                if model_registry and model_registry.is_available:
+                    # Use shared models
+                    self.classifier = model_registry.zero_shot
+                    self.llm_available = True
+                    logger.info("Using shared model pipeline for evidence analysis")
+                else:
+                    # Initialize own pipeline
+                    self.classifier = pipeline(
+                        "zero-shot-classification",
+                        model="facebook/bart-large-mnli",
+                        device=-1,
+                        batch_size=8
+                    )
+                    self.llm_available = True
+                    logger.info("Initialized dedicated model pipeline for evidence analysis")
             except Exception as e:
                 logger.warning(f"Failed to initialize LLM pipeline: {str(e)}")
                 self.llm_available = False
mediaunmasked/analyzers/headline_analyzer.py
CHANGED
@@ -1,5 +1,5 @@
 import logging
-from typing import Dict, Any, List
+from typing import Dict, Any, List, Optional
 from transformers import pipeline, AutoTokenizer
 import numpy as np
 import nltk
@@ -8,32 +8,46 @@ from nltk.tokenize import sent_tokenize
 logger = logging.getLogger(__name__)
 
 class HeadlineAnalyzer:
-    def __init__(self, use_ai: bool = True):
+    def __init__(self, use_ai: bool = True, model_registry: Optional[Any] = None):
         """
         Initialize the analyzers for headline analysis.
         
         Args:
             use_ai: Boolean indicating whether to use AI-powered analysis (True) or traditional analysis (False)
+            model_registry: Optional shared model registry for better performance
         """
         self.use_ai = use_ai
         self.llm_available = False
+        self.model_registry = model_registry
         
         if use_ai:
             try:
-                # Zero-shot classifier for clickbait and sensationalism
-                self.zero_shot = pipeline(
-                    "zero-shot-classification",
-                    model="facebook/bart-large-mnli",
-                    device=-1
-                )
-                
-                self.tokenizer = AutoTokenizer.from_pretrained("roberta-large-mnli")
+                if model_registry and model_registry.is_available:
+                    # Use shared models
+                    self.nli_pipeline = model_registry.nli
+                    self.zero_shot = model_registry.zero_shot
+                    self.tokenizer = AutoTokenizer.from_pretrained("roberta-large-mnli")
+                    self.llm_available = True
+                    logger.info("Using shared model pipelines for headline analysis")
+                else:
+                    # Initialize own pipelines
+                    self.nli_pipeline = pipeline(
+                        "text-classification",
+                        model="roberta-large-mnli",
+                        batch_size=16
+                    )
+                    self.zero_shot = pipeline(
+                        "zero-shot-classification",
+                        model="facebook/bart-large-mnli",
+                        device=-1,
+                        batch_size=8
+                    )
+                    self.tokenizer = AutoTokenizer.from_pretrained("roberta-large-mnli")
+                    self.llm_available = True
+                    logger.info("Initialized dedicated model pipelines for headline analysis")
                 
                 self.max_length = 512
-                
-                logger.info("LLM pipelines initialized successfully for headline analysis")
+                
             except Exception as e:
                 logger.warning(f"Failed to initialize LLM pipelines: {str(e)}")
                 self.llm_available = False
@@ -51,24 +65,17 @@ class HeadlineAnalyzer:
         sep_tokens = len(self.tokenizer.encode("[SEP]")) - 2
         max_content_tokens = self.max_length - headline_tokens - sep_tokens
         
-        # Process words into sections
+        # Process words into sections with 4000 character chunks
+        current_text = ""
         for word in content_words:
-                current_section.pop()
-                sections.append(" ".join(current_section))
-                
-                # Start new section with 20% overlap for context
-                overlap_start = max(0, len(current_section) - int(len(current_section) * 0.2))
-                current_section = current_section[overlap_start:]
-                current_section.append(word)
+            if len(current_text) + len(word) + 1 <= 4000:
+                current_text += " " + word
+            else:
+                sections.append(current_text.strip())
+                current_text = word
         
-        sections.append(" ".join(current_section))
+        if current_text:
+            sections.append(current_text.strip())
        
         return sections
        
@@ -82,10 +89,16 @@ class HeadlineAnalyzer:
                 nltk.download('punkt')
             
             sentences = sent_tokenize(section)
+            if not sentences:
+                logger.warning("No sentences found in section")
+                return {
+                    "accuracy_score": 50.0,  # Neutral score
+                    "flagged_phrases": [],
+                    "detailed_scores": {
+                        "nli": {"ENTAILMENT": 0.0, "CONTRADICTION": 0.0, "NEUTRAL": 1.0},
+                        "sensationalism": {"factual reporting": 0.5, "accurate headline": 0.5}
+                    }
+                }
             
             # Categories for sensationalism check
             sensationalism_categories = [
@@ -108,44 +121,96 @@ class HeadlineAnalyzer:
                 for label, score in zip(sensationalism_result['labels'], sensationalism_result['scores'])
             }
             
-                if len(sentence.strip()) > 10:
-                    # Check for contradiction/entailment
-                    input_text = f"{headline} [SEP] {sentence}"
-                    nli_result = self.nli_pipeline(input_text, top_k=None)
-                    scores = {item['label']: item['score'] for item in nli_result}
-                    nli_scores.append(scores)
-                    
-                    # Flag contradictory or highly sensationalized content
-                    if scores.get('CONTRADICTION', 0) > 0.4:
-                        flagged_phrases.append({
-                            'text': sentence.strip(),
-                            'type': 'contradiction',
-                            'score': scores['CONTRADICTION']
-                        })
+            # Filter relevant sentences (longer than 20 chars)
+            relevant_sentences = [s.strip() for s in sentences if len(s.strip()) > 20]
+            
+            if not relevant_sentences:
+                logger.warning("No relevant sentences found in section")
+                return {
+                    "accuracy_score": 50.0,  # Neutral score
+                    "flagged_phrases": [],
+                    "detailed_scores": {
+                        "nli": {"ENTAILMENT": 0.0, "CONTRADICTION": 0.0, "NEUTRAL": 1.0},
+                        "sensationalism": sensationalism_scores
+                    }
+                }
+            
+            # Process sentences in batches for contradiction/support
+            nli_scores = []
+            flagged_phrases = []
+            batch_size = 8
+            
+            for i in range(0, len(relevant_sentences), batch_size):
+                batch = relevant_sentences[i:i+batch_size]
+                batch_inputs = [f"{headline} [SEP] {sentence}" for sentence in batch]
+                
+                try:
+                    # Get NLI scores for batch
+                    batch_results = self.nli_pipeline(batch_inputs, top_k=None)
+                    if not isinstance(batch_results, list):
+                        batch_results = [batch_results]
+                    
+                    for sentence, result in zip(batch, batch_results):
+                        scores = {item['label']: item['score'] for item in result}
+                        nli_scores.append(scores)
+                        
+                        # Flag contradictory content
+                        if scores.get('CONTRADICTION', 0) > 0.4:
+                            flagged_phrases.append({
+                                'text': sentence,
+                                'type': 'Contradiction',
+                                'score': scores['CONTRADICTION'],
+                                'highlight': f"[CONTRADICTION] (Score: {round(scores['CONTRADICTION'] * 100, 1)}%) \"{sentence}\""
+                            })
+                    
+                except Exception as batch_error:
+                    logger.warning(f"Batch processing error: {str(batch_error)}")
+                    continue
+            
+            # Calculate aggregate scores with validation
+            if not nli_scores:
+                logger.warning("No NLI scores available")
+                avg_scores = {"ENTAILMENT": 0.0, "CONTRADICTION": 0.0, "NEUTRAL": 1.0}
+            else:
+                try:
+                    avg_scores = {
+                        label: float(np.mean([
+                            score.get(label, 0.0)
+                            for score in nli_scores
+                        ]))
+                        for label in ['ENTAILMENT', 'CONTRADICTION', 'NEUTRAL']
+                    }
+                except Exception as agg_error:
+                    logger.error(f"Error aggregating NLI scores: {str(agg_error)}")
+                    avg_scores = {"ENTAILMENT": 0.0, "CONTRADICTION": 0.0, "NEUTRAL": 1.0}
            
+            # Calculate headline accuracy score with validation
+            try:
+                accuracy_components = {
+                    'entailment': avg_scores.get('ENTAILMENT', 0.0) * 0.4,
+                    'non_contradiction': (1 - avg_scores.get('CONTRADICTION', 0.0)) * 0.3,
+                    'non_sensational': (
+                        sensationalism_scores.get('factual reporting', 0.0) +
+                        sensationalism_scores.get('accurate headline', 0.0)
+                    ) * 0.15,
+                    'non_clickbait': (
+                        1 - sensationalism_scores.get('clickbait', 0.0) -
+                        sensationalism_scores.get('sensationalized', 0.0)
+                    ) * 0.15
+                }
+                
+                accuracy_score = sum(accuracy_components.values()) * 100
+                
+                # Validate final score
+                if np.isnan(accuracy_score) or not np.isfinite(accuracy_score):
+                    logger.warning("Invalid accuracy score calculated, using default")
+                    accuracy_score = 50.0
+                else:
+                    accuracy_score = float(accuracy_score)
+                
+            except Exception as score_error:
+                logger.error(f"Error calculating accuracy score: {str(score_error)}")
+                accuracy_score = 50.0
            
             # Sort and limit flagged phrases
             sorted_phrases = sorted(
@@ -153,11 +218,19 @@ class HeadlineAnalyzer:
                 key=lambda x: x['score'],
                 reverse=True
             )
+            unique_phrases = []
+            seen = set()
+            
+            for phrase in sorted_phrases:
+                if phrase['text'] not in seen:
+                    unique_phrases.append(phrase)
+                    seen.add(phrase['text'])
+                if len(unique_phrases) >= 5:
+                    break
            
             return {
                 "accuracy_score": accuracy_score,
+                "flagged_phrases": unique_phrases,
                 "detailed_scores": {
                     "nli": avg_scores,
                     "sensationalism": sensationalism_scores
@@ -167,9 +240,12 @@ class HeadlineAnalyzer:
         except Exception as e:
             logger.error(f"Section analysis failed: {str(e)}")
             return {
-                "accuracy_score": 0,
+                "accuracy_score": 50.0,  # Neutral score for errors
                 "flagged_phrases": [],
+                "detailed_scores": {
+                    "nli": {"ENTAILMENT": 0.0, "CONTRADICTION": 0.0, "NEUTRAL": 1.0},
+                    "sensationalism": {}
+                }
             }
    
     def _analyze_traditional(self, headline: str, content: str) -> Dict[str, Any]:
@@ -266,13 +342,23 @@ class HeadlineAnalyzer:
             accuracy_scores = [r['accuracy_score'] for r in section_results]
             final_score = np.mean(accuracy_scores)
             
-            # Combine flagged phrases
+            # Combine and deduplicate flagged phrases
             all_phrases = []
             for result in section_results:
+                if 'flagged_phrases' in result:
+                    all_phrases.extend(result['flagged_phrases'])
+            
+            # Sort by score and get unique phrases
+            sorted_phrases = sorted(all_phrases, key=lambda x: x['score'], reverse=True)
+            unique_phrases = []
+            seen = set()
            
+            for phrase in sorted_phrases:
+                if phrase['text'] not in seen:
+                    unique_phrases.append(phrase)
+                    seen.add(phrase['text'])
+                if len(unique_phrases) >= 5:
+                    break
            
             return {
                 "headline_vs_content_score": round(final_score, 1),
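The new _analyze_section combines entailment, non-contradiction, and the sensationalism labels into a single headline accuracy score with weights 0.4 / 0.3 / 0.15 / 0.15. A short worked example with assumed model outputs:

# Worked example of the headline accuracy weighting from _analyze_section (all inputs assumed).
avg_scores = {"ENTAILMENT": 0.70, "CONTRADICTION": 0.10, "NEUTRAL": 0.20}
sensationalism_scores = {"factual reporting": 0.60, "accurate headline": 0.30,
                         "clickbait": 0.05, "sensationalized": 0.05}

accuracy_components = {
    "entailment": avg_scores["ENTAILMENT"] * 0.4,                               # 0.28
    "non_contradiction": (1 - avg_scores["CONTRADICTION"]) * 0.3,               # 0.27
    "non_sensational": (sensationalism_scores["factual reporting"]
                        + sensationalism_scores["accurate headline"]) * 0.15,   # 0.135
    "non_clickbait": (1 - sensationalism_scores["clickbait"]
                      - sensationalism_scores["sensationalized"]) * 0.15,       # 0.135
}
print(round(sum(accuracy_components.values()) * 100, 1))  # 82.0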
mediaunmasked/analyzers/scoring.py
CHANGED
@@ -1,5 +1,8 @@
 from typing import Dict, Any, Literal
 import logging
+from transformers import pipeline
+import torch
+import numpy as np
 
 from .headline_analyzer import HeadlineAnalyzer
 from .sentiment_analyzer import SentimentAnalyzer
@@ -11,6 +14,64 @@ logger = logging.getLogger(__name__)
 # Define analysis mode type
 AnalysisMode = Literal['ai', 'traditional']
 
+class ModelRegistry:
+    """Singleton class to manage shared model pipelines."""
+    _instance = None
+    _initialized = False
+    
+    def __new__(cls):
+        if cls._instance is None:
+            cls._instance = super(ModelRegistry, cls).__new__(cls)
+        return cls._instance
+    
+    def __init__(self):
+        if not self._initialized:
+            try:
+                # Use GPU if available
+                self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+                logger.info(f"Using device: {self.device}")
+                
+                # Initialize shared models with larger batch sizes
+                self.zero_shot = pipeline(
+                    "zero-shot-classification",
+                    model="facebook/bart-large-mnli",
+                    device=self.device,
+                    batch_size=8
+                )
+                
+                self.sentiment = pipeline(
+                    "text-classification",
+                    model="SamLowe/roberta-base-go_emotions",
+                    device=self.device,
+                    batch_size=16
+                )
+                
+                self.nli = pipeline(
+                    "text-classification",
+                    model="roberta-large-mnli",
+                    device=self.device,
+                    batch_size=16
+                )
+                
+                # Add toxicity pipeline
+                self.toxicity = pipeline(
+                    "text-classification",
+                    model="unitary/toxic-bert",
+                    device=self.device,
+                    batch_size=16
+                )
+                
+                logger.info("Successfully initialized shared model pipelines")
+                self._initialized = True
+                
+            except Exception as e:
+                logger.error(f"Failed to initialize shared models: {str(e)}")
+                self._initialized = False
+    
+    @property
+    def is_available(self):
+        return self._initialized
+
 class MediaScorer:
     def __init__(self, use_ai: bool = True):
         """
@@ -23,11 +84,31 @@ class MediaScorer:
         self.analysis_mode: AnalysisMode = 'ai' if use_ai else 'traditional'
         logger.info(f"Initializing MediaScorer with {self.analysis_mode} analysis")
         
+        # Initialize shared model registry if using AI
+        if use_ai:
+            self.model_registry = ModelRegistry()
+            if not self.model_registry.is_available:
+                logger.warning("Shared models not available, falling back to traditional analysis")
+                self.use_ai = False
+                self.analysis_mode = 'traditional'
+        
+        # Initialize analyzers with analysis mode preference and shared models
+        self.headline_analyzer = HeadlineAnalyzer(
+            use_ai=self.use_ai,
+            model_registry=self.model_registry if self.use_ai else None
+        )
+        self.sentiment_analyzer = SentimentAnalyzer(
+            use_ai=self.use_ai,
+            model_registry=self.model_registry if self.use_ai else None
+        )
+        self.bias_analyzer = BiasAnalyzer(
+            use_ai=self.use_ai,
+            model_registry=self.model_registry if self.use_ai else None
+        )
+        self.evidence_analyzer = EvidenceAnalyzer(
+            use_ai=self.use_ai,
+            model_registry=self.model_registry if self.use_ai else None
+        )
         
         logger.info(f"All analyzers initialized in {self.analysis_mode} mode")
         
@@ -53,32 +134,58 @@
         """)
         logger.info(f"Evidence Analysis: {evidence_analysis}")
         
-        # Calculate component scores
+        # Calculate component scores with NaN handling
         # For headline: 20% contradiction = 20% score (don't invert)
+        headline_score = headline_analysis.get("headline_vs_content_score", 0)
+        if isinstance(headline_score, (int, float)) and not np.isnan(headline_score):
+            headline_score = headline_score / 100
+        else:
+            headline_score = 0.5  # Default to neutral if score is invalid
+            logger.warning("Invalid headline score, using default value of 0.5")
         
         # For manipulation: 0% = good (use directly), 100% = bad
+        manipulation_score = sentiment_analysis.get("manipulation_score", 0)
+        if isinstance(manipulation_score, (int, float)) and not np.isnan(manipulation_score):
+            manipulation_score = (100 - manipulation_score) / 100
+        else:
+            manipulation_score = 0.5
+            logger.warning("Invalid manipulation score, using default value of 0.5")
         
         # For bias: 0% = good (use directly), 100% = bad
+        bias_percentage = bias_analysis.get("bias_percentage", 0)
+        if isinstance(bias_percentage, (int, float)) and not np.isnan(bias_percentage):
+            bias_score = (100 - bias_percentage) / 100
+        else:
+            bias_score = 0.5
+            logger.warning("Invalid bias score, using default value of 0.5")
         
+        # For evidence: higher is better
+        evidence_score = evidence_analysis.get("evidence_based_score", 0)
+        if isinstance(evidence_score, (int, float)) and not np.isnan(evidence_score):
+            evidence_score = evidence_score / 100
+        else:
+            evidence_score = 0.5
+            logger.warning("Invalid evidence score, using default value of 0.5")
         
         logger.info(f"""Component Scores:
+            Headline: {headline_score * 100:.1f}% (from {headline_analysis.get("headline_vs_content_score", 0)})
             Evidence: {evidence_score * 100:.1f}%
+            Manipulation: {manipulation_score * 100:.1f}% (100 - {sentiment_analysis.get("manipulation_score", 0)}%)
+            Bias: {bias_score * 100:.1f}% (100 - {bias_analysis.get("bias_percentage", 0)}%)
         """)
         
         # Calculate final score
-        final_score = (
+        final_score = float((
             (headline_score * 0.25) +
             (manipulation_score * 0.25) +
             (bias_score * 0.25) +
             (evidence_score * 0.25)
-        ) * 100
+        ) * 100)
+        
+        # Ensure final score is valid
+        if np.isnan(final_score) or not np.isfinite(final_score):
+            final_score = 50.0  # Default to neutral
+            logger.warning("Invalid final score calculated, using default value of 50.0")
         
         # Determine rating
         if final_score >= 80:
@@ -89,27 +196,27 @@
             rating = "Misleading"
         
         result = {
-            "media_unmasked_score": round(final_score, 1),
+            "media_unmasked_score": round(float(final_score), 1),
             "rating": rating,
             "analysis_mode": self.analysis_mode,
             "details": {
                 "headline_analysis": {
+                    "headline_vs_content_score": float(headline_analysis.get("headline_vs_content_score", 0)),
                     "flagged_phrases": headline_analysis.get("flagged_phrases", [])
                 },
                 "sentiment_analysis": {
+                    "sentiment": str(sentiment_analysis.get("sentiment", "Neutral")),
+                    "manipulation_score": float(sentiment_analysis.get("manipulation_score", 0)),
                     "flagged_phrases": sentiment_analysis.get("flagged_phrases", [])
                 },
                 "bias_analysis": {
+                    "bias": str(bias_analysis.get("bias", "Neutral")),
+                    "bias_score": float(bias_analysis.get("bias_score", 0)),
+                    "bias_percentage": float(bias_analysis.get("bias_percentage", 0)),
                     "flagged_phrases": bias_analysis.get("flagged_phrases", [])
                 },
                 "evidence_analysis": {
+                    "evidence_based_score": float(evidence_analysis.get("evidence_based_score", 0)),
                     "flagged_phrases": evidence_analysis.get("flagged_phrases", [])
                 }
             }
         }
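Because ModelRegistry caches its single instance at class level, constructing several MediaScorer objects (or re-creating one per request) only pays the pipeline-loading cost once. A usage sketch; the import path is inferred from the file path above and the asserts assume the shared models load successfully:

# Usage sketch: repeated construction reuses the shared pipelines (import path inferred).
from mediaunmasked.analyzers.scoring import MediaScorer, ModelRegistry

scorer_a = MediaScorer(use_ai=True)   # first construction loads the shared pipelines
scorer_b = MediaScorer(use_ai=True)   # later constructions reuse the same registry

assert ModelRegistry() is ModelRegistry()                    # __new__ always returns the same object
assert scorer_a.model_registry is scorer_b.model_registry    # analyzers share one set of pipelines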
mediaunmasked/analyzers/sentiment_analyzer.py
CHANGED
@@ -1,5 +1,5 @@
 import logging
-from typing import Dict, Any, List
+from typing import Dict, Any, List, Optional
 from textblob import TextBlob
 from transformers import pipeline
 import numpy as np
@@ -7,15 +7,18 @@ import numpy as np
 logger = logging.getLogger(__name__)
 
 class SentimentAnalyzer:
-    def __init__(self, use_ai: bool = True):
+    def __init__(self, use_ai: bool = True, model_registry: Optional[Any] = None):
         """
         Initialize sentiment analyzer with both traditional and LLM-based approaches.
         
         Args:
             use_ai: Boolean indicating whether to use AI-powered analysis (True) or traditional analysis (False)
+            model_registry: Optional shared model registry for better performance
         """
         self.use_ai = use_ai
         self.llm_available = False
+        self.model_registry = model_registry
+        self.toxicity_available = False
         
         # Traditional manipulation patterns
         self.manipulative_patterns = [
@@ -32,24 +35,47 @@
         
         if use_ai:
             try:
+                if model_registry and model_registry.is_available:
+                    # Use shared models
+                    self.sentiment_pipeline = model_registry.sentiment
+                    self.zero_shot = model_registry.zero_shot
+                    self.toxicity_pipeline = getattr(model_registry, 'toxicity', None)
+                    self.toxicity_available = self.toxicity_pipeline is not None
+                    self.llm_available = True
+                    logger.info("Using shared model pipelines for sentiment analysis")
+                    if self.toxicity_available:
+                        logger.info("Toxicity analysis enabled")
+                    else:
+                        logger.info("Toxicity analysis not available")
+                else:
+                    # Initialize own pipelines
+                    self.sentiment_pipeline = pipeline(
+                        "text-classification",
+                        model="SamLowe/roberta-base-go_emotions",
+                        device=-1,
+                        batch_size=16
+                    )
+                    self.zero_shot = pipeline(
+                        "zero-shot-classification",
+                        model="facebook/bart-large-mnli",
+                        device=-1,
+                        batch_size=8
+                    )
+                    try:
+                        self.toxicity_pipeline = pipeline(
+                            "text-classification",
+                            model="unitary/toxic-bert",
+                            device=-1,
+                            batch_size=16
+                        )
+                        self.toxicity_available = True
+                        logger.info("Toxicity analysis enabled")
+                    except Exception as tox_error:
+                        logger.warning(f"Toxicity pipeline initialization failed: {str(tox_error)}")
+                        self.toxicity_available = False
+                    
+                    self.llm_available = True
+                    logger.info("Initialized dedicated model pipelines for sentiment analysis")
             except Exception as e:
                 logger.warning(f"Failed to initialize LLM pipelines: {str(e)}")
                 self.llm_available = False
@@ -107,18 +133,22 @@
                 sentiment_scores.append(emotions)
                 logger.debug(f"Processed emotion scores: {sentiment_scores}")
                 
-                # Get toxicity scores
+                # Get toxicity scores if available
+                if self.toxicity_available:
+                    logger.debug(f"Analyzing toxicity for chunk {i}")
+                    try:
+                        toxicity = self.toxicity_pipeline(chunk)
+                        if isinstance(toxicity, list):
+                            toxicity_scores.extend(toxicity)
+                        else:
+                            toxicity_scores.append(toxicity)
+                        logger.debug(f"Processed toxicity scores: {toxicity_scores}")
+                    except Exception as tox_error:
+                        logger.warning(f"Toxicity analysis failed for chunk {i}: {str(tox_error)}")
                 
                 # Get manipulation scores
                 logger.debug(f"Analyzing manipulation for chunk {i}")
+                manipulation = self.zero_shot(
                     chunk,
                     manipulation_categories,
                     multi_label=True
@@ -135,7 +165,7 @@
                 sentences = chunk.split('.')
                 for sentence in sentences:
                     if len(sentence.strip()) > 10:
+                        sent_result = self.zero_shot(
                             sentence.strip(),
                             manipulation_categories,
                             multi_label=False
@@ -157,6 +187,10 @@
             # Aggregate scores with error handling
             def aggregate_scores(scores_list, score_type: str):
                 try:
+                    if not scores_list:
+                        logger.warning(f"No {score_type} scores to aggregate")
+                        return {}
+                    
                     all_scores = {}
                     for scores in scores_list:
                         if isinstance(scores, dict):
@@ -181,23 +215,24 @@
                             if isinstance(score, (int, float)):
                                 all_scores[label].append(score)
                    
-                    return {k: np.mean(v) for k, v in all_scores.items() if v}
+                    return {k: float(np.mean(v)) for k, v in all_scores.items() if v}
                 except Exception as agg_error:
                     logger.error(f"Error aggregating {score_type} scores: {str(agg_error)}")
                     return {}
            
             emotion_scores = aggregate_scores(sentiment_scores, "emotion")
-            toxicity_scores = aggregate_scores(toxicity_scores, "toxicity")
+            toxicity_scores = aggregate_scores(toxicity_scores, "toxicity") if self.toxicity_available else {}
             logger.debug(f"Aggregated emotion scores: {emotion_scores}")
             logger.debug(f"Aggregated toxicity scores: {toxicity_scores}")
            
             # Aggregate manipulation scores
             manipulation_agg = {
-                category: np.mean([
+                category: float(np.mean([
                     scores.get(category, 0)
                     for scores in manipulation_scores
-                ])
+                ]))
                 for category in manipulation_categories
+                if manipulation_scores  # Only process if we have scores
             }
             logger.debug(f"Aggregated manipulation scores: {manipulation_agg}")
            
@@ -205,18 +240,28 @@
             manipulation_indicators = {
                 'emotional manipulation': 0.4,
                 'fear mongering': 0.3,
-                'propaganda': 0.3,
-                'toxic': 0.2,
-                'severe_toxic': 0.3,
-                'threat': 0.2
+                'propaganda': 0.3
             }
            
+            if self.toxicity_available:
+                manipulation_indicators.update({
+                    'toxic': 0.2,
+                    'severe_toxic': 0.3,
+                    'threat': 0.2
+                })
+            
             # Combine toxicity and manipulation scores
             combined_scores = {**toxicity_scores, **manipulation_agg}
+            
+            # Calculate manipulation score with fallback
+            if combined_scores:
+                manipulation_score = min(100, sum(
+                    combined_scores.get(k, 0) * weight
+                    for k, weight in manipulation_indicators.items()
+                ) * 100)
+            else:
+                # Fallback to traditional analysis if no scores available
+                manipulation_score = len(self._detect_manipulative_phrases(text)) * 10
            
             logger.info(f"Final manipulation score: {manipulation_score}")
            
@@ -258,7 +303,7 @@
            
             return {
                 "sentiment": sentiment,
-                "manipulation_score": manipulation_score,
+                "manipulation_score": round(manipulation_score, 1),
                 "flagged_phrases": unique_phrases,
                 "detailed_scores": {
                     "emotions": emotion_scores,
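The manipulation score is a capped, weighted sum over the zero-shot manipulation categories plus, when toxic-bert is available, the toxicity labels. A short worked example with assumed category scores:

# Worked example of the weighted manipulation score from SentimentAnalyzer (scores assumed).
combined_scores = {"emotional manipulation": 0.50, "fear mongering": 0.20,
                   "propaganda": 0.10, "toxic": 0.10}
manipulation_indicators = {"emotional manipulation": 0.4, "fear mongering": 0.3,
                           "propaganda": 0.3, "toxic": 0.2,
                           "severe_toxic": 0.3, "threat": 0.2}

manipulation_score = min(100, sum(
    combined_scores.get(k, 0) * weight
    for k, weight in manipulation_indicators.items()
) * 100)
print(round(manipulation_score, 1))  # 31.0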