wozwize committed
Commit a2624a3 · 1 Parent(s): 5c3b4a6

Increasing performance in AI mode by implementing singletons

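The change replaces per-analyzer `pipeline(...)` construction with a shared `ModelRegistry` singleton (added in `scoring.py` below), so each Hugging Face model is loaded once per process and reused by every analyzer. A minimal usage sketch under that assumption (illustrative only; package path and attribute names follow this diff):

# Sketch: assumes mediaunmasked.analyzers.scoring exports MediaScorer and ModelRegistry as in this diff.
from mediaunmasked.analyzers.scoring import MediaScorer, ModelRegistry

scorer_a = MediaScorer(use_ai=True)   # first construction loads the shared zero-shot/NLI/sentiment/toxicity pipelines
scorer_b = MediaScorer(use_ai=True)   # reuses the already-initialized singleton instead of reloading the models

# ModelRegistry.__new__ always returns the same instance, so all analyzers share one set of pipelines.
assert ModelRegistry() is scorer_a.model_registry is scorer_b.model_registry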
mediaunmasked/analyzers/bias_analyzer.py CHANGED
@@ -1,21 +1,25 @@
 import logging
 import os
-from typing import Dict, Any, List
+from typing import Dict, Any, List, Optional
 from transformers import pipeline
 import numpy as np
+import nltk
+from nltk.tokenize import sent_tokenize

 logger = logging.getLogger(__name__)

 class BiasAnalyzer:
-    def __init__(self, use_ai: bool = True):
+    def __init__(self, use_ai: bool = True, model_registry: Optional[Any] = None):
         """
         Initialize bias analyzer with both LLM and traditional approaches.

         Args:
             use_ai: Boolean indicating whether to use AI-powered analysis (True) or traditional analysis (False)
+            model_registry: Optional shared model registry for better performance
         """
         self.use_ai = use_ai
         self.llm_available = False
+        self.model_registry = model_registry

         # Load traditional keywords
         self.resources_dir = os.path.join(os.path.dirname(__file__), '..', 'resources')
@@ -24,14 +28,20 @@ class BiasAnalyzer:

         if use_ai:
             try:
-                # Initialize LLM pipeline for zero-shot classification
-                self.classifier = pipeline(
-                    "zero-shot-classification",
-                    model="facebook/bart-large-mnli",
-                    device=-1  # Use CPU, change to specific GPU index if available
-                )
-                self.llm_available = True
-                logger.info("LLM pipeline initialized successfully for bias analysis")
+                if model_registry and model_registry.is_available:
+                    self.classifier = model_registry.zero_shot
+                    self.llm_available = True
+                    logger.info("Using shared model pipeline for bias analysis")
+                else:
+                    # Initialize own pipeline if no shared registry
+                    self.classifier = pipeline(
+                        "zero-shot-classification",
+                        model="facebook/bart-large-mnli",
+                        device=-1,
+                        batch_size=8
+                    )
+                    self.llm_available = True
+                    logger.info("Initialized dedicated model pipeline for bias analysis")
             except Exception as e:
                 logger.warning(f"Failed to initialize LLM pipeline: {str(e)}")
                 self.llm_available = False
@@ -128,25 +138,30 @@ class BiasAnalyzer:
         }

     def _analyze_with_llm(self, text: str) -> Dict[str, Any]:
-        """Analyze bias using LLM zero-shot classification."""
+        """Analyze bias using LLM zero-shot classification with batch processing."""
         try:
-            # Define bias categories to check against
+            # Define bias categories
            bias_categories = [
                "left-wing bias",
                "right-wing bias",
                "neutral/balanced perspective"
            ]

-            # Split text into manageable chunks (2000 chars each)
-            chunks = [text[i:i+2000] for i in range(0, len(text), 2000)]
+            # Clean and prepare text
+            cleaned_text = text.replace('$!/$', '').replace('##', '').replace('#', '')
+            cleaned_text = '\n'.join(line for line in cleaned_text.split('\n')
+                                     if not line.startswith('[') and not line.startswith('More on'))
+
+            # Split into larger chunks (4000 chars) for fewer API calls
+            chunks = [cleaned_text[i:i+4000] for i in range(0, len(cleaned_text), 4000)]

-            # Analyze each chunk
+            # Process chunks in batches
            chunk_scores = []
            flagged_phrases = []

            for chunk in chunks:
-                # Perform zero-shot classification
-                result = self.classifier(
+                # Analyze chunk as a whole first
+                chunk_result = self.classifier(
                    chunk,
                    bias_categories,
                    multi_label=True
@@ -154,21 +169,43 @@ class BiasAnalyzer:

                chunk_scores.append({
                    label: score
-                    for label, score in zip(result['labels'], result['scores'])
+                    for label, score in zip(chunk_result['labels'], chunk_result['scores'])
                })

-                # Identify strongly biased phrases
-                sentences = chunk.split('.')
-                for sentence in sentences:
-                    if len(sentence.strip()) > 10:  # Ignore very short sentences
-                        sentence_result = self.classifier(
-                            sentence.strip(),
-                            bias_categories,
-                            multi_label=False
-                        )
-                        max_score = max(sentence_result['scores'])
-                        if max_score > 0.8 and sentence_result['labels'][0] != "neutral/balanced perspective":
-                            flagged_phrases.append(sentence.strip())
+                # Only analyze individual sentences if chunk shows strong bias
+                max_chunk_score = max(chunk_result['scores'])
+                if max_chunk_score > 0.6:
+                    sentences = sent_tokenize(chunk)
+                    # Filter sentences for analysis (longer, potentially more meaningful ones)
+                    relevant_sentences = [s.strip() for s in sentences if len(s.strip()) > 20]
+
+                    # Process sentences in batches of 8
+                    for i in range(0, len(relevant_sentences), 8):
+                        batch = relevant_sentences[i:i+8]
+                        try:
+                            batch_results = self.classifier(
+                                batch,
+                                bias_categories,
+                                multi_label=False
+                            )
+
+                            # Handle single or multiple results
+                            if not isinstance(batch_results, list):
+                                batch_results = [batch_results]
+
+                            for sentence, result in zip(batch, batch_results):
+                                max_score = max(result['scores'])
+                                if max_score > 0.8 and result['labels'][0] != "neutral/balanced perspective":
+                                    flagged_phrases.append({
+                                        "text": sentence,
+                                        "type": result['labels'][0],
+                                        "score": max_score,
+                                        "highlight": f"[{result['labels'][0].upper()}] (Score: {round(max_score * 100, 1)}%) \"{sentence}\""
+                                    })
+
+                        except Exception as batch_error:
+                            logger.warning(f"Batch processing error: {str(batch_error)}")
+                            continue

            # Aggregate scores across chunks
            aggregated_scores = {
@@ -184,7 +221,7 @@ class BiasAnalyzer:
            right_score = aggregated_scores["right-wing bias"]
            neutral_score = aggregated_scores["neutral/balanced perspective"]

-            # Calculate bias score (-1 to 1, where negative is left and positive is right)
+            # Calculate bias score (-1 to 1)
            bias_score = (right_score - left_score) / max(right_score + left_score, 0.0001)

            # Determine bias label
@@ -206,11 +243,23 @@ class BiasAnalyzer:
            # Calculate bias percentage (0-100)
            bias_percentage = min(100, abs(bias_score * 100))

+            # Sort and limit flagged phrases
+            sorted_phrases = sorted(flagged_phrases, key=lambda x: x['score'], reverse=True)
+            unique_phrases = []
+            seen = set()
+
+            for phrase in sorted_phrases:
+                if phrase['text'] not in seen:
+                    unique_phrases.append(phrase)
+                    seen.add(phrase['text'])
+                    if len(unique_phrases) >= 5:
+                        break
+
            return {
                "bias": bias,
                "bias_score": round(bias_score, 2),
                "bias_percentage": round(bias_percentage, 1),
-                "flagged_phrases": list(set(flagged_phrases))[:5],  # Limit to top 5 unique phrases
+                "flagged_phrases": unique_phrases,
                "detailed_scores": {
                    "left_bias": round(left_score * 100, 1),
                    "right_bias": round(right_score * 100, 1),
mediaunmasked/analyzers/evidence_analyzer.py CHANGED
@@ -1,5 +1,5 @@
 import logging
-from typing import Dict, Any, List
+from typing import Dict, Any, List, Optional
 from transformers import pipeline
 import numpy as np
 import nltk
@@ -8,26 +8,35 @@ from nltk.tokenize import sent_tokenize
 logger = logging.getLogger(__name__)

 class EvidenceAnalyzer:
-    def __init__(self, use_ai: bool = True):
+    def __init__(self, use_ai: bool = True, model_registry: Optional[Any] = None):
         """
         Initialize evidence analyzer with LLM and traditional approaches.

         Args:
             use_ai: Boolean indicating whether to use AI-powered analysis (True) or traditional analysis (False)
+            model_registry: Optional shared model registry for better performance
         """
         self.use_ai = use_ai
         self.llm_available = False
+        self.model_registry = model_registry

         if use_ai:
             try:
-                # Zero-shot classifier for evidence analysis
-                self.classifier = pipeline(
-                    "zero-shot-classification",
-                    model="facebook/bart-large-mnli",
-                    device=-1
-                )
-                self.llm_available = True
-                logger.info("LLM pipeline initialized successfully for evidence analysis")
+                if model_registry and model_registry.is_available:
+                    # Use shared models
+                    self.classifier = model_registry.zero_shot
+                    self.llm_available = True
+                    logger.info("Using shared model pipeline for evidence analysis")
+                else:
+                    # Initialize own pipeline
+                    self.classifier = pipeline(
+                        "zero-shot-classification",
+                        model="facebook/bart-large-mnli",
+                        device=-1,
+                        batch_size=8
+                    )
+                    self.llm_available = True
+                    logger.info("Initialized dedicated model pipeline for evidence analysis")
             except Exception as e:
                 logger.warning(f"Failed to initialize LLM pipeline: {str(e)}")
                 self.llm_available = False
mediaunmasked/analyzers/headline_analyzer.py CHANGED
@@ -1,5 +1,5 @@
 import logging
-from typing import Dict, Any, List
+from typing import Dict, Any, List, Optional
 from transformers import pipeline, AutoTokenizer
 import numpy as np
 import nltk
@@ -8,32 +8,46 @@ from nltk.tokenize import sent_tokenize
 logger = logging.getLogger(__name__)

 class HeadlineAnalyzer:
-    def __init__(self, use_ai: bool = True):
+    def __init__(self, use_ai: bool = True, model_registry: Optional[Any] = None):
         """
         Initialize the analyzers for headline analysis.

         Args:
             use_ai: Boolean indicating whether to use AI-powered analysis (True) or traditional analysis (False)
+            model_registry: Optional shared model registry for better performance
         """
         self.use_ai = use_ai
         self.llm_available = False
+        self.model_registry = model_registry

         if use_ai:
             try:
-                # NLI model for contradiction/entailment
-                self.nli_pipeline = pipeline("text-classification", model="roberta-large-mnli")
+                if model_registry and model_registry.is_available:
+                    # Use shared models
+                    self.nli_pipeline = model_registry.nli
+                    self.zero_shot = model_registry.zero_shot
+                    self.tokenizer = AutoTokenizer.from_pretrained("roberta-large-mnli")
+                    self.llm_available = True
+                    logger.info("Using shared model pipelines for headline analysis")
+                else:
+                    # Initialize own pipelines
+                    self.nli_pipeline = pipeline(
+                        "text-classification",
+                        model="roberta-large-mnli",
+                        batch_size=16
+                    )
+                    self.zero_shot = pipeline(
+                        "zero-shot-classification",
+                        model="facebook/bart-large-mnli",
+                        device=-1,
+                        batch_size=8
+                    )
+                    self.tokenizer = AutoTokenizer.from_pretrained("roberta-large-mnli")
+                    self.llm_available = True
+                    logger.info("Initialized dedicated model pipelines for headline analysis")

-                # Zero-shot classifier for clickbait and sensationalism
-                self.zero_shot = pipeline(
-                    "zero-shot-classification",
-                    model="facebook/bart-large-mnli",
-                    device=-1
-                )
-
-                self.tokenizer = AutoTokenizer.from_pretrained("roberta-large-mnli")
                 self.max_length = 512
-                self.llm_available = True
-                logger.info("LLM pipelines initialized successfully for headline analysis")
+
             except Exception as e:
                 logger.warning(f"Failed to initialize LLM pipelines: {str(e)}")
                 self.llm_available = False
@@ -51,24 +65,17 @@ class HeadlineAnalyzer:
         sep_tokens = len(self.tokenizer.encode("[SEP]")) - 2
         max_content_tokens = self.max_length - headline_tokens - sep_tokens

-        # Process words into sections
+        # Process words into sections with 4000 character chunks
+        current_text = ""
         for word in content_words:
-            current_section.append(word)
-
-            # Check if current section is approaching token limit
-            current_text = " ".join(current_section)
-            if len(self.tokenizer.encode(current_text)) >= max_content_tokens:
-                current_section.pop()
-                sections.append(" ".join(current_section))
-
-                # Start new section with 20% overlap for context
-                overlap_start = max(0, len(current_section) - int(len(current_section) * 0.2))
-                current_section = current_section[overlap_start:]
-                current_section.append(word)
+            if len(current_text) + len(word) + 1 <= 4000:
+                current_text += " " + word
+            else:
+                sections.append(current_text.strip())
+                current_text = word

-        # Add any remaining content
-        if current_section:
-            sections.append(" ".join(current_section))
+        if current_text:
+            sections.append(current_text.strip())

         return sections

@@ -82,10 +89,16 @@ class HeadlineAnalyzer:
                 nltk.download('punkt')

             sentences = sent_tokenize(section)
-
-            # Analyze headline against content for contradiction/entailment
-            nli_scores = []
-            flagged_phrases = []
+            if not sentences:
+                logger.warning("No sentences found in section")
+                return {
+                    "accuracy_score": 50.0,  # Neutral score
+                    "flagged_phrases": [],
+                    "detailed_scores": {
+                        "nli": {"ENTAILMENT": 0.0, "CONTRADICTION": 0.0, "NEUTRAL": 1.0},
+                        "sensationalism": {"factual reporting": 0.5, "accurate headline": 0.5}
+                    }
+                }

             # Categories for sensationalism check
             sensationalism_categories = [
@@ -108,44 +121,96 @@ class HeadlineAnalyzer:
                 for label, score in zip(sensationalism_result['labels'], sensationalism_result['scores'])
             }

-            # Analyze each sentence for contradiction/support
-            for sentence in sentences:
-                if len(sentence.strip()) > 10:
-                    # Check for contradiction/entailment
-                    input_text = f"{headline} [SEP] {sentence}"
-                    nli_result = self.nli_pipeline(input_text, top_k=None)
-                    scores = {item['label']: item['score'] for item in nli_result}
-                    nli_scores.append(scores)
-
-                    # Flag contradictory or highly sensationalized content
-                    if scores.get('CONTRADICTION', 0) > 0.4:
-                        flagged_phrases.append({
-                            'text': sentence.strip(),
-                            'type': 'contradiction',
-                            'score': scores['CONTRADICTION']
-                        })
+            # Filter relevant sentences (longer than 20 chars)
+            relevant_sentences = [s.strip() for s in sentences if len(s.strip()) > 20]

-            # Calculate aggregate scores
-            avg_scores = {
-                label: np.mean([score[label] for score in nli_scores])
-                for label in ['ENTAILMENT', 'CONTRADICTION', 'NEUTRAL']
-            }
+            if not relevant_sentences:
+                logger.warning("No relevant sentences found in section")
+                return {
+                    "accuracy_score": 50.0,  # Neutral score
+                    "flagged_phrases": [],
+                    "detailed_scores": {
+                        "nli": {"ENTAILMENT": 0.0, "CONTRADICTION": 0.0, "NEUTRAL": 1.0},
+                        "sensationalism": sensationalism_scores
+                    }
+                }

-            # Calculate headline accuracy score
-            accuracy_components = {
-                'entailment': avg_scores['ENTAILMENT'] * 0.4,
-                'non_contradiction': (1 - avg_scores['CONTRADICTION']) * 0.3,
-                'non_sensational': (
-                    sensationalism_scores.get('factual reporting', 0) +
-                    sensationalism_scores.get('accurate headline', 0)
-                ) * 0.15,
-                'non_clickbait': (
-                    1 - sensationalism_scores.get('clickbait', 0) -
-                    sensationalism_scores.get('sensationalized', 0)
-                ) * 0.15
-            }
+            # Process sentences in batches for contradiction/support
+            nli_scores = []
+            flagged_phrases = []
+            batch_size = 8
+
+            for i in range(0, len(relevant_sentences), batch_size):
+                batch = relevant_sentences[i:i+batch_size]
+                batch_inputs = [f"{headline} [SEP] {sentence}" for sentence in batch]
+
+                try:
+                    # Get NLI scores for batch
+                    batch_results = self.nli_pipeline(batch_inputs, top_k=None)
+                    if not isinstance(batch_results, list):
+                        batch_results = [batch_results]
+
+                    for sentence, result in zip(batch, batch_results):
+                        scores = {item['label']: item['score'] for item in result}
+                        nli_scores.append(scores)
+
+                        # Flag contradictory content
+                        if scores.get('CONTRADICTION', 0) > 0.4:
+                            flagged_phrases.append({
+                                'text': sentence,
+                                'type': 'Contradiction',
+                                'score': scores['CONTRADICTION'],
+                                'highlight': f"[CONTRADICTION] (Score: {round(scores['CONTRADICTION'] * 100, 1)}%) \"{sentence}\""
+                            })
+
+                except Exception as batch_error:
+                    logger.warning(f"Batch processing error: {str(batch_error)}")
+                    continue
+
+            # Calculate aggregate scores with validation
+            if not nli_scores:
+                logger.warning("No NLI scores available")
+                avg_scores = {"ENTAILMENT": 0.0, "CONTRADICTION": 0.0, "NEUTRAL": 1.0}
+            else:
+                try:
+                    avg_scores = {
+                        label: float(np.mean([
+                            score.get(label, 0.0)
+                            for score in nli_scores
+                        ]))
+                        for label in ['ENTAILMENT', 'CONTRADICTION', 'NEUTRAL']
+                    }
+                except Exception as agg_error:
+                    logger.error(f"Error aggregating NLI scores: {str(agg_error)}")
+                    avg_scores = {"ENTAILMENT": 0.0, "CONTRADICTION": 0.0, "NEUTRAL": 1.0}

-            accuracy_score = sum(accuracy_components.values()) * 100
+            # Calculate headline accuracy score with validation
+            try:
+                accuracy_components = {
+                    'entailment': avg_scores.get('ENTAILMENT', 0.0) * 0.4,
+                    'non_contradiction': (1 - avg_scores.get('CONTRADICTION', 0.0)) * 0.3,
+                    'non_sensational': (
+                        sensationalism_scores.get('factual reporting', 0.0) +
+                        sensationalism_scores.get('accurate headline', 0.0)
+                    ) * 0.15,
+                    'non_clickbait': (
+                        1 - sensationalism_scores.get('clickbait', 0.0) -
+                        sensationalism_scores.get('sensationalized', 0.0)
+                    ) * 0.15
+                }
+
+                accuracy_score = sum(accuracy_components.values()) * 100
+
+                # Validate final score
+                if np.isnan(accuracy_score) or not np.isfinite(accuracy_score):
+                    logger.warning("Invalid accuracy score calculated, using default")
+                    accuracy_score = 50.0
+                else:
+                    accuracy_score = float(accuracy_score)
+
+            except Exception as score_error:
+                logger.error(f"Error calculating accuracy score: {str(score_error)}")
+                accuracy_score = 50.0

             # Sort and limit flagged phrases
             sorted_phrases = sorted(
@@ -153,11 +218,19 @@ class HeadlineAnalyzer:
                 key=lambda x: x['score'],
                 reverse=True
             )
-            top_phrases = [phrase['text'] for phrase in sorted_phrases[:5]]
+            unique_phrases = []
+            seen = set()
+
+            for phrase in sorted_phrases:
+                if phrase['text'] not in seen:
+                    unique_phrases.append(phrase)
+                    seen.add(phrase['text'])
+                    if len(unique_phrases) >= 5:
+                        break

             return {
                 "accuracy_score": accuracy_score,
-                "flagged_phrases": top_phrases,
+                "flagged_phrases": unique_phrases,
                 "detailed_scores": {
                     "nli": avg_scores,
                     "sensationalism": sensationalism_scores
@@ -167,9 +240,12 @@ class HeadlineAnalyzer:
         except Exception as e:
             logger.error(f"Section analysis failed: {str(e)}")
             return {
-                "accuracy_score": 0,
+                "accuracy_score": 50.0,  # Neutral score for errors
                 "flagged_phrases": [],
-                "detailed_scores": {}
+                "detailed_scores": {
+                    "nli": {"ENTAILMENT": 0.0, "CONTRADICTION": 0.0, "NEUTRAL": 1.0},
+                    "sensationalism": {}
+                }
             }

     def _analyze_traditional(self, headline: str, content: str) -> Dict[str, Any]:
@@ -266,13 +342,23 @@ class HeadlineAnalyzer:
         accuracy_scores = [r['accuracy_score'] for r in section_results]
         final_score = np.mean(accuracy_scores)

-        # Combine flagged phrases from all sections
+        # Combine and deduplicate flagged phrases
         all_phrases = []
         for result in section_results:
-            all_phrases.extend(result['flagged_phrases'])
+            if 'flagged_phrases' in result:
+                all_phrases.extend(result['flagged_phrases'])
+
+        # Sort by score and get unique phrases
+        sorted_phrases = sorted(all_phrases, key=lambda x: x['score'], reverse=True)
+        unique_phrases = []
+        seen = set()

-        # Remove duplicates and limit to top 5
-        unique_phrases = list(dict.fromkeys(all_phrases))[:5]
+        for phrase in sorted_phrases:
+            if phrase['text'] not in seen:
+                unique_phrases.append(phrase)
+                seen.add(phrase['text'])
+                if len(unique_phrases) >= 5:
+                    break

         return {
             "headline_vs_content_score": round(final_score, 1),
mediaunmasked/analyzers/scoring.py CHANGED
@@ -1,5 +1,8 @@
 from typing import Dict, Any, Literal
 import logging
+from transformers import pipeline
+import torch
+import numpy as np

 from .headline_analyzer import HeadlineAnalyzer
 from .sentiment_analyzer import SentimentAnalyzer
@@ -11,6 +14,64 @@ logger = logging.getLogger(__name__)
 # Define analysis mode type
 AnalysisMode = Literal['ai', 'traditional']

+class ModelRegistry:
+    """Singleton class to manage shared model pipelines."""
+    _instance = None
+    _initialized = False
+
+    def __new__(cls):
+        if cls._instance is None:
+            cls._instance = super(ModelRegistry, cls).__new__(cls)
+        return cls._instance
+
+    def __init__(self):
+        if not self._initialized:
+            try:
+                # Use GPU if available
+                self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+                logger.info(f"Using device: {self.device}")
+
+                # Initialize shared models with larger batch sizes
+                self.zero_shot = pipeline(
+                    "zero-shot-classification",
+                    model="facebook/bart-large-mnli",
+                    device=self.device,
+                    batch_size=8
+                )
+
+                self.sentiment = pipeline(
+                    "text-classification",
+                    model="SamLowe/roberta-base-go_emotions",
+                    device=self.device,
+                    batch_size=16
+                )
+
+                self.nli = pipeline(
+                    "text-classification",
+                    model="roberta-large-mnli",
+                    device=self.device,
+                    batch_size=16
+                )
+
+                # Add toxicity pipeline
+                self.toxicity = pipeline(
+                    "text-classification",
+                    model="unitary/toxic-bert",
+                    device=self.device,
+                    batch_size=16
+                )
+
+                logger.info("Successfully initialized shared model pipelines")
+                self._initialized = True
+
+            except Exception as e:
+                logger.error(f"Failed to initialize shared models: {str(e)}")
+                self._initialized = False
+
+    @property
+    def is_available(self):
+        return self._initialized
+
 class MediaScorer:
     def __init__(self, use_ai: bool = True):
         """
@@ -23,11 +84,31 @@ class MediaScorer:
         self.analysis_mode: AnalysisMode = 'ai' if use_ai else 'traditional'
         logger.info(f"Initializing MediaScorer with {self.analysis_mode} analysis")

-        # Initialize analyzers with analysis mode preference
-        self.headline_analyzer = HeadlineAnalyzer(use_ai=use_ai)
-        self.sentiment_analyzer = SentimentAnalyzer(use_ai=use_ai)
-        self.bias_analyzer = BiasAnalyzer(use_ai=use_ai)
-        self.evidence_analyzer = EvidenceAnalyzer(use_ai=use_ai)
+        # Initialize shared model registry if using AI
+        if use_ai:
+            self.model_registry = ModelRegistry()
+            if not self.model_registry.is_available:
+                logger.warning("Shared models not available, falling back to traditional analysis")
+                self.use_ai = False
+                self.analysis_mode = 'traditional'
+
+        # Initialize analyzers with analysis mode preference and shared models
+        self.headline_analyzer = HeadlineAnalyzer(
+            use_ai=self.use_ai,
+            model_registry=self.model_registry if self.use_ai else None
+        )
+        self.sentiment_analyzer = SentimentAnalyzer(
+            use_ai=self.use_ai,
+            model_registry=self.model_registry if self.use_ai else None
+        )
+        self.bias_analyzer = BiasAnalyzer(
+            use_ai=self.use_ai,
+            model_registry=self.model_registry if self.use_ai else None
+        )
+        self.evidence_analyzer = EvidenceAnalyzer(
+            use_ai=self.use_ai,
+            model_registry=self.model_registry if self.use_ai else None
+        )

         logger.info(f"All analyzers initialized in {self.analysis_mode} mode")

@@ -53,32 +134,58 @@ class MediaScorer:
         """)
         logger.info(f"Evidence Analysis: {evidence_analysis}")

-        # Calculate component scores
+        # Calculate component scores with NaN handling
         # For headline: 20% contradiction = 20% score (don't invert)
-        headline_score = headline_analysis["headline_vs_content_score"] / 100
+        headline_score = headline_analysis.get("headline_vs_content_score", 0)
+        if isinstance(headline_score, (int, float)) and not np.isnan(headline_score):
+            headline_score = headline_score / 100
+        else:
+            headline_score = 0.5  # Default to neutral if score is invalid
+            logger.warning("Invalid headline score, using default value of 0.5")

         # For manipulation: 0% = good (use directly), 100% = bad
-        manipulation_score = (100 - sentiment_analysis["manipulation_score"]) / 100
+        manipulation_score = sentiment_analysis.get("manipulation_score", 0)
+        if isinstance(manipulation_score, (int, float)) and not np.isnan(manipulation_score):
+            manipulation_score = (100 - manipulation_score) / 100
+        else:
+            manipulation_score = 0.5
+            logger.warning("Invalid manipulation score, using default value of 0.5")

         # For bias: 0% = good (use directly), 100% = bad
-        bias_score = (100 - bias_analysis["bias_percentage"]) / 100
+        bias_percentage = bias_analysis.get("bias_percentage", 0)
+        if isinstance(bias_percentage, (int, float)) and not np.isnan(bias_percentage):
+            bias_score = (100 - bias_percentage) / 100
+        else:
+            bias_score = 0.5
+            logger.warning("Invalid bias score, using default value of 0.5")

-        evidence_score = evidence_analysis["evidence_based_score"] / 100  # Higher is better
+        # For evidence: higher is better
+        evidence_score = evidence_analysis.get("evidence_based_score", 0)
+        if isinstance(evidence_score, (int, float)) and not np.isnan(evidence_score):
+            evidence_score = evidence_score / 100
+        else:
+            evidence_score = 0.5
+            logger.warning("Invalid evidence score, using default value of 0.5")

         logger.info(f"""Component Scores:
-            Headline: {headline_score * 100:.1f}% (from {headline_analysis["headline_vs_content_score"]}%)
+            Headline: {headline_score * 100:.1f}% (from {headline_analysis.get("headline_vs_content_score", 0)})
             Evidence: {evidence_score * 100:.1f}%
-            Manipulation: {manipulation_score * 100:.1f}% (100 - {sentiment_analysis["manipulation_score"]}%)
-            Bias: {bias_score * 100:.1f}% (100 - {bias_analysis["bias_percentage"]}%)
+            Manipulation: {manipulation_score * 100:.1f}% (100 - {sentiment_analysis.get("manipulation_score", 0)}%)
+            Bias: {bias_score * 100:.1f}% (100 - {bias_analysis.get("bias_percentage", 0)}%)
         """)

         # Calculate final score
-        final_score = (
+        final_score = float((
             (headline_score * 0.25) +
             (manipulation_score * 0.25) +
             (bias_score * 0.25) +
             (evidence_score * 0.25)
-        ) * 100
+        ) * 100)
+
+        # Ensure final score is valid
+        if np.isnan(final_score) or not np.isfinite(final_score):
+            final_score = 50.0  # Default to neutral
+            logger.warning("Invalid final score calculated, using default value of 50.0")

         # Determine rating
         if final_score >= 80:
@@ -89,27 +196,27 @@ class MediaScorer:
             rating = "Misleading"

         result = {
-            "media_unmasked_score": round(final_score, 1),
+            "media_unmasked_score": round(float(final_score), 1),
             "rating": rating,
             "analysis_mode": self.analysis_mode,
             "details": {
                 "headline_analysis": {
-                    "headline_vs_content_score": headline_analysis["headline_vs_content_score"],
+                    "headline_vs_content_score": float(headline_analysis.get("headline_vs_content_score", 0)),
                     "flagged_phrases": headline_analysis.get("flagged_phrases", [])
                 },
                 "sentiment_analysis": {
-                    "sentiment": sentiment_analysis["sentiment"],
-                    "manipulation_score": sentiment_analysis["manipulation_score"],
+                    "sentiment": str(sentiment_analysis.get("sentiment", "Neutral")),
+                    "manipulation_score": float(sentiment_analysis.get("manipulation_score", 0)),
                     "flagged_phrases": sentiment_analysis.get("flagged_phrases", [])
                 },
                 "bias_analysis": {
-                    "bias": bias_analysis["bias"],
-                    "bias_score": bias_analysis["bias_score"],
-                    "bias_percentage": bias_analysis["bias_percentage"],
+                    "bias": str(bias_analysis.get("bias", "Neutral")),
+                    "bias_score": float(bias_analysis.get("bias_score", 0)),
+                    "bias_percentage": float(bias_analysis.get("bias_percentage", 0)),
                     "flagged_phrases": bias_analysis.get("flagged_phrases", [])
                 },
                 "evidence_analysis": {
-                    "evidence_based_score": evidence_analysis["evidence_based_score"],
+                    "evidence_based_score": float(evidence_analysis.get("evidence_based_score", 0)),
                     "flagged_phrases": evidence_analysis.get("flagged_phrases", [])
                 }
             }
mediaunmasked/analyzers/sentiment_analyzer.py CHANGED
@@ -1,5 +1,5 @@
 import logging
-from typing import Dict, Any, List
+from typing import Dict, Any, List, Optional
 from textblob import TextBlob
 from transformers import pipeline
 import numpy as np
@@ -7,15 +7,18 @@ import numpy as np
 logger = logging.getLogger(__name__)

 class SentimentAnalyzer:
-    def __init__(self, use_ai: bool = True):
+    def __init__(self, use_ai: bool = True, model_registry: Optional[Any] = None):
         """
         Initialize sentiment analyzer with both traditional and LLM-based approaches.

         Args:
             use_ai: Boolean indicating whether to use AI-powered analysis (True) or traditional analysis (False)
+            model_registry: Optional shared model registry for better performance
         """
         self.use_ai = use_ai
         self.llm_available = False
+        self.model_registry = model_registry
+        self.toxicity_available = False

         # Traditional manipulation patterns
         self.manipulative_patterns = [
@@ -32,24 +35,47 @@ class SentimentAnalyzer:

         if use_ai:
             try:
-                # Initialize LLM pipelines
-                self.sentiment_pipeline = pipeline(
-                    "text-classification",
-                    model="SamLowe/roberta-base-go_emotions",
-                    top_k=None
-                )
-                self.toxicity_pipeline = pipeline(
-                    "text-classification",
-                    model="martin-ha/toxic-comment-model",
-                    top_k=None
-                )
-                self.manipulation_pipeline = pipeline(
-                    "zero-shot-classification",
-                    model="facebook/bart-large-mnli",
-                    device=-1
-                )
-                self.llm_available = True
-                logger.info("LLM pipelines initialized successfully")
+                if model_registry and model_registry.is_available:
+                    # Use shared models
+                    self.sentiment_pipeline = model_registry.sentiment
+                    self.zero_shot = model_registry.zero_shot
+                    self.toxicity_pipeline = getattr(model_registry, 'toxicity', None)
+                    self.toxicity_available = self.toxicity_pipeline is not None
+                    self.llm_available = True
+                    logger.info("Using shared model pipelines for sentiment analysis")
+                    if self.toxicity_available:
+                        logger.info("Toxicity analysis enabled")
+                    else:
+                        logger.info("Toxicity analysis not available")
+                else:
+                    # Initialize own pipelines
+                    self.sentiment_pipeline = pipeline(
+                        "text-classification",
+                        model="SamLowe/roberta-base-go_emotions",
+                        device=-1,
+                        batch_size=16
+                    )
+                    self.zero_shot = pipeline(
+                        "zero-shot-classification",
+                        model="facebook/bart-large-mnli",
+                        device=-1,
+                        batch_size=8
+                    )
+                    try:
+                        self.toxicity_pipeline = pipeline(
+                            "text-classification",
+                            model="unitary/toxic-bert",
+                            device=-1,
+                            batch_size=16
+                        )
+                        self.toxicity_available = True
+                        logger.info("Toxicity analysis enabled")
+                    except Exception as tox_error:
+                        logger.warning(f"Toxicity pipeline initialization failed: {str(tox_error)}")
+                        self.toxicity_available = False
+
+                    self.llm_available = True
+                    logger.info("Initialized dedicated model pipelines for sentiment analysis")
             except Exception as e:
                 logger.warning(f"Failed to initialize LLM pipelines: {str(e)}")
                 self.llm_available = False
@@ -107,18 +133,22 @@ class SentimentAnalyzer:
                     sentiment_scores.append(emotions)
                     logger.debug(f"Processed emotion scores: {sentiment_scores}")

-                    # Get toxicity scores
-                    logger.debug(f"Analyzing toxicity for chunk {i}")
-                    toxicity = self.toxicity_pipeline(chunk)
-                    if isinstance(toxicity, list):
-                        toxicity_scores.extend(toxicity)
-                    else:
-                        toxicity_scores.append(toxicity)
-                    logger.debug(f"Processed toxicity scores: {toxicity_scores}")
+                    # Get toxicity scores if available
+                    if self.toxicity_available:
+                        logger.debug(f"Analyzing toxicity for chunk {i}")
+                        try:
+                            toxicity = self.toxicity_pipeline(chunk)
+                            if isinstance(toxicity, list):
+                                toxicity_scores.extend(toxicity)
+                            else:
+                                toxicity_scores.append(toxicity)
+                            logger.debug(f"Processed toxicity scores: {toxicity_scores}")
+                        except Exception as tox_error:
+                            logger.warning(f"Toxicity analysis failed for chunk {i}: {str(tox_error)}")

                     # Get manipulation scores
                     logger.debug(f"Analyzing manipulation for chunk {i}")
-                    manipulation = self.manipulation_pipeline(
+                    manipulation = self.zero_shot(
                         chunk,
                         manipulation_categories,
                         multi_label=True
@@ -135,7 +165,7 @@ class SentimentAnalyzer:
                     sentences = chunk.split('.')
                     for sentence in sentences:
                         if len(sentence.strip()) > 10:
-                            sent_result = self.manipulation_pipeline(
+                            sent_result = self.zero_shot(
                                 sentence.strip(),
                                 manipulation_categories,
                                 multi_label=False
@@ -157,6 +187,10 @@ class SentimentAnalyzer:
             # Aggregate scores with error handling
             def aggregate_scores(scores_list, score_type: str):
                 try:
+                    if not scores_list:
+                        logger.warning(f"No {score_type} scores to aggregate")
+                        return {}
+
                     all_scores = {}
                     for scores in scores_list:
                         if isinstance(scores, dict):
@@ -181,23 +215,24 @@ class SentimentAnalyzer:
                             if isinstance(score, (int, float)):
                                 all_scores[label].append(score)

-                    return {k: np.mean(v) for k, v in all_scores.items() if v}
+                    return {k: float(np.mean(v)) for k, v in all_scores.items() if v}
                 except Exception as agg_error:
                     logger.error(f"Error aggregating {score_type} scores: {str(agg_error)}")
                     return {}

             emotion_scores = aggregate_scores(sentiment_scores, "emotion")
-            toxicity_scores = aggregate_scores(toxicity_scores, "toxicity")
+            toxicity_scores = aggregate_scores(toxicity_scores, "toxicity") if self.toxicity_available else {}
             logger.debug(f"Aggregated emotion scores: {emotion_scores}")
             logger.debug(f"Aggregated toxicity scores: {toxicity_scores}")

             # Aggregate manipulation scores
             manipulation_agg = {
-                category: np.mean([
+                category: float(np.mean([
                     scores.get(category, 0)
                     for scores in manipulation_scores
-                ])
+                ]))
                 for category in manipulation_categories
+                if manipulation_scores  # Only process if we have scores
             }
             logger.debug(f"Aggregated manipulation scores: {manipulation_agg}")

@@ -205,18 +240,28 @@ class SentimentAnalyzer:
             manipulation_indicators = {
                 'emotional manipulation': 0.4,
                 'fear mongering': 0.3,
-                'propaganda': 0.3,
-                'toxic': 0.2,
-                'severe_toxic': 0.3,
-                'threat': 0.2
+                'propaganda': 0.3
             }

+            if self.toxicity_available:
+                manipulation_indicators.update({
+                    'toxic': 0.2,
+                    'severe_toxic': 0.3,
+                    'threat': 0.2
+                })
+
             # Combine toxicity and manipulation scores
             combined_scores = {**toxicity_scores, **manipulation_agg}
-            manipulation_score = min(100, sum(
-                combined_scores.get(k, 0) * weight
-                for k, weight in manipulation_indicators.items()
-            ) * 100)
+
+            # Calculate manipulation score with fallback
+            if combined_scores:
+                manipulation_score = min(100, sum(
+                    combined_scores.get(k, 0) * weight
+                    for k, weight in manipulation_indicators.items()
+                ) * 100)
+            else:
+                # Fallback to traditional analysis if no scores available
+                manipulation_score = len(self._detect_manipulative_phrases(text)) * 10

             logger.info(f"Final manipulation score: {manipulation_score}")

@@ -258,7 +303,7 @@ class SentimentAnalyzer:

             return {
                 "sentiment": sentiment,
-                "manipulation_score": manipulation_score,
+                "manipulation_score": round(manipulation_score, 1),
                 "flagged_phrases": unique_phrases,
                 "detailed_scores": {
                     "emotions": emotion_scores,