wozwize's picture
initial commit of media-unmasked-api to huggingface
876b12f
raw
history blame
7.49 kB
import logging
from typing import Dict, Any, List
from transformers import pipeline
from transformers import AutoTokenizer
import numpy as np
logger = logging.getLogger(__name__)
class HeadlineAnalyzer:
    """Score how well a headline matches its article content.

    Uses the roberta-large-mnli NLI model to classify the relationship
    between headline and content as ENTAILMENT / NEUTRAL / CONTRADICTION,
    then folds those probabilities into a single 0-100 consistency score.
    Long articles are split into overlapping sections that each fit the
    model's token limit and the per-section scores are aggregated.
    """

    def __init__(self):
        """Initialize the NLI model for contradiction detection."""
        self.nli_pipeline = pipeline("text-classification", model="roberta-large-mnli")
        self.tokenizer = AutoTokenizer.from_pretrained("roberta-large-mnli")
        # roberta-large-mnli's maximum input length in tokens.
        self.max_length = 512

    def _split_content(self, headline: str, content: str) -> List[str]:
        """Split content into sections that fit within the token limit.

        Consecutive sections overlap by ~20% (word-wise) so a claim that
        straddles a boundary is still seen whole by at least one section.

        Args:
            headline: The headline that will be prepended to every section.
            content: Full article text to split.

        Returns:
            List of section strings, each within the token budget.
        """
        content_words = content.split()
        sections: List[str] = []
        current_section: List[str] = []

        # Budget per section: model limit minus the headline tokens
        # (encode() already includes the special tokens) and the literal
        # "[SEP]" marker inserted between headline and section.
        headline_tokens = len(self.tokenizer.encode(headline))
        sep_tokens = len(self.tokenizer.encode("[SEP]")) - 2  # -2 because encode adds special tokens
        max_content_tokens = self.max_length - headline_tokens - sep_tokens

        # Grow a section word by word until re-encoding shows it is at the limit.
        for word in content_words:
            current_section.append(word)
            current_text = " ".join(current_section)
            if len(self.tokenizer.encode(current_text)) >= max_content_tokens:
                # Remove last word (it might make us go over limit).
                current_section.pop()
                sections.append(" ".join(current_section))
                # Start the next section with the trailing 20% of words for context.
                overlap_start = max(0, len(current_section) - int(len(current_section) * 0.2))
                current_section = current_section[overlap_start:]
                current_section.append(word)

        # Add any remaining content as the last section.
        if current_section:
            sections.append(" ".join(current_section))

        logger.info(f"""Content Splitting:
- Original content length: {len(content_words)} words
- Split into {len(sections)} sections
- Headline uses {headline_tokens} tokens
- Available tokens per section: {max_content_tokens}
""")
        return sections

    def _analyze_section(self, headline: str, section: str) -> Dict[str, float]:
        """Run NLI on one headline/section pair.

        Returns:
            Mapping of label name ('ENTAILMENT' / 'NEUTRAL' / 'CONTRADICTION')
            to its probability.
        """
        input_text = f"{headline} [SEP] {section}"
        # truncation guards against sections that slightly overshoot the
        # token budget (the word-level splitter is only approximate) —
        # without it the model raises on inputs longer than max_length.
        result = self.nli_pipeline(
            input_text,
            top_k=None,
            truncation=True,
            max_length=self.max_length,
        )
        scores = {item['label']: item['score'] for item in result}

        logger.info("\nSection Analysis:")
        logger.info("-" * 30)
        logger.info(f"Section preview: {section[:100]}...")
        for label, score in scores.items():
            logger.info(f"Label: {label:<12} Score: {score:.3f}")

        return scores

    def analyze(self, headline: str, content: str) -> Dict[str, Any]:
        """Analyze how well the headline matches the content using an AI model.

        Args:
            headline: Article headline.
            content: Full article text.

        Returns:
            Dict with 'headline_vs_content_score' (0-100),
            'entailment_score', 'contradiction_score' (both 0-1) and
            'contradictory_phrases' (currently always empty). All values
            are plain Python numbers (JSON-serializable). On any internal
            error a zeroed result is returned instead of raising.
        """
        try:
            logger.info("\n" + "=" * 50)
            logger.info("HEADLINE ANALYSIS STARTED")
            logger.info("=" * 50)

            # Handle empty inputs.
            if not headline.strip() or not content.strip():
                logger.warning("Empty headline or content provided")
                return {
                    "headline_vs_content_score": 0,
                    "entailment_score": 0,
                    "contradiction_score": 0,
                    "contradictory_phrases": []
                }

            # Split content if too long for a single model pass.
            content_tokens = len(self.tokenizer.encode(content))
            if content_tokens > self.max_length:
                logger.warning(f"""
Content Length Warning:
- Total tokens: {content_tokens}
- Max allowed: {self.max_length}
- Splitting into sections...
""")
                sections = self._split_content(headline, content)

                # Analyze each section.
                section_scores = []
                for i, section in enumerate(sections, 1):
                    logger.info(f"\nAnalyzing section {i}/{len(sections)}")
                    section_scores.append(self._analyze_section(headline, section))

                # Aggregate across sections. Cast to float: np.mean/np.max
                # return numpy scalars, which are not JSON-serializable.
                # - max contradiction: one strongly contradicting section matters
                # - mean entailment: overall support across sections
                # - mean neutral: general neutral tone across sections
                entailment_score = float(np.mean([s.get('ENTAILMENT', 0) for s in section_scores]))
                contradiction_score = float(np.max([s.get('CONTRADICTION', 0) for s in section_scores]))
                neutral_score = float(np.mean([s.get('NEUTRAL', 0) for s in section_scores]))

                logger.info("\nAggregated Scores Across Sections:")
                logger.info("-" * 30)
                logger.info(f"Mean Entailment: {entailment_score:.3f}")
                logger.info(f"Max Contradiction: {contradiction_score:.3f}")
                logger.info(f"Mean Neutral: {neutral_score:.3f}")
            else:
                # Single-pass analysis for short content.
                scores = self._analyze_section(headline, content)
                entailment_score = scores.get('ENTAILMENT', 0)
                contradiction_score = scores.get('CONTRADICTION', 0)
                neutral_score = scores.get('NEUTRAL', 0)

            # Weighted blend: entailment dominates, neutral is acceptable,
            # and the last term rewards the ABSENCE of contradiction
            # (it shrinks toward 0 as contradiction rises).
            final_score = (
                (entailment_score * 0.6) +
                (neutral_score * 0.3) +
                ((1 - contradiction_score) * 0.1)
            ) * 100

            # Log final results.
            logger.info("\nFinal Analysis Results:")
            logger.info("-" * 30)
            logger.info(f"Headline: {headline}")
            logger.info(f"Content Length: {content_tokens} tokens")
            logger.info("\nFinal Scores:")
            logger.info(f"{'Entailment:':<15} {entailment_score:.3f}")
            logger.info(f"{'Neutral:':<15} {neutral_score:.3f}")
            logger.info(f"{'Contradiction:':<15} {contradiction_score:.3f}")
            logger.info(f"\nFinal Score: {final_score:.1f}%")
            logger.info("=" * 50 + "\n")

            return {
                "headline_vs_content_score": round(float(final_score), 1),
                "entailment_score": round(float(entailment_score), 2),
                "contradiction_score": round(float(contradiction_score), 2),
                "contradictory_phrases": []
            }

        except Exception as e:
            logger.error("\nHEADLINE ANALYSIS ERROR")
            logger.error("-" * 30)
            logger.error(f"Error Type: {type(e).__name__}")
            logger.error(f"Error Message: {str(e)}")
            logger.error("Stack Trace:", exc_info=True)
            logger.error("=" * 50 + "\n")
            return {
                "headline_vs_content_score": 0,
                "entailment_score": 0,
                "contradiction_score": 0,
                "contradictory_phrases": []
            }