Spaces:

Phoenix21
/

AiNewsV2

Sleeping

AiNewsV2 / tools /language_detector.py

Modular code

7516245 4 months ago

1.28 kB

	from langdetect import detect

	# Very common English function words. Used only by the fallback heuristic
	# that runs when langdetect fails to classify the text.
	_COMMON_ENGLISH_WORDS = frozenset({
	    'the', 'and', 'in', 'to', 'of', 'is', 'for', 'with', 'on', 'that',
	    'this', 'are', 'was', 'be', 'have', 'it', 'not', 'they', 'by', 'from',
	})

	# Texts shorter than this (after stripping whitespace) are rejected outright.
	_MIN_TEXT_LENGTH = 50


	def _has_english_stopwords(text):
	    """Return True if *text* contains enough common English words.

	    Fallback heuristic for when statistical detection is unavailable.
	    It counts how many *distinct* entries of ``_COMMON_ENGLISH_WORDS``
	    occur in the text and accepts the text when at least five are present,
	    or when they make up more than a quarter of the (capped) word count.

	    Args:
	        text (str): The text to analyze.

	    Returns:
	        bool: True if the text looks like English, False otherwise.
	    """
	    text_lower = text.lower()

	    text_words = len(text_lower.split())
	    if text_words == 0:  # Avoid division by zero
	        return False

	    # Treat every non-letter character as a separator so that words next to
	    # punctuation or line breaks ("the,", "is\n", "(and)") are still found.
	    # A plain " word " substring test misses all of those.
	    tokens = set(
	        "".join(ch if ch.isalpha() else " " for ch in text_lower).split()
	    )
	    english_word_count = len(_COMMON_ENGLISH_WORDS & tokens)

	    # Cap the denominator at 20 so long texts are not penalised: at most 20
	    # distinct common words can ever be counted.
	    english_ratio = english_word_count / min(20, text_words)
	    return english_word_count >= 5 or english_ratio > 0.25


	def is_english(text) -> bool:
	    """Detect whether *text* is written in English.

	    The text is first classified with ``langdetect.detect``. If that call
	    fails for any reason, a simple common-word heuristic is used instead so
	    the function never raises on ordinary input.

	    Args:
	        text (str): The text to analyze. ``None``, non-string values and
	            texts with fewer than 50 non-padding characters are treated as
	            "not English" because there is too little to analyze.

	    Returns:
	        bool: True if the text is judged to be English, False otherwise.
	    """
	    # Ensure we have a string with enough content to analyze.
	    if not isinstance(text, str) or len(text.strip()) < _MIN_TEXT_LENGTH:
	        return False

	    try:
	        # Primary path: statistical language detection.
	        return detect(text) == 'en'
	    except Exception:
	        # Deliberate best-effort: any detector failure falls back to the
	        # heuristic. ``Exception`` (not a bare ``except``) so that
	        # KeyboardInterrupt / SystemExit still propagate.
	        return _has_english_stopwords(text)