AiNewsV2 / tools /language_detector.py
Phoenix21's picture
Modular code
7516245
from langdetect import detect
def _looks_english_by_stop_words(text):
    """Heuristic English check based on the presence of common stop words."""
    common_english_words = frozenset((
        'the', 'and', 'in', 'to', 'of', 'is', 'for', 'with', 'on', 'that',
        'this', 'are', 'was', 'be', 'have', 'it', 'not', 'they', 'by', 'from',
    ))

    raw_tokens = text.lower().split()
    if not raw_tokens:  # Avoid division by zero
        return False

    # Split on any whitespace and strip surrounding punctuation so that words
    # next to newlines, tabs, commas or quotes ("the," / "is\n") still count.
    normalized_tokens = {token.strip(".,;:!?\"'()[]{}<>-") for token in raw_tokens}

    # Number of *distinct* common English words that occur in the text.
    english_word_count = len(common_english_words & normalized_tokens)

    # Cap the denominator at 20 so long texts are not penalised by their length.
    english_ratio = english_word_count / min(20, len(raw_tokens))
    return english_word_count >= 5 or english_ratio > 0.25


def is_english(text):
    """
    Detects if text is in English.

    The primary check delegates to ``detect`` from ``langdetect``. If that
    call raises, a stop-word heuristic is used instead: the text is treated
    as English when it contains at least 5 distinct common English words, or
    when the ratio of such words to ``min(20, word count)`` exceeds 0.25.

    Args:
        text (str): The text to analyze

    Returns:
        bool: True if text is in English, False otherwise. Empty text, or
            text shorter than 50 characters once stripped, always yields
            False.
    """
    # Ensure we have enough text to analyze
    if not text or len(text.strip()) < 50:
        return False

    try:
        # Try primary language detection
        return detect(text) == 'en'
    except Exception:
        # Best-effort fallback when detection fails. Catching ``Exception``
        # rather than using a bare ``except`` lets KeyboardInterrupt and
        # SystemExit propagate instead of being silently swallowed.
        return _looks_english_by_stop_words(text)