File size: 1,278 Bytes
7516245
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
from langdetect import detect

# Frequent English function words used by the fallback heuristic.
_COMMON_ENGLISH_WORDS = frozenset({
    'the', 'and', 'in', 'to', 'of', 'is', 'for', 'with', 'on', 'that',
    'this', 'are', 'was', 'be', 'have', 'it', 'not', 'they', 'by', 'from',
})

# Characters stripped from both ends of a token so that "house," or "(the"
# still match an entry in the word list.
_EDGE_PUNCTUATION = ".,;:!?\"'()[]{}<>"

# Minimum number of non-blank characters required before attempting detection.
_MIN_TEXT_LENGTH = 50


def _looks_english_by_common_words(text):
    """Return True if *text* contains enough common English function words.

    The text is lower-cased and split on any whitespace (spaces, tabs,
    newlines); each token has leading/trailing punctuation removed before
    being compared against the common-word list. Each common word is counted
    at most once, no matter how often it occurs.
    """
    tokens = text.lower().split()
    if not tokens:  # Avoid division by zero
        return False

    distinct_words = {token.strip(_EDGE_PUNCTUATION) for token in tokens}
    english_word_count = len(_COMMON_ENGLISH_WORDS & distinct_words)

    # The denominator is capped at 20 so long texts are not penalised for
    # their length; for texts of 20+ words the absolute count decides.
    english_ratio = english_word_count / min(20, len(tokens))
    return english_word_count >= 5 or english_ratio > 0.25


def is_english(text: str) -> bool:
    """
    Detects if text is in English

    Language detection is attempted first; if the detector raises, a
    common-word heuristic is used instead.

    Args:
        text (str): The text to analyze

    Returns:
        bool: True if text is in English, False otherwise. Empty/None input
        and text shorter than 50 characters (after stripping surrounding
        whitespace) always yield False.
    """
    # Ensure we have enough text to analyze
    if not text or len(text.strip()) < _MIN_TEXT_LENGTH:
        return False

    try:
        # Try primary language detection
        return detect(text) == 'en'
    except Exception:
        # Best-effort fallback when detection fails. ``Exception`` (rather
        # than a bare ``except``) lets KeyboardInterrupt and SystemExit
        # propagate instead of being silently swallowed.
        return _looks_english_by_common_words(text)