|
from langdetect import detect |
|
|
|
# Very frequent English function words used by the fallback heuristic.
_COMMON_ENGLISH_WORDS = frozenset({
    'the', 'and', 'in', 'to', 'of', 'is', 'for', 'with', 'on', 'that',
    'this', 'are', 'was', 'be', 'have', 'it', 'not', 'they', 'by', 'from',
})

# Characters trimmed from both ends of a token so that "the," or "(is)"
# still count as the words "the" and "is".
_EDGE_PUNCTUATION = '.,;:!?"\'()[]{}<>-'


def _looks_english(text):
    """Heuristically decide whether *text* is English from common words.

    The text is lower-cased and split on any whitespace; punctuation at the
    edges of each token is ignored. The number of distinct entries of
    ``_COMMON_ENGLISH_WORDS`` that occur is then compared against two
    thresholds.

    Args:
        text (str): The text to analyze.

    Returns:
        bool: True if at least 5 distinct common words occur, or if the
        distinct-word count divided by ``min(20, number_of_tokens)``
        exceeds 0.25; False otherwise (including for text with no tokens).
    """
    raw_tokens = text.lower().split()
    if not raw_tokens:
        return False

    # A set gives O(1) membership and counts each common word only once.
    tokens = {token.strip(_EDGE_PUNCTUATION) for token in raw_tokens}
    english_word_count = len(_COMMON_ENGLISH_WORDS & tokens)

    # The denominator is capped at 20 (the size of the common-word list) so
    # that long texts are not penalised merely for being long.
    english_ratio = english_word_count / min(20, len(raw_tokens))
    return english_word_count >= 5 or english_ratio > 0.25


def is_english(text) -> bool:
    """Detect whether text is in English.

    Detection is delegated to ``detect`` and its result compared with
    ``'en'``. If ``detect`` raises, a common-word heuristic is used as a
    best-effort fallback instead of propagating the error.

    Args:
        text (str): The text to analyze.

    Returns:
        bool: True if text is in English, False otherwise. Values that are
        not strings, and strings with fewer than 50 characters after
        stripping surrounding whitespace, always yield False.
    """
    # Too little (or no) text to judge reliably.
    if not isinstance(text, str) or len(text.strip()) < 50:
        return False

    try:
        return detect(text) == 'en'
    except Exception:
        # Deliberately broad: any detector failure falls back to the
        # heuristic. Unlike a bare ``except:``, this lets KeyboardInterrupt
        # and SystemExit propagate.
        return _looks_english(text)