# Fix for torch.classes watchdog errors
import sys


class ModuleProtector:
    """Temporarily hide a module from sys.modules.

    Used to keep Streamlit's file watcher from touching torch.classes,
    which otherwise raises watchdog errors. Only takes effect if the
    module has already been imported.
    """

    def __init__(self, module_name):
        self.module_name = module_name
        self.original_module = sys.modules.get(module_name)

    def __enter__(self):
        if self.module_name in sys.modules:
            self.original_module = sys.modules[self.module_name]
            sys.modules[self.module_name] = None

    def __exit__(self, *args):
        if self.original_module is not None:
            sys.modules[self.module_name] = self.original_module


# Temporarily remove torch.classes from sys.modules to prevent Streamlit's
# file watcher from accessing it
with ModuleProtector('torch.classes'):
    import streamlit as st

# Set page configuration - MUST BE THE FIRST STREAMLIT COMMAND
st.set_page_config(
    page_title="Multilingual Toxicity Analyzer",
    page_icon="data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIyNCIgaGVpZ2h0PSIyNCIgdmlld0JveD0iMCAwIDI0IDI0IiBmaWxsPSJub25lIiBzdHJva2U9ImN1cnJlbnRDb2xvciIgc3Ryb2tlLXdpZHRoPSIyIiBzdHJva2UtbGluZWNhcD0icm91bmQiIHN0cm9rZS1saW5lam9pbj0icm91bmQiIGNsYXNzPSJsdWNpZGUgbHVjaWRlLXNoaWVsZC1wbHVzLWljb24gbHVjaWRlLXNoaWVsZC1wbHVzIj48cGF0aCBkPSJNMjAgMTNjMCA1LTMuNSA3LjUtNy42NiA4Ljk1YTEgMSAwIDAgMS0uNjctLjAxQzcuNSAyMC41IDQgMTggNCAxM1Y2YTEgMSAwIDAgMSAxLTFjMiAwIDQuNS0xLjIgNi4yNC0yLjcyYTEuMTcgMS4xNyAwIDAgMSAxLjUyIDBDMTQuNTEgMy44MSAxNyA1IDE5IDVhMSAxIDAgMCAxIDEgMXoiLz48cGF0aCBkPSJNOSAxMmg2Ii8+PHBhdGggZD0iTTEyIDl2NiIvPjwvc3ZnPg==",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Now import all other dependencies
import torch
import os
import plotly.graph_objects as go
import pandas as pd
from model.inference_optimized import OptimizedToxicityClassifier
import langid
from typing import List, Dict
import time
import psutil
import platform

try:
    import cpuinfo
except ImportError:
    cpuinfo = None

from streamlit_extras.colored_header import colored_header
from streamlit_extras.add_vertical_space import add_vertical_space
from streamlit_extras.stylable_container import stylable_container
from streamlit_extras.card import card
from streamlit_extras.metric_cards import style_metric_cards

# Configure paths
ONNX_MODEL_PATH = os.environ.get("ONNX_MODEL_PATH", "weights/toxic_classifier.onnx")
PYTORCH_MODEL_DIR = os.environ.get(
    "PYTORCH_MODEL_DIR", "weights/toxic_classifier_xlm-roberta-large"
)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


# Get GPU info if available
def get_gpu_info():
    if DEVICE == "cuda":
        try:
            gpu_name = torch.cuda.get_device_name(0)
            gpu_memory_total = torch.cuda.get_device_properties(0).total_memory / 1024**3  # GB
            gpu_memory_allocated = torch.cuda.memory_allocated(0) / 1024**3  # GB
            cuda_version = torch.version.cuda
            memory_info = f"{gpu_memory_allocated:.1f}/{gpu_memory_total:.1f} GB"
            return f"{gpu_name} (CUDA {cuda_version}, Memory: {memory_info})"
        except Exception:
            return "CUDA device"
    return "CPU"
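# Example return values (hypothetical hardware, for illustration only):
#   get_gpu_info() -> "NVIDIA GeForce RTX 3090 (CUDA 12.1, Memory: 1.2/24.0 GB)"
#   get_gpu_info() -> "CPU"   # when no CUDA device is available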
# Get CPU information
def get_cpu_info():
    try:
        cpu_percent = psutil.cpu_percent(interval=0.1)
        cpu_count = psutil.cpu_count(logical=True)
        cpu_freq = psutil.cpu_freq()
        if cpu_freq:
            freq_info = f"{cpu_freq.current / 1000:.2f} GHz"
        else:
            freq_info = "Unknown"

        # Try multiple methods to get the CPU model name
        cpu_model = None

        # Method 1: read /proc/cpuinfo directly (Linux only)
        try:
            with open('/proc/cpuinfo', 'r') as f:
                for line in f:
                    if 'model name' in line:
                        cpu_model = line.split(':', 1)[1].strip()
                        break
        except OSError:
            pass

        # Method 2: if Method 1 fails, try platform.processor()
        if not cpu_model:
            cpu_model = platform.processor()

        # Method 3: if still no result, try platform.machine()
        if not cpu_model:
            cpu_model = platform.machine()

        # Method 4: final fallback to py-cpuinfo, if installed
        if not cpu_model and cpuinfo is not None:
            try:
                cpu_model = cpuinfo.get_cpu_info()['brand_raw']
            except Exception:
                pass

        # Clean up the model name
        if cpu_model:
            # Remove common unnecessary parts
            replacements = ['(R)', '(TM)', '(r)', '(tm)', 'CPU', '@', 'Processor']
            for r in replacements:
                cpu_model = cpu_model.replace(r, ' ')
            # Collapse extra spaces
            cpu_model = ' '.join(cpu_model.split())
            # Limit length
            if len(cpu_model) > 40:
                cpu_model = cpu_model[:37] + "..."
        else:
            cpu_model = "Unknown CPU"

        return {
            "name": cpu_model,
            "cores": cpu_count,
            "freq": freq_info,
            "usage": f"{cpu_percent:.1f}%"
        }
    except Exception:
        return {
            "name": "CPU",
            "cores": "Unknown",
            "freq": "Unknown",
            "usage": "Unknown"
        }


# Get RAM information
def get_ram_info():
    try:
        ram = psutil.virtual_memory()
        ram_total = ram.total / (1024**3)  # GB
        ram_used = ram.used / (1024**3)    # GB
        ram_percent = ram.percent
        return {
            "total": f"{ram_total:.1f} GB",
            "used": f"{ram_used:.1f} GB",
            "percent": f"{ram_percent:.1f}%"
        }
    except Exception:
        return {
            "total": "Unknown",
            "used": "Unknown",
            "percent": "Unknown"
        }


# Update system resource information
def update_system_resources():
    cpu_info = get_cpu_info()
    ram_info = get_ram_info()
    return {
        "cpu": cpu_info,
        "ram": ram_info
    }


# Initialize system information
GPU_INFO = get_gpu_info()
SYSTEM_INFO = update_system_resources()


# Refresh GPU memory info in real time
def update_gpu_info():
    if DEVICE == "cuda":
        try:
            gpu_memory_allocated = torch.cuda.memory_allocated(0) / 1024**3  # GB
            gpu_memory_total = torch.cuda.get_device_properties(0).total_memory / 1024**3  # GB
            return f"{gpu_memory_allocated:.1f}/{gpu_memory_total:.1f} GB"
        except Exception:
            return "N/A"
    return "N/A"


# Helper function to convert hex to rgba,
# e.g. hex_to_rgba('#E53935', 0.5) -> 'rgba(229, 57, 53, 0.5)'
def hex_to_rgba(hex_color, alpha=1.0):
    hex_color = hex_color.lstrip('#')
    r = int(hex_color[0:2], 16)
    g = int(hex_color[2:4], 16)
    b = int(hex_color[4:6], 16)
    return f'rgba({r}, {g}, {b}, {alpha})'


# Supported languages with emoji flags
SUPPORTED_LANGUAGES = {
    'en': {'name': 'English', 'flag': '🇺🇸'},
    'ru': {'name': 'Russian', 'flag': '🇷🇺'},
    'tr': {'name': 'Turkish', 'flag': '🇹🇷'},
    'es': {'name': 'Spanish', 'flag': '🇪🇸'},
    'fr': {'name': 'French', 'flag': '🇫🇷'},
    'it': {'name': 'Italian', 'flag': '🇮🇹'},
    'pt': {'name': 'Portuguese', 'flag': '🇵🇹'}
}
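# Helper added for illustration (not part of the original source): a minimal
# sketch of how the "<flag> <name>" display labels consumed elsewhere in the
# app are assumed to be built, e.g. "🇺🇸 English", plus the "Auto-detect"
# default that predict_toxicity() checks for.
def language_display_options() -> List[str]:
    """Return ['Auto-detect', '🇺🇸 English', ...] for a language selector."""
    return ["Auto-detect"] + [
        f"{info['flag']} {info['name']}" for info in SUPPORTED_LANGUAGES.values()
    ]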
# Language examples - multiple examples per language, categorized as toxic or non-toxic
LANGUAGE_EXAMPLES = {
    'en': {
        'toxic': [
            "You are such an idiot, nobody likes your stupid content.",
            "Shut up you worthless piece of garbage. Everyone hates you.",
            "This is the most pathetic thing I've ever seen. Only losers would think this is good.",
            "Just kill yourself already, the world would be better without you."
        ],
        'non_toxic': [
            "I disagree with your opinion, but I appreciate your perspective.",
            "This content could use some improvement, but I see the effort you put into it.",
            "While I don't personally enjoy this type of content, others might find it valuable.",
            "Thank you for sharing your thoughts on this complex topic."
        ]
    },
    'ru': {
        'toxic': [
            "Ты полный придурок, твой контент никому не нравится.",
            "Заткнись, бесполезный кусок мусора. Все тебя ненавидят.",
            "Это самая жалкая вещь, которую я когда-либо видел. Только неудачники думают, что это хорошо.",
            "Почему бы тебе просто не исчезнуть нахрен? Никто не будет скучать по тебе."
        ],
        'non_toxic': [
            "Я не согласен с вашим мнением, но уважаю вашу точку зрения.",
            "Этот контент можно улучшить, но я вижу, сколько усилий вы в него вложили.",
            "Хотя мне лично не нравится такой контент, другие могут найти его полезным.",
            "Спасибо, что поделились своими мыслями на эту сложную тему."
        ]
    },
    'tr': {
        'toxic': [
            "Sen tam bir aptalsın, kimse senin aptalca içeriğini beğenmiyor.",
            "Kapa çeneni değersiz çöp parçası. Herkes senden nefret ediyor.",
            "Bu gördüğüm en acıklı şey. Sadece lanet olası kaybedenler bunun iyi olduğunu düşünür.",
            "Dünya sensiz daha iyi olurdu, kaybol git."
        ],
        'non_toxic': [
            "Fikrinize katılmıyorum ama bakış açınızı takdir ediyorum.",
            "Bu içerik biraz geliştirilebilir, ancak gösterdiğiniz çabayı görüyorum.",
            "Şahsen bu tür içerikten hoşlanmasam da, başkaları bunu değerli bulabilir.",
            "Bu karmaşık konu hakkındaki düşüncelerinizi paylaştığınız için teşekkür ederim."
        ]
    },
    'es': {
        'toxic': [
            "Eres un completo idiota y nadie te quiere.",
            "Cállate, pedazo de basura inútil. Todos te odian.",
            "Esto es lo más patético que he visto nunca. Solo los perdedores pensarían que esto es bueno.",
            "El mundo estaría mejor sin ti, deberías desaparecer, joder."
        ],
        'non_toxic': [
            "No estoy de acuerdo con tu opinión, pero aprecio tu perspectiva.",
            "Este contenido podría mejorarse, pero veo el esfuerzo que has puesto en él.",
            "Aunque personalmente no disfruto este tipo de contenido, otros podrían encontrarlo valioso.",
            "Gracias por compartir tus pensamientos sobre este tema tan complejo."
        ]
    },
    'fr': {
        'toxic': [
            "Tu es tellement stupide, personne n'aime ton contenu minable.",
            "Ferme-la, espèce de déchet inutile. Tout le monde te déteste.",
            "C'est la chose la plus pathétique que j'ai jamais vue. Seuls les loosers penseraient que c'est bien.",
            "Le monde serait meilleur sans toi, connard, va-t'en."
        ],
        'non_toxic': [
            "Je ne suis pas d'accord avec ton opinion, mais j'apprécie ta perspective.",
            "Ce contenu pourrait être amélioré, mais je vois l'effort que tu y as mis.",
            "Bien que personnellement je n'apprécie pas ce type de contenu, d'autres pourraient le trouver précieux.",
            "Merci d'avoir partagé tes réflexions sur ce sujet complexe."
        ]
    },
    'it': {
        'toxic': [
            "Sei un tale idiota, a nessuno piace il tuo contenuto stupido.",
            "Chiudi quella bocca, pezzo di spazzatura inutile. Tutti ti odiano.",
            "Questa è la cosa più patetica che abbia mai visto. Solo i perdenti penserebbero che sia buona.",
            "Il mondo sarebbe migliore senza di te, sparisci."
        ],
        'non_toxic': [
            "Non sono d'accordo con la tua opinione, ma apprezzo la tua prospettiva.",
            "Questo contenuto potrebbe essere migliorato, ma vedo lo sforzo che ci hai messo.",
            "Anche se personalmente non apprezzo questo tipo di contenuto, altri potrebbero trovarlo utile.",
            "Grazie per aver condiviso i tuoi pensieri su questo argomento complesso."
        ]
    },
    'pt': {
        'toxic': [
            "Você é um idiota completo, ninguém gosta do seu conteúdo estúpido.",
            "Cale a boca, seu pedaço de lixo inútil. Todos te odeiam.",
            "Isso é a coisa mais patética que eu já vi. Só perdedores pensariam que isso é bom.",
            "O mundo seria melhor sem você, desapareça."
        ],
        'non_toxic': [
            "Eu discordo da sua opinião, mas aprecio sua perspectiva.",
            "Este conteúdo poderia ser melhorado, mas vejo o esforço que você colocou nele.",
            "Embora eu pessoalmente não goste deste tipo de conteúdo, outros podem achá-lo valioso.",
            "Obrigado por compartilhar seus pensamentos sobre este tema complexo."
        ]
    }
}
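# Examples are looked up as LANGUAGE_EXAMPLES[lang_code][category][index], e.g.
#   LANGUAGE_EXAMPLES['en']['non_toxic'][0]
#   -> "I disagree with your opinion, but I appreciate your perspective."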
# Theme colors - light theme with black text
THEME = {
    "primary": "#2D3142",
    "background": "#FFFFFF",
    "surface": "#FFFFFF",
    "text": "#000000",            # Pure black for maximum contrast
    "text_secondary": "#FFFFFF",  # For text that needs to be white
    "button": "#000000",          # Black for buttons
    "toxic": "#E53935",           # Darker red for better contrast
    "non_toxic": "#2E7D32",       # Darker green for better contrast
    "warning": "#F57C00",         # Darker orange for better contrast
    "info": "#1976D2",            # Darker blue for better contrast
    "sidebar_bg": "#FFFFFF",
    "card_bg": "white",
    "input_bg": "#F8F9FA"
}

# Custom CSS for better styling
st.markdown(f"""
""", unsafe_allow_html=True)

# Custom CSS for metric labels
st.markdown(f"""
""", unsafe_allow_html=True)


# Load model at app start
@st.cache_resource
def load_classifier():
    try:
        if os.path.exists(ONNX_MODEL_PATH):
            classifier = OptimizedToxicityClassifier(onnx_path=ONNX_MODEL_PATH, device=DEVICE)
            st.session_state['model_type'] = 'Loaded'
            return classifier
        elif os.path.exists(PYTORCH_MODEL_DIR):
            classifier = OptimizedToxicityClassifier(pytorch_path=PYTORCH_MODEL_DIR, device=DEVICE)
            st.session_state['model_type'] = 'Loaded'
            return classifier
        else:
            st.error(f"❌ No model found at {ONNX_MODEL_PATH} or {PYTORCH_MODEL_DIR}")
            return None
    except Exception as e:
        st.error(f"Error loading model: {str(e)}")
        import traceback
        st.error(traceback.format_exc())
        return None


def detect_language(text: str) -> str:
    """Detect the language of the input text, falling back to English."""
    try:
        lang, _ = langid.classify(text)
        return lang if lang in SUPPORTED_LANGUAGES else 'en'
    except Exception:
        return 'en'
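# Illustrative behaviour (langid's guess can vary, especially on short inputs):
#   detect_language("Gracias por compartir tu opinión") would typically return 'es',
#   while a language outside SUPPORTED_LANGUAGES (e.g. German) falls back to 'en'.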
def predict_toxicity(text: str, selected_language: str = "Auto-detect") -> Dict:
    """Predict toxicity of input text."""
    if not text or not text.strip():
        return {
            "error": "Please enter some text to analyze.",
            "results": None
        }
    if not st.session_state.get('model_loaded', False):
        return {
            "error": "Model not loaded. Please check logs.",
            "results": None
        }

    # Show a spinner while processing
    with st.spinner("Analyzing text..."):
        # Record start time for inference metrics
        start_time = time.time()

        # Detect language if auto-detect is selected
        if selected_language == "Auto-detect":
            lang_detection_start = time.time()
            lang_code = detect_language(text)
            lang_detection_time = time.time() - lang_detection_start
            detected = True
        else:
            # Get the language code from the display name without its flag
            parts = selected_language.split(' ')
            selected_name = parts[1] if len(parts) > 1 else selected_language
            lang_code = next(
                (code for code, info in SUPPORTED_LANGUAGES.items()
                 if info['name'] == selected_name),
                'en'
            )
            lang_detection_time = 0
            detected = False

        # Run prediction
        try:
            model_inference_start = time.time()
            results = classifier.predict([text], langs=[lang_code])[0]
            model_inference_time = time.time() - model_inference_start
            total_time = time.time() - start_time
            return {
                "results": results,
                "detected": detected,
                "lang_code": lang_code,
                "performance": {
                    "total_time": total_time,
                    "lang_detection_time": lang_detection_time,
                    "model_inference_time": model_inference_time
                }
            }
        except Exception as e:
            import traceback
            traceback.print_exc()
            return {
                "error": f"Error processing text: {str(e)}",
                "results": None
            }
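# For reference, the dict returned on success looks like this (keys taken from
# the code above; the numeric values are purely illustrative):
#   {
#       "results": {...},          # classifier output for the single input text
#       "detected": True,          # False when the language was chosen manually
#       "lang_code": "en",
#       "performance": {"total_time": 0.21, "lang_detection_time": 0.01,
#                       "model_inference_time": 0.19}
#   }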
# Set example text in the input box
def set_example(lang_code, example_type, example_index=0):
    st.session_state['use_example'] = True
    # Look up the example by language, type and index
    example = LANGUAGE_EXAMPLES[lang_code][example_type][example_index]
    st.session_state['example_text'] = example
    st.session_state['detected_lang'] = lang_code
    st.session_state['example_info'] = {
        'type': example_type,
        'lang': lang_code,
        'index': example_index
    }


# Initialize session state for example selection if not present
if 'use_example' not in st.session_state:
    st.session_state['use_example'] = False
    st.session_state['example_text'] = ""
    st.session_state['detected_lang'] = "Auto-detect"
    st.session_state['example_info'] = None

# Sidebar content
with st.sidebar:
    st.markdown("""
        Detect toxic content in multiple languages with state-of-the-art accuracy
    """, unsafe_allow_html=True)

    # The original sidebar widgets were truncated in this file; the selector
    # below is a minimal reconstruction (an assumption, not the author's code)
    # so that `selected_language` is defined for the analyze handler.
    selected_language = st.selectbox(
        "Language",
        language_display_options(),  # hypothetical helper defined above
        help="Pick a specific language, or keep Auto-detect"
    )

# Load the classifier once (cached) and record whether it is available; the
# analyze handler below checks st.session_state['model_loaded'].
classifier = load_classifier()
st.session_state['model_loaded'] = classifier is not None

# Text input area with interactive styling
with stylable_container(
    key="text_input_container",
    css_styles=f"""
        {{
            border-radius: 10px;
            overflow: hidden;
            transition: all 0.3s ease;
            box-shadow: 0 2px 8px rgba(0,0,0,0.15);
            background-color: {THEME["card_bg"]};
            padding: 10px;
            margin-bottom: 15px;
        }}
        textarea {{
            caret-color: black !important;
            color: {THEME["text"]} !important;
        }}
        /* Ensure the text input cursor is visible */
        .stTextArea textarea {{
            caret-color: black !important;
        }}
    """
):
    # Get the current example text if it exists
    current_example = st.session_state.get('example_text', '')

    # Set the text input value, allowing for modifications
    text_input = st.text_area(
        "Enter text to analyze",
        height=80,
        value=current_example if st.session_state.get('use_example', False)
        else st.session_state.get('text_input', ''),
        key="text_input",
        help="Enter text in any supported language to analyze for toxicity"
    )

    # If the text was modified away from the example, clear the example state
    if st.session_state.get('use_example', False) and text_input != current_example:
        st.session_state['use_example'] = False
        st.session_state['example_text'] = ""
        st.session_state['example_info'] = None

# Analyze button with improved styling in a compact layout
col1, col2, col3 = st.columns([1, 2, 1])
with col2:
    analyze_button = st.button(
        "Analyze Text",
        type="primary",
        use_container_width=True,
        help="Click to analyze the entered text for toxicity"
    )

# Process when the button is clicked, or when new text is submitted
# (parenthesized so the "new text" check only fires when there is text)
if analyze_button or (text_input and ('last_analyzed' not in st.session_state
                                      or st.session_state.get('last_analyzed') != text_input)):
    if text_input:
        st.session_state['last_analyzed'] = text_input

        # Get system resource info before prediction
        pre_prediction_resources = update_system_resources()

        # Make prediction
        prediction = predict_toxicity(text_input, selected_language)

        # Update resource usage after prediction
        post_prediction_resources = update_system_resources()

        # Calculate resource usage delta (values are formatted like "12.3%")
        resource_delta = {
            "cpu_usage": float(post_prediction_resources["cpu"]["usage"].rstrip("%"))
            - float(pre_prediction_resources["cpu"]["usage"].rstrip("%")),
            "ram_usage": float(post_prediction_resources["ram"]["percent"].rstrip("%"))
            - float(pre_prediction_resources["ram"]["percent"].rstrip("%"))
        }

        # Update GPU memory info after prediction
        if DEVICE == "cuda":
            new_memory_info = update_gpu_info()
            # Note: ideally the displayed memory usage would be updated here,
            # but Streamlit doesn't support dynamic updates without a rerun,
            # so the memory info is only included in the metrics.

        # Set analysis status flags
        st.session_state['is_analysis_complete'] = True
        st.session_state['analysis_has_error'] = "error" in prediction and prediction["error"]

        if "error" in prediction and prediction["error"]:
            st.error(prediction["error"])
        elif prediction["results"]:
            results = prediction["results"]
            performance = prediction.get("performance", {})

            # Overall toxicity result
            is_toxic = results["is_toxic"]
            result_color = THEME["toxic"] if is_toxic else THEME["non_toxic"]
            result_text = "TOXIC" if is_toxic else "NON-TOXIC"

            # Language info
            lang_code = prediction["lang_code"]
            lang_info = SUPPORTED_LANGUAGES.get(lang_code, {"name": lang_code, "flag": "🌐"})

            # Count toxic categories
            toxic_count = len(results["toxic_categories"]) if is_toxic else 0
            # Build data for the visualization (the raw table itself is not shown)
            categories = []
            probabilities = []
            statuses = []

            # Use the same per-category thresholds as the inference model; a
            # probability counts as DETECTED when it meets or exceeds its
            # threshold (e.g. a 'threat' score of 0.50 clears 0.48, while the
            # same score would not clear 'toxic' at 0.60).
            category_thresholds = {
                'toxic': 0.60,
                'severe_toxic': 0.54,
                'obscene': 0.60,
                'threat': 0.48,
                'insult': 0.60,
                'identity_hate': 0.50
            }

            for label, prob in results["probabilities"].items():
                categories.append(label.replace('_', ' ').title())
                probabilities.append(round(prob * 100, 1))
                threshold = category_thresholds.get(label, 0.5) * 100
                statuses.append("DETECTED" if prob * 100 >= threshold else "Not Detected")

            # Sort by probability for the chart
            chart_data = sorted(zip(categories, probabilities, statuses),
                                key=lambda x: x[1], reverse=True)
            chart_cats, chart_probs, chart_statuses = zip(*chart_data)

            # Two-column layout for results
            col1, col2 = st.columns([3, 2])

            with col1:
                # Card with the overall result and detected categories
                with stylable_container(
                    key="result_card",
                    css_styles=f"""
                        {{
                            border-radius: 10px;
                            padding: 10px 15px;
                            background-color: {THEME["card_bg"]};
                            border-left: 5px solid {result_color};
                            margin-bottom: 10px;
                            box-shadow: 0 4px 12px rgba(0,0,0,0.1);
                            overflow: hidden;
                        }}
                    """
                ):
                    # Overall result with abbreviated display
                    st.markdown(f"""