diff --git "a/src/streamlit_app.py" "b/src/streamlit_app.py" --- "a/src/streamlit_app.py" +++ "b/src/streamlit_app.py" @@ -1,40 +1,2077 @@ -import altair as alt -import numpy as np -import pandas as pd import streamlit as st +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go +from plotly.subplots import make_subplots +import os +import time +from PIL import Image + +# Only import APIs if available +try: + from google import genai + GENAI_AVAILABLE = True +except ImportError: + GENAI_AVAILABLE = False + +try: + from openai import OpenAI + OPENAI_AVAILABLE = True +except ImportError: + OPENAI_AVAILABLE = False + +BASE_DIR = os.path.dirname(__file__) +DATA_DIR = os.path.join(BASE_DIR, "data") + +# Page configuration +st.set_page_config( + page_title="Translation Comparison Tool", + page_icon="🌐", + layout="wide", + initial_sidebar_state="collapsed" +) + +# Custom CSS for Material Design with Tailwind-inspired styling +st.markdown(""" + +""", unsafe_allow_html=True) + +# Model configurations +MODEL_CONFIG = { + 'Gemini': { + 'languages': ['Afrikaans', 'Northern Sotho', 'isiZulu'], + 'models': ['gemini-2.0-flash-exp', 'gemini-1.5-flash', 'gemini-1.5-pro'], + 'default_model': 'gemini-2.0-flash-exp' + }, + 'GPT': { + 'languages': ['Afrikaans', 'Northern Sotho', 'isiZulu'], + 'models': ['gpt-4', 'gpt-4-turbo', 'gpt-3.5-turbo'], + 'default_model': 'gpt-4' + }, + 'NLLB': { + 'languages': ['Northern Sotho', 'isiZulu'], # No Afrikaans model available + 'models': { + 'Northern Sotho': 'dsfsi/dcs-eng-nso-nllb-1.3B', + 'isiZulu': 'dsfsi/dcs-eng-zul-nllb-1.3B' + } + } +} + +# Language code mappings +LANGUAGE_CODES = { + 'Afrikaans': 'afr', + 'Northern Sotho': 'nso', + 'isiZulu': 'isizulu' +} + +# Load logo +def load_logo(): + """Load logo with error handling""" + try: + if os.path.exists(f"{BASE_DIR}/logo.png"): + return Image.open(f"{BASE_DIR}/logo.png") + except Exception as e: + st.warning(f"Could not load logo: {str(e)}") + return None + +# Load and cache data +@st.cache_data +def load_translation_data(): + """Load sample translation data""" + try: + sample_data = { + 'english': ['Hello world', 'How are you?', 'Good morning', 'Thank you', 'Welcome', 'Goodbye'], + 'afr': ['Hallo wêreld', 'Hoe gaan dit?', 'Goeie môre', 'Dankie', 'Welkom', 'Totsiens'], + 'afr_rev': ['Hallo wêreld', 'Hoe gaan dit met jou?', 'Goeie môre', 'Baie dankie', 'Welkom', 'Totsiens'], + 'nso': ['Dumela lefase', 'O kae?', 'Thobela', 'Ke a leboga', 'O amogetšwe', 'Šala gabotse'], + 'nso_rev': ['Dumela lefase', 'O phela bjang?', 'Thobela', 'Ke a leboga kudu', 'O amogetšwe', 'Šala gabotse'], + 'isizulu': ['Sawubona mhlaba', 'Unjani?', 'Sawubona', 'Ngiyabonga', 'Wamukelekile', 'Sala kahle'], + 'isizulu_rev': ['Sawubona mhlaba', 'Unjani wena?', 'Sawubona', 'Ngiyabonga kakhulu', 'Wamukelekile', 'Sala kahle'], + 'nso_mt_nllb': ['Dumela lefase', 'O kae?', 'Thobela', 'Ke a leboga', 'O amogetšwe', 'Šala gabotse'], + 'isizulu_mt_nllb': ['Sawubona mhlaba', 'Unjani?', 'Sawubona', 'Ngiyabonga', 'Wamukelekile', 'Sala kahle'], + 'afr_mt_gpt': ['Hallo wêreld', 'Hoe gaan dit?', 'Goeie môre', 'Dankie', 'Welkom', 'Totsiens'], + 'nso_mt_gpt': ['Dumela lefase', 'O kae?', 'Thobela', 'Ke a leboga', 'O amogetšwe', 'Šala gabotse'], + 'isizulu_mt_gpt': ['Sawubona mhlaba', 'Unjani?', 'Sawubona', 'Ngiyabonga', 'Wamukelekile', 'Sala kahle'], + 'afr_mt_gemini': ['Hallo wêreld', 'Hoe is dit?', 'Goeie môre', 'Baie dankie', 'Welkom', 'Totsiens'], + 'nso_mt_gemini': ['Dumela lefase', 'O phela bjang?', 'Thobela', 'Ke a leboga kudu', 
'O amogetšwe', 'Šala gabotse'], + 'isizulu_mt_gemini': ['Sawubona mhlaba', 'Unjani wena?', 'Sawubona', 'Ngiyabonga kakhulu', 'Wamukelekile', 'Sala kahle'] + } + return pd.DataFrame(sample_data) + except Exception as e: + st.error(f"Error loading data: {str(e)}") + return pd.DataFrame({'english': ['Sample text'], 'error': ['Data loading failed']}) + +def translate_with_gemini(text, target_language, model_name="gemini-2.0-flash-exp", client=None): + """Translate text using Gemini API""" + try: + if not GENAI_AVAILABLE: + return "❌ Gemini library not installed" + + if not client: + return "❌ Gemini API not configured. Please check your GEMINI_API_KEY." + + lang_map = { + 'Afrikaans': 'Afrikaans', + 'Northern Sotho': 'Northern Sotho (Sepedi)', + 'isiZulu': 'isiZulu' + } + + prompt = f"Translate the following English text to {lang_map.get(target_language, target_language)}: '{text}'. Provide only the translation without any explanations." + + response = client.models.generate_content( + model=model_name, contents=prompt + ) + return response.text.strip() + except Exception as e: + return f"❌ Error: {str(e)}" + +def translate_with_openai(text, target_language, model_name="gpt-4o", client=None): + """Translate text using OpenAI API with Chat Completions""" + try: + if not OPENAI_AVAILABLE: + return "❌ OpenAI library not installed" + + if not client: + return "❌ OpenAI API not configured. Please check your OPENAI_API_KEY." + + lang_map = { + 'Afrikaans': 'Afrikaans', + 'Northern Sotho': 'Northern Sotho (Sepedi)', + 'isiZulu': 'isiZulu' + } + + # Use Chat Completions API (supported indefinitely) + response = client.chat.completions.create( + model=model_name, + messages=[ + {"role": "system", "content": "You are a professional translator. Provide only the translation without any explanations."}, + {"role": "user", "content": f"Translate the following text to {lang_map.get(target_language, target_language)}: {text}"} + ], + max_tokens=1000, + temperature=0.3 # Lower temperature for more consistent translations + ) + + return response.choices[0].message.content.strip() + + except Exception as e: + return f"❌ Error: {str(e)}" + +@st.cache_resource +def initialize_apis(): + """Initialize API clients with proper error handling, supporting both local and HF Spaces.""" + genai_client = None + openai_client = None + + def get_secret(name): + """Fetch secret from env first (Docker Spaces), then Streamlit secrets.""" + return ( + os.environ.get(name) + or (st.secrets.get(name) if hasattr(st, "secrets") and name in st.secrets else None) + ) + + try: + # Gemini API + if GENAI_AVAILABLE: + try: + api_key = get_secret("GEMINI_API_KEY") + if api_key: + genai_client = genai.Client(api_key=api_key) + else: + st.warning("⚠️ Gemini API key not found") + except Exception as e: + st.error(f"❌ Gemini API error: {str(e)}") + + # OpenAI API + if OPENAI_AVAILABLE: + try: + api_key = get_secret("OPENAI_API_KEY") + if api_key: + try: + # Try new OpenAI API client + openai_client = OpenAI(api_key=api_key) + except TypeError: + import openai + openai.api_key = api_key + openai_client = openai + else: + st.warning("⚠️ OpenAI API key not found") + except Exception as e: + st.error(f"❌ OpenAI API error: {str(e)}") + + except Exception as e: + st.error(f"❌ API initialization error: {str(e)}") + + return genai_client, openai_client + +def translate_with_nllb(text, target_language): + """Translate text using unified NLLB API""" + try: + import requests + + # Single ngrok URL for unified API + API_URL = 
"https://4c2faecc052a.ngrok-free.app" + + # Map Streamlit language names to API format + lang_mapping = { + 'Northern Sotho': 'nso', + 'isiZulu': 'zul' + } + + api_lang = lang_mapping.get(target_language, target_language.lower()) + + response = requests.post( + f"{API_URL}/translate_simple", + params={ + "text": text, + "target_language": api_lang + }, + timeout=30 + ) + + if response.status_code == 200: + result = response.json() + return result.get(api_lang, '❌ Translation not found') + else: + return f"❌ API Error: {response.status_code}" + + except Exception as e: + return f"❌ Error: {str(e)}" + +def create_language_tabs(available_languages, current_language, key_suffix=""): + """Create language tabs with proper styling""" + tabs_html = '
+<div class="language-tabs">'
+
+    for lang in available_languages:
+        active_class = "active" if lang == current_language else ""
+        tabs_html += f'''
+        <div class="language-tab {active_class}">
+            {lang}
+        </div>
+        '''
+
+    tabs_html += '</div>
' + + # Add JavaScript for tab functionality + script = f''' + + ''' + + return tabs_html + script + +def main(): + """Main application function""" + # Load and display logo and title side by side + logo = load_logo() + + # Initialize session state FIRST to avoid refreshes + if 'target_language' not in st.session_state: + st.session_state.target_language = 'Afrikaans' + if 'translation_result' not in st.session_state: + st.session_state.translation_result = "" + if 'current_page' not in st.session_state: + st.session_state.current_page = 1 + if 'initialized' not in st.session_state: + st.session_state.initialized = True + + col1, col2, col3 = st.columns([1, 2, 1]) + with col2: + if logo: + # Convert logo to base64 for HTML embedding + import base64 + from io import BytesIO + buffered = BytesIO() + logo.save(buffered, format="PNG") + img_str = base64.b64encode(buffered.getvalue()).decode() + + st.markdown(f''' +
+            <div class="logo-title-container">
+                <img src="data:image/png;base64,{img_str}" alt="UP Translate logo" width="80"/>
+                <h1 class="main-title">UP Translate</h1>
+            </div>
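+            <!-- the logo PNG is inlined as base64 above so the image and title render in a single st.markdown block -->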
+ ''', unsafe_allow_html=True) + else: + st.markdown('
+<div class="logo-title-container"><h1 class="main-title">UP Translate</h1></div>
+', unsafe_allow_html=True)
+
+    # Initialize APIs
+    genai_client, openai_client = initialize_apis()
+    # (session state is already initialized at the top of main(), so it is not repeated here)
+
+    # Create tabs
+    tab1, tab2 = st.tabs(["🤖 Live Translations", "📊 Existing Translations"])
+
+    with tab1:
+        # st.markdown('

Live Translation

+        # ', unsafe_allow_html=True)
+
+        # Create simplified model options
+        model_options = []
+        model_mapping = {}
+
+        # Add Gemini models
+        for model in MODEL_CONFIG['Gemini']['models']:
+            display_name = f"Gemini - {model}"
+            model_options.append(display_name)
+            model_mapping[display_name] = ('Gemini', None, model)
+
+        # Add GPT models
+        for model in MODEL_CONFIG['GPT']['models']:
+            display_name = f"GPT - {model}"
+            model_options.append(display_name)
+            model_mapping[display_name] = ('GPT', None, model)
+
+        # Add single NLLB option
+        model_options.append("NLLB - Specialized Models")
+        model_mapping["NLLB - Specialized Models"] = ('NLLB', None, None)
+
+        # Model selection with inline label
+        label_col, dropdown_col = st.columns([2, 10])
+        with label_col:
+            st.markdown('
+<div class="inline-label">Select Model:</div>
', unsafe_allow_html=True) + with dropdown_col: + selected_model_option = st.selectbox( + "Select Model:", + model_options, + index=0, + key="model_selection_dropdown", + label_visibility="collapsed" + ) + + selected_provider, _, selected_model = model_mapping[selected_model_option] + + # Translation interface + col_left, col_center, col_right = st.columns([5, 1, 5]) + + # Left side - English Input + with col_left: + st.markdown('
+<div class="translation-card">', unsafe_allow_html=True)
+            st.markdown('
+<div class="language-label">English</div>
+', unsafe_allow_html=True)
+            st.markdown('</div>
', unsafe_allow_html=True) + + input_text = st.text_area( + "Input", + placeholder="Input text here", + height=350, + key="input_text_live", + label_visibility="collapsed" + ) + + # Center - Translate Button + with col_center: + # Add spacing to align button with text areas + st.markdown('
+<div style="height: 150px;"></div>', unsafe_allow_html=True)
+            translate_clicked = st.button(
+                "Translate",
+                key="translate_btn_live",
+                help="Translate text",
+                type="primary",
+                use_container_width=True
+            )
+
+        # Right side - Translation Output
+        with col_right:
+            # Determine available languages based on selected provider
+            if selected_provider == 'NLLB':
+                available_languages = MODEL_CONFIG['NLLB']['languages']
+            else:
+                available_languages = ['Afrikaans', 'Northern Sotho', 'isiZulu']
+
+            # Set default language to first available if current selection not available
+            if st.session_state.target_language not in available_languages:
+                st.session_state.target_language = available_languages[0]
+
+            # Create container with custom styling
+            st.markdown('
+<div class="translation-container">', unsafe_allow_html=True)
+
+            # Language selection buttons
+            lang_cols = st.columns(len(available_languages))
+            for i, lang in enumerate(available_languages):
+                with lang_cols[i]:
+                    button_type = "primary" if lang == st.session_state.target_language else "secondary"
+                    if st.button(
+                        lang,
+                        key=f"lang_btn_{lang}_live",
+                        type=button_type,
+                        use_container_width=True
+                    ):
+                        if st.session_state.target_language != lang:  # Only update if different
+                            st.session_state.target_language = lang
+                            st.session_state.translation_result = ""  # Clear previous result
+                            st.rerun()
+
+            # Translation logic
+            if translate_clicked and input_text:
+                with st.spinner("Translating..."):
+                    target_lang = st.session_state.target_language
+
+                    if selected_provider == 'Gemini':
+                        result = translate_with_gemini(input_text, target_lang, selected_model, genai_client)
+
+                    elif selected_provider == 'GPT':
+                        result = translate_with_openai(input_text, target_lang, selected_model, openai_client)
+
+                    elif selected_provider == 'NLLB':
+                        result = translate_with_nllb(input_text, target_lang)
+
+                    st.session_state.translation_result = result
+
+            # Translation output area with proper labeling
+            st.text_area(
+                f"Translation ({st.session_state.target_language})",  # Dynamic label
+                value=st.session_state.translation_result,
+                placeholder="Translation will appear here",
+                height=350,
+                key="translation_output_live_fixed",  # Changed key to avoid conflicts
+                disabled=True,
+                label_visibility="collapsed"
+            )
+
+        # Support information
+        st.markdown("""
+        <div class="support-info">
+            <strong>Available Models:</strong><br>
+            🔮 <strong>Gemini:</strong> All languages (gemini-2.0-flash-exp, gemini-1.5-flash, gemini-1.5-pro)<br>
+            🧠 <strong>GPT:</strong> All languages (gpt-4, gpt-4-turbo, gpt-3.5-turbo)<br>
+            🤗 <strong>NLLB:</strong> Northern Sotho, isiZulu only (specialized models)
+        </div>
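+        <!-- keep this list in sync with MODEL_CONFIG at the top of the file -->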
+ """, unsafe_allow_html=True) + + with tab2: + # Load data from base directory automatically + @st.cache_data + def load_analysis_data(): + """Load all analysis data from base directory""" + df_translations = None + df_bleu = None + df_chrf = None + df_comet = None + + try: + # Try to load translations data + if os.path.exists(f"{DATA_DIR}/translations.tsv"): + df_translations = pd.read_csv(f"{DATA_DIR}/translations.tsv", sep="\t") + + # Convert new CSV format to expected format for analysis + # New format: id,english,afr_human,afr_revised,nso_human,nso_revised,zul_human,zul_revised,afr_gemini,afr_gpt,nso_gemini,nso_gpt,nso_nllb,zul_gemini,zul_gpt,zul_nllb + # Expected format: english, afr_human, afr_revised, nso_human, nso_revised, isizulu_human, isizulu_revised, etc. + + # Rename zul columns to isizulu for backward compatibility with analysis code + column_mapping = { + 'zul_human': 'isizulu_human', + 'zul_revised': 'isizulu_revised', + 'zul_gemini': 'isizulu_mt_gemini', + 'zul_gpt': 'isizulu_mt_gpt', + 'zul_nllb': 'isizulu_mt_nllb', + 'afr_gemini': 'afr_mt_gemini', + 'afr_gpt': 'afr_mt_gpt', + 'nso_gemini': 'nso_mt_gemini', + 'nso_gpt': 'nso_mt_gpt', + 'nso_nllb': 'nso_mt_nllb' + } + + df_translations = df_translations.rename(columns=column_mapping) + + elif os.path.exists(f"{DATA_DIR}/translation_data.csv"): + df_translations = pd.read_csv(f"{DATA_DIR}/translation_data.csv") + else: + print("No translation data found, using sample data") + df_translations = load_translation_data() # Fallback to sample data + + # Try to load BLEU scores + if os.path.exists(f"{DATA_DIR}/bleu_scores.csv"): + df_bleu = pd.read_csv(f"{DATA_DIR}/bleu_scores.csv") + + # Convert zul references to isizulu for compatibility + df_bleu['comparison_pair'] = df_bleu['comparison_pair'].str.replace('zul_', 'isizulu_') + df_bleu['language'] = df_bleu['language'].replace('isiZulu', 'isiZulu') # Already correct + + else: + # Sample BLEU data (using isizulu for compatibility with existing analysis code) + df_bleu = pd.DataFrame({ + 'comparison_pair': ['afr_human_vs_afr_gemini', 'afr_human_vs_afr_gpt', 'afr_human_vs_afr_revised', 'nso_human_vs_nso_gemini', 'nso_human_vs_nso_gpt', 'nso_human_vs_nso_revised', 'nso_human_vs_nso_nllb', 'isizulu_human_vs_isizulu_gemini', 'isizulu_human_vs_isizulu_gpt', 'isizulu_human_vs_isizulu_revised', 'isizulu_human_vs_isizulu_nllb'], + 'bleu_score': [0.78, 0.72, 0.89, 0.65, 0.68, 0.85, 0.71, 0.71, 0.69, 0.87, 0.73], + 'language': ['Afrikaans', 'Afrikaans', 'Afrikaans', 'Northern Sotho', 'Northern Sotho', 'Northern Sotho', 'Northern Sotho', 'isiZulu', 'isiZulu', 'isiZulu', 'isiZulu'] + }) + + # Try to load COMET scores + if os.path.exists(f"{DATA_DIR}/comet_scores.csv"): + df_comet = pd.read_csv(f"{DATA_DIR}/comet_scores.csv") + else: + # Sample COMET data + df_comet = pd.DataFrame({ + 'comparison_pair': ['afr_human_vs_afr_gemini', 'afr_human_vs_afr_gpt', 'afr_human_vs_afr_revised', 'nso_human_vs_nso_gemini', 'nso_human_vs_nso_gpt', 'nso_human_vs_nso_revised', 'isizulu_human_vs_isizulu_gemini', 'isizulu_human_vs_isizulu_gpt', 'isizulu_human_vs_isizulu_revised'], + 'comet_score': [0.82, 0.79, 0.92, 0.71, 0.74, 0.88, 0.76, 0.73, 0.90], + 'language': ['Afrikaans', 'Afrikaans', 'Afrikaans', 'Northern Sotho', 'Northern Sotho', 'Northern Sotho', 'isiZulu', 'isiZulu', 'isiZulu'] + }) + + # Try to load CHRF scores + if os.path.exists(f"{DATA_DIR}/chrf_scores.csv"): + df_chrf = pd.read_csv(f"{DATA_DIR}/chrf_scores.csv") + else: + # Sample CHRF data + df_chrf = pd.DataFrame({ + 'comparison_pair': 
['afr_human_vs_afr_gemini', 'afr_human_vs_afr_gpt', 'afr_human_vs_afr_revised', 'nso_human_vs_nso_gemini', 'nso_human_vs_nso_gpt', 'nso_human_vs_nso_revised', 'isizulu_human_vs_isizulu_gemini', 'isizulu_human_vs_isizulu_gpt', 'isizulu_human_vs_isizulu_revised'], + 'chrf_score': [0.75, 0.70, 0.88, 0.60, 0.65, 0.80, 0.68, 0.66, 0.85], + 'language': ['Afrikaans', 'Afrikaans', 'Afrikaans', 'Northern Sotho', 'Northern Sotho', 'Northern Sotho', 'isiZulu', 'isiZulu', 'isiZulu'] + }) + + return df_translations, df_bleu, df_comet, df_chrf + + except Exception as e: + st.error(f"Error loading data: {str(e)}") + return None, None, None, None + + # Load all data + df_translations, df_bleu, df_comet, df_chrf = load_analysis_data() + + if df_translations is not None: + # Language selection in columns + lang_col1, lang_col2 = st.columns([2, 10]) + with lang_col1: + st.markdown('
+<div class="inline-label">Select Language:</div>
', unsafe_allow_html=True) + with lang_col2: + languages = ['Afrikaans', 'Northern Sotho', 'isiZulu'] + selected_lang = st.selectbox( + "Select Language for Analysis:", + languages, + key="global_lang_select", + label_visibility="collapsed" + ) + + # Get language code + lang_codes = {'Afrikaans': 'afr', 'Northern Sotho': 'nso', 'isiZulu': 'isizulu'} + code = lang_codes[selected_lang] + + # Create analysis tabs + analysis_tab1, analysis_tab2, analysis_tab3, analysis_tab4 = st.tabs(["Sample Translations", "📊 Quality Metrics", "🔄 Revision Analysis", "🔍 Word Comparison"]) + + with analysis_tab1: + # Translation Samples Tab + st.markdown(""" +
+            <div class="analysis-header">
+                <div class="analysis-title">
+                    📝 Translation Samples for {selected_lang}
+                </div>
+            </div>
+ """.format(selected_lang=selected_lang), unsafe_allow_html=True) + + # Use the global language selection + samples_code = code + + # Show sample translations for the selected language + display_cols = ['english'] + [col for col in df_translations.columns if col.startswith(samples_code)] + + if display_cols and len(display_cols) > 1: # Need at least english + 1 translation column + # Control panel + control_col1, control_col2, control_col3, control_col4 = st.columns([1, 7, 1, 2]) + + with control_col1: + st.markdown('
+<div class="inline-label">Samples per page:</div>
', unsafe_allow_html=True) + with control_col2: + page_size = st.selectbox( + "Samples per page:", + [10, 25, 50, 100], + index=0, + key="page_size_select", + label_visibility="collapsed" + ) + + # Initialize session state for pagination + if 'current_page' not in st.session_state: + st.session_state.current_page = 1 + + # Filter data and calculate pagination + available_data = df_translations[display_cols].dropna(subset=[col for col in display_cols if col != 'english'], how='all') + total_samples = len(available_data) + total_pages = max(1, (total_samples + page_size - 1) // page_size) # Ceiling division + + # Ensure current page is valid + if st.session_state.current_page > total_pages: + st.session_state.current_page = 1 + + # Calculate start and end indices + start_idx = (st.session_state.current_page - 1) * page_size + end_idx = min(start_idx + page_size, total_samples) + + # Get current page data + current_page_data = available_data.iloc[start_idx:end_idx] + + with control_col3: + st.markdown('
+<div class="inline-label">Page:</div>
', unsafe_allow_html=True) + with control_col4: + # Page navigation + nav_col1, nav_col2, nav_col3, nav_col4, nav_col5 = st.columns([1, 1, 2, 1, 1]) + + with nav_col1: + if st.button("⏮️", key="first_page", help="First page", disabled=(st.session_state.current_page == 1)): + st.session_state.current_page = 1 + st.rerun() + + with nav_col2: + if st.button("◀️", key="prev_page", help="Previous page", disabled=(st.session_state.current_page == 1)): + st.session_state.current_page -= 1 + st.rerun() + + with nav_col3: + st.markdown(f'
+<div class="page-indicator">{st.session_state.current_page} / {total_pages}</div>
', unsafe_allow_html=True) + + with nav_col4: + if st.button("▶️", key="next_page", help="Next page", disabled=(st.session_state.current_page == total_pages)): + st.session_state.current_page += 1 + st.rerun() + + with nav_col5: + if st.button("⏭️", key="last_page", help="Last page", disabled=(st.session_state.current_page == total_pages)): + st.session_state.current_page = total_pages + st.rerun() + + # Statistics cards + stats_col1, stats_col2, stats_col3, stats_col4 = st.columns(4) + + with stats_col1: + st.markdown(f""" +
+                        <div class="stats-card">
+                            <div class="stats-label">Showing</div>
+                            <div class="stats-value">{len(current_page_data)}</div>
+                        </div>
+ """, unsafe_allow_html=True) + + with stats_col2: + available_systems = len([col for col in display_cols if col != 'english']) + st.markdown(f""" +
+                        <div class="stats-card">
+                            <div class="stats-label">Translation Systems</div>
+                            <div class="stats-value">{available_systems}</div>
+                        </div>
+ """, unsafe_allow_html=True) + + with stats_col3: + st.markdown(f""" +
+                        <div class="stats-card">
+                            <div class="stats-label">Total Available</div>
+                            <div class="stats-value">{total_samples}</div>
+                        </div>
+ """, unsafe_allow_html=True) + + with stats_col4: + st.markdown(f""" +
+                        <div class="stats-card">
+                            <div class="stats-label">Current Page</div>
+                            <div class="stats-value">{st.session_state.current_page}/{total_pages}</div>
+                        </div>
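+                        <!-- total_pages uses ceiling division: (total_samples + page_size - 1) // page_size,
+                             e.g. 23 samples at 10 per page -> 3 pages -->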
+ """, unsafe_allow_html=True) + + # Display the samples table + st.markdown("### Translation Examples") + + if len(current_page_data) > 0: + # Create a styled dataframe with better column names + display_df = current_page_data.copy() + + # Rename columns for better display + column_rename = { + 'english': 'English (Source)', + } + + # Add human-readable names for translation columns + for col in display_df.columns: + if col.startswith(samples_code): + if '_human' in col: + column_rename[col] = f'{selected_lang} (Human)' + elif '_revised' in col: + column_rename[col] = f'{selected_lang} (Revised)' + elif '_mt_gemini' in col or '_gemini' in col: + column_rename[col] = f'{selected_lang} (Gemini)' + elif '_mt_gpt' in col or '_gpt' in col: + column_rename[col] = f'{selected_lang} (GPT)' + elif '_mt_nllb' in col or '_nllb' in col: + column_rename[col] = f'{selected_lang} (NLLB)' + else: + # Generic fallback + clean_name = col.replace(f'{samples_code}_', '').replace('_', ' ').title() + column_rename[col] = f'{selected_lang} ({clean_name})' + + display_df = display_df.rename(columns=column_rename) + + # Add row numbers based on actual position in full dataset + display_df.index = range(start_idx + 1, end_idx + 1) + display_df.index.name = 'Sample #' + + st.dataframe( + display_df, + use_container_width=True, + height=min(600, 50 + len(display_df) * 35), # Dynamic height based on content + column_config={ + col: st.column_config.TextColumn(col, width="medium") + for col in display_df.columns + } + ) + + # Page info summary + st.markdown(f""" +
+                    <div class="page-summary">
+                        📄 Showing samples {start_idx + 1} to {end_idx} of {total_samples} total samples • Page {st.session_state.current_page} of {total_pages}
+                    </div>
+ """, unsafe_allow_html=True) + + # Quick jump to page + if total_pages > 5: # Only show quick jump for datasets with many pages + st.markdown("### Quick Navigation") + jump_col1, jump_col2, jump_col3 = st.columns([1, 2, 1]) + + with jump_col2: + target_page = st.number_input( + f"Jump to page (1-{total_pages}):", + min_value=1, + max_value=total_pages, + value=st.session_state.current_page, + key="page_jump" + ) + + if st.button("🔗 Go to Page", use_container_width=True): + if target_page != st.session_state.current_page: + st.session_state.current_page = target_page + st.rerun() + + else: + st.warning("⚠️ No translation samples found for the current page.") + + else: + st.warning(f"⚠️ No translation data available for {selected_lang}. Expected columns starting with '{samples_code}_'") + + # Debug information + available_columns = [col for col in df_translations.columns if col.startswith(samples_code)] + if available_columns: + st.info(f"🔍 Found columns: {', '.join(available_columns)}") + else: + all_lang_columns = [col for col in df_translations.columns if any(col.startswith(prefix) for prefix in ['afr_', 'nso_', 'isizulu_'])] + if all_lang_columns: + st.info(f"💡 Available language columns: {', '.join(all_lang_columns[:10])}{'...' if len(all_lang_columns) > 10 else ''}") + + with analysis_tab2: + st.markdown(""" +
+            <div class="analysis-header">
+                <div class="analysis-title">
+                    📈 Quality Metrics for {selected_lang}
+                </div>
+            </div>
+ """.format(selected_lang=selected_lang), unsafe_allow_html=True) + + # Get language code + lang_codes = {'Afrikaans': 'afr', 'Northern Sotho': 'nso', 'isiZulu': 'isizulu'} + code = lang_codes[selected_lang] + + # Score visualizations + if df_bleu is not None and df_chrf is not None and df_comet is not None: + # Filter scores for selected language + lang_bleu = df_bleu[df_bleu['language'] == selected_lang] if 'language' in df_bleu.columns else df_bleu + lang_chrf = df_chrf[df_chrf['language'] == selected_lang] if 'language' in df_chrf.columns else df_chrf + lang_comet = df_comet[df_comet['language'] == selected_lang] if 'language' in df_comet.columns else df_comet + + # Check if we have domain-level data + has_domain_data = ('domain' in lang_bleu.columns and 'domain' in lang_chrf.columns and + 'domain' in lang_comet.columns and + len(lang_bleu[lang_bleu['domain'] != 'Overall']) > 0) + + if has_domain_data: + # Add domain filter + available_domains = sorted(lang_bleu['domain'].unique()) + domain_options = ['Overall'] + [d for d in available_domains if d != 'Overall'] + + selected_domain = st.selectbox( + "📍 Select Domain for Analysis:", + domain_options, + key=f"domain_selector_{selected_lang}" + ) + + # Filter data based on selected domain + if selected_domain == 'Overall': + display_bleu = lang_bleu[lang_bleu['domain'] == 'Overall'] + display_chrf = lang_chrf[lang_chrf['domain'] == 'Overall'] + display_comet = lang_comet[lang_comet['domain'] == 'Overall'] + chart_title_suffix = " - Overall" + else: + display_bleu = lang_bleu[lang_bleu['domain'] == selected_domain] + display_chrf = lang_chrf[lang_chrf['domain'] == selected_domain] + display_comet = lang_comet[lang_comet['domain'] == selected_domain] + chart_title_suffix = f" - {selected_domain}" + else: + # Use all data if no domain column + display_bleu = lang_bleu + display_chrf = lang_chrf + display_comet = lang_comet + chart_title_suffix = "" + + # Create score charts + if len(display_bleu) > 0 and len(display_chrf) > 0 and len(display_comet) > 0: + chart_col1, chart_col2, chart_col3 = st.columns(3) + + with chart_col1: + # chrF Score Chart + fig_chrf = px.bar( + display_chrf, + x='comparison_pair', + y='chrf_score', + title=f'chrF Scores - {selected_lang}{chart_title_suffix}', + color='chrf_score', + color_continuous_scale='oranges' + ) + fig_chrf.update_layout( + xaxis_title="Translation Pairs", + yaxis_title="chrF Score", + xaxis_tickangle=-45, + height=400, + font=dict(family="Inter", size=12) + ) + st.plotly_chart(fig_chrf, use_container_width=True) + + with chart_col2: + # BLEU Score Chart + fig_bleu = px.bar( + display_bleu, + x='comparison_pair', + y='bleu_score', + title=f'BLEU Scores - {selected_lang}{chart_title_suffix}', + color='bleu_score', + color_continuous_scale='blues' + ) + fig_bleu.update_layout( + xaxis_title="Translation Pairs", + yaxis_title="BLEU Score", + xaxis_tickangle=-45, + height=400, + font=dict(family="Inter", size=12) + ) + st.plotly_chart(fig_bleu, use_container_width=True) + + with chart_col3: + # COMET Score Chart + fig_comet = px.bar( + display_comet, + x='comparison_pair', + y='comet_score', + title=f'COMET Scores - {selected_lang}{chart_title_suffix}', + color='comet_score', + color_continuous_scale='greens' + ) + fig_comet.update_layout( + xaxis_title="Translation Pairs", + yaxis_title="COMET Score", + xaxis_tickangle=-45, + height=400, + font=dict(family="Inter", size=12) + ) + st.plotly_chart(fig_comet, use_container_width=True) + + # PRIMARY SPIDER CHART - Domain Performance when available, 
Model Performance otherwise + if has_domain_data: + st.markdown(f""" +
+                    <div class="analysis-subheader">
+                        🕸️ Domain Performance Spider Charts - {selected_lang}
+                    </div>
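+                    <!-- each Scatterpolar trace below repeats its first r/theta point so the polygon closes -->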
+ """, unsafe_allow_html=True) + + # Filter out "Overall" so only domain-level values are shown + domain_bleu = lang_bleu[lang_bleu['domain'] != 'Overall'] + domain_chrf = lang_chrf[lang_chrf['domain'] != 'Overall'] + domain_comet = lang_comet[lang_comet['domain'] != 'Overall'] + + # Pivot all metrics + pivot_bleu = domain_bleu.pivot( + index='comparison_pair', + columns='domain', + values='bleu_score' + ).fillna(0) + + pivot_chrf = domain_chrf.pivot( + index='comparison_pair', + columns='domain', + values='chrf_score' + ).fillna(0) + + pivot_comet = domain_comet.pivot( + index='comparison_pair', + columns='domain', + values='comet_score' + ).fillna(0) + + # Ensure domains are in the same order for all metrics + domains = sorted(set(pivot_bleu.columns) | set(pivot_chrf.columns) | set(pivot_comet.columns)) + pivot_bleu = pivot_bleu.reindex(columns=domains, fill_value=0) + pivot_chrf = pivot_chrf.reindex(columns=domains, fill_value=0) + pivot_comet = pivot_comet.reindex(columns=domains, fill_value=0) + + # Define distinct colors with reduced opacity + distinct_colors = [ + 'rgba(255, 99, 132, 0.4)', # Red + 'rgba(54, 162, 235, 0.4)', # Blue + 'rgba(99, 255, 132, 0.4)', # Green + 'rgba(75, 192, 192, 0.4)', # Teal + 'rgba(255, 205, 86, 0.4)', # Yellow + 'rgba(153, 102, 255, 0.4)', # Purple + 'rgba(255, 159, 64, 0.4)', # Orange + 'rgba(199, 199, 199, 0.4)', # Grey + 'rgba(83, 102, 255, 0.4)', # Indigo + 'rgba(255, 99, 255, 0.4)', # Magenta + ] + + # Border colors (same colors but full opacity for borders) + border_colors = [ + 'rgba(255, 99, 132, 1.0)', # Red + 'rgba(54, 162, 235, 1.0)', # Blue + 'rgba(99, 255, 132, 1.0)', # Green + 'rgba(75, 192, 192, 1.0)', # Teal + 'rgba(255, 205, 86, 1.0)', # Yellow + 'rgba(153, 102, 255, 1.0)', # Purple + 'rgba(255, 159, 64, 1.0)', # Orange + 'rgba(199, 199, 199, 1.0)', # Grey + 'rgba(83, 102, 255, 1.0)', # Indigo + 'rgba(255, 99, 255, 1.0)', # Magenta + ] + + # Layout for three side-by-side spider charts + spider_col1, spider_col2, spider_col3 = st.columns(3) + + # ---------------- CHRF SPIDER ---------------- + with spider_col1: + fig_chrf_spider = go.Figure() + for i, (model_name, row) in enumerate(pivot_chrf.iterrows()): + color_idx = i % len(distinct_colors) + fig_chrf_spider.add_trace(go.Scatterpolar( + r=row.tolist() + [row.tolist()[0]], # close loop + theta=domains + [domains[0]], + fill='toself', + name=model_name.split('_')[-1].upper(), + fillcolor=distinct_colors[color_idx], + line=dict(color=border_colors[color_idx], width=2), + opacity=0.7, + showlegend=False # Hide legend on first chart + )) + fig_chrf_spider.update_layout( + polar=dict(radialaxis=dict(visible=True, range=[0, 1])), + showlegend=False, + title=dict(text=f"Domain Performance (chrF) - {selected_lang}"), + height=450 + ) + st.plotly_chart(fig_chrf_spider, use_container_width=True) + + # ---------------- BLEU SPIDER ---------------- + with spider_col2: + fig_bleu_spider = go.Figure() + for i, (model_name, row) in enumerate(pivot_bleu.iterrows()): + color_idx = i % len(distinct_colors) + fig_bleu_spider.add_trace(go.Scatterpolar( + r=row.tolist() + [row.tolist()[0]], # close loop + theta=domains + [domains[0]], + fill='toself', + name=model_name.split('_')[-1].upper(), + fillcolor=distinct_colors[color_idx], + line=dict(color=border_colors[color_idx], width=2), + opacity=0.7, + showlegend=True # Show legend on middle chart + )) + fig_bleu_spider.update_layout( + polar=dict(radialaxis=dict(visible=True, range=[0, 1])), + showlegend=True, + title=dict(text=f"Domain Performance (BLEU) 
- {selected_lang}"), + height=450, + legend=dict( + orientation="h", + yanchor="bottom", + y=-0.3, + xanchor="center", + x=0.5 + ) + ) + st.plotly_chart(fig_bleu_spider, use_container_width=True) + + # ---------------- COMET SPIDER ---------------- + with spider_col3: + fig_comet_spider = go.Figure() + for i, (model_name, row) in enumerate(pivot_comet.iterrows()): + color_idx = i % len(distinct_colors) + fig_comet_spider.add_trace(go.Scatterpolar( + r=row.tolist() + [row.tolist()[0]], # close loop + theta=domains + [domains[0]], + fill='toself', + name=model_name.split('_')[-1].upper(), + fillcolor=distinct_colors[color_idx], + line=dict(color=border_colors[color_idx], width=2), + opacity=0.7, + showlegend=False # Hide legend on last chart + )) + fig_comet_spider.update_layout( + polar=dict(radialaxis=dict(visible=True, range=[0, 1])), + showlegend=False, + title=dict(text=f"Domain Performance (COMET) - {selected_lang}"), + height=450 + ) + st.plotly_chart(fig_comet_spider, use_container_width=True) + + # # Overall Performance Summary + # st.markdown(""" + #
+                    #     <div class="analysis-subheader">
+                    #         📋 Overall Performance Summary
+                    #     </div>
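+                    # (left disabled: the summary merge below joins the three metric frames on
+                    # 'comparison_pair', so it assumes every frame carries identical keys)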
+ # """, unsafe_allow_html=True) + + # # Create overall summary table + # if len(display_bleu) > 0 and len(display_chrf) > 0 and len(display_comet) > 0: + # # Merge all three metrics + # merged_scores = pd.merge(display_bleu, display_chrf, on='comparison_pair', suffixes=('_bleu', '_chrf')) + # merged_scores = pd.merge(merged_scores, display_comet, on='comparison_pair') + # merged_scores['model'] = merged_scores['comparison_pair'].apply(lambda x: x.split('_')[-1].upper()) + + # summary_data = [] + # for _, row in merged_scores.iterrows(): + # summary_data.append({ + # 'Model': row['model'], + # 'BLEU Score': f"{row['bleu_score']:.3f}", + # 'chrF Score': f"{row['chrf_score']:.3f}", + # 'COMET Score': f"{row['comet_score']:.3f}", + # 'Average': f"{(row['bleu_score'] + row['chrf_score'] + row['comet_score']) / 3:.3f}" + # }) + + # summary_df = pd.DataFrame(summary_data) + + # # Only sort if dataframe has data and 'Average' column exists + # if len(summary_df) > 0 and 'Average' in summary_df.columns: + # summary_df = summary_df.sort_values('Average', ascending=False) + + # # Style the dataframe + # st.dataframe( + # summary_df, + # use_container_width=True, + # hide_index=True, + # column_config={ + # "Model": st.column_config.TextColumn("Model", width="medium"), + # "BLEU Score": st.column_config.NumberColumn("BLEU Score", format="%.3f"), + # "chrF Score": st.column_config.NumberColumn("chrF Score", format="%.3f"), + # "COMET Score": st.column_config.NumberColumn("COMET Score", format="%.3f"), + # "Average": st.column_config.NumberColumn("Average", format="%.3f") + # } + # ) + + with analysis_tab3: + # Revision Analysis Tab + st.markdown(""" +
+            <div class="analysis-header">
+                <div class="analysis-title">
+                    ✏️ Human Translation Revision Analysis for {selected_lang}
+                </div>
+            </div>
+ """.format(selected_lang=selected_lang), unsafe_allow_html=True) + + # Use the global language selection + rev_code = code + + # Check for revision columns + human_col = f"{rev_code}_human" + revised_col = f"{rev_code}_revised" + + if human_col in df_translations.columns and revised_col in df_translations.columns: + # Get all rows with human translations for this language + df_lang_data = df_translations[[human_col, revised_col]].copy() + + # Remove rows where human translation is missing (can't analyze revisions without original) + df_lang_data = df_lang_data[df_lang_data[human_col].notna()].copy() + + total_human_translations = len(df_lang_data) + + if total_human_translations == 0: + st.warning(f"⚠️ No human translations found for {selected_lang}") + else: + # Calculate revision statistics + # For missing revised translations, we assume no revision was made (same as original) + df_lang_data[revised_col] = df_lang_data[revised_col].fillna(df_lang_data[human_col]) + + # Count actual changes + revisions_made = sum(df_lang_data[human_col] != df_lang_data[revised_col]) + revision_rate = (revisions_made / total_human_translations) * 100 + + # Count how many had revision data available + revisions_available = sum(df_translations[revised_col].notna()) + + # Calculate revision types + def categorize_revision(original, revised): + if pd.isna(original) or pd.isna(revised): + return "Missing Data" + if str(original).strip() == str(revised).strip(): + return "No Change" + + orig_words = str(original).lower().split() + rev_words = str(revised).lower().split() + + if len(rev_words) > len(orig_words): + return "Expansion" + elif len(rev_words) < len(orig_words): + return "Reduction" + else: + return "Modification" + + df_lang_data['revision_type'] = df_lang_data.apply( + lambda row: categorize_revision(row[human_col], row[revised_col]), axis=1 + ) + + # Revision statistics cards + rev_col1, rev_col2, rev_col3, rev_col4 = st.columns(4) + + with rev_col1: + st.markdown(f""" +
+                        <div class="stats-card">
+                            <div class="stats-label">Human Translations</div>
+                            <div class="stats-value">{total_human_translations}</div>
+                        </div>
+ """, unsafe_allow_html=True) + + with rev_col2: + st.markdown(f""" +
+                        <div class="stats-card">
+                            <div class="stats-label">Revisions Available</div>
+                            <div class="stats-value">{revisions_available}</div>
+                        </div>
+ """, unsafe_allow_html=True) + + with rev_col3: + st.markdown(f""" +
+                        <div class="stats-card">
+                            <div class="stats-label">Changes Made</div>
+                            <div class="stats-value">{revisions_made}</div>
+                        </div>
+ """, unsafe_allow_html=True) + + with rev_col4: + st.markdown(f""" +
+                        <div class="stats-card">
+                            <div class="stats-label">Revision Rate</div>
+                            <div class="stats-value">{revision_rate:.1f}%</div>
+                        </div>
+ """, unsafe_allow_html=True) + + # Revision type analysis + st.markdown(""" +
+                <div class="analysis-subheader">
+                    📈 Revision Pattern Analysis
+                </div>
+ """, unsafe_allow_html=True) + + revision_counts = df_lang_data['revision_type'].value_counts() + + if len(revision_counts) > 0: + # Create revision type charts + rev_chart_col1, rev_chart_col2 = st.columns(2) + + with rev_chart_col1: + # Pie chart of revision types + fig_pie = px.pie( + values=revision_counts.values, + names=revision_counts.index, + title=f"Revision Types Distribution", + color_discrete_sequence=px.colors.qualitative.Set3 + ) + fig_pie.update_layout(height=400, font=dict(family="Inter", size=12)) + st.plotly_chart(fig_pie, use_container_width=True) + + with rev_chart_col2: + # Bar chart of revision types + fig_bar = px.bar( + x=revision_counts.values, + y=revision_counts.index, + orientation='h', + title=f"Revision Frequency", + color=revision_counts.values, + color_continuous_scale='viridis' + ) + fig_bar.update_layout( + height=400, + xaxis_title="Count", + yaxis_title="Revision Type", + font=dict(family="Inter", size=12) + ) + st.plotly_chart(fig_bar, use_container_width=True) + + # Word-level revision analysis + st.markdown(""" +
+                <div class="analysis-subheader">
+                    🔤 Word-Level Changes Analysis
+                </div>
+ """, unsafe_allow_html=True) + + # Calculate word changes only for actual revisions + words_added = [] + words_removed = [] + + changed_revisions = df_lang_data[df_lang_data['revision_type'] != 'No Change'] + + for _, row in changed_revisions.iterrows(): + if pd.notna(row[human_col]) and pd.notna(row[revised_col]): + orig_words = set(str(row[human_col]).lower().split()) + rev_words = set(str(row[revised_col]).lower().split()) + + added = rev_words - orig_words + removed = orig_words - rev_words + + words_added.extend(list(added)) + words_removed.extend(list(removed)) + + from collections import Counter + added_counts = Counter(words_added) + removed_counts = Counter(words_removed) + + word_analysis_col1, word_analysis_col2 = st.columns(2) + + with word_analysis_col1: + st.markdown("**🟢 Most Added Words**") + if added_counts: + top_added = dict(added_counts.most_common(15)) + + # Create horizontal bar chart for added words + fig_added = px.bar( + x=list(top_added.values()), + y=list(top_added.keys()), + orientation='h', + title="Most Frequently Added Words", + color=list(top_added.values()), + color_continuous_scale='Greens' + ) + fig_added.update_layout( + height=400, + xaxis_title="Frequency", + yaxis_title="Words", + font=dict(family="Inter", size=10) + ) + st.plotly_chart(fig_added, use_container_width=True) + else: + st.markdown("*No words added in revisions*") + + with word_analysis_col2: + st.markdown("**🔴 Most Removed Words**") + if removed_counts: + top_removed = dict(removed_counts.most_common(15)) + + # Create horizontal bar chart for removed words + fig_removed = px.bar( + x=list(top_removed.values()), + y=list(top_removed.keys()), + orientation='h', + title="Most Frequently Removed Words", + color=list(top_removed.values()), + color_continuous_scale='Reds' + ) + fig_removed.update_layout( + height=400, + xaxis_title="Frequency", + yaxis_title="Words", + font=dict(family="Inter", size=10) + ) + st.plotly_chart(fig_removed, use_container_width=True) + else: + st.markdown("*No words removed in revisions*") + + # Revision examples + st.markdown(""" +
+                <div class="analysis-subheader">
+                    📝 Revision Examples
+                </div>
+ """, unsafe_allow_html=True) + + # Show examples of different types of revisions + revision_examples = changed_revisions.head(10) + if len(revision_examples) > 0: + + # Create tabs for different revision types + available_types = revision_examples['revision_type'].unique() + if len(available_types) > 1: + type_tabs = st.tabs([f"{rtype} ({len(revision_examples[revision_examples['revision_type'] == rtype])})" + for rtype in available_types]) + + for i, rtype in enumerate(available_types): + with type_tabs[i]: + type_examples = revision_examples[revision_examples['revision_type'] == rtype].head(5) + for idx, row in type_examples.iterrows(): + st.markdown(f""" +
+                                        <div class="revision-example">
+                                            <div class="revision-label">Original:</div>
+                                            <div class="revision-text">{row[human_col]}</div>
+                                            <div class="revision-label">Revised:</div>
+                                            <div class="revision-text">{row[revised_col]}</div>
+                                            <div class="revision-type">Type: {row['revision_type']}</div>
+                                        </div>
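+                                        <!-- note: row values are interpolated verbatim, so any HTML inside a
+                                             translation string would render here -->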
+ """, unsafe_allow_html=True) + else: + # Single type, show directly + for idx, row in revision_examples.iterrows(): + st.markdown(f""" +
+                                    <div class="revision-example">
+                                        <div class="revision-label">Original:</div>
+                                        <div class="revision-text">{row[human_col]}</div>
+                                        <div class="revision-label">Revised:</div>
+                                        <div class="revision-text">{row[revised_col]}</div>
+                                        <div class="revision-type">Type: {row['revision_type']}</div>
+                                    </div>
+ """, unsafe_allow_html=True) + else: + st.info(f"No revisions found for {selected_lang}.") + else: + st.info(f"No revision data available for analysis.") + + else: + st.warning(f"⚠️ Revision columns not found for {selected_lang}. Expected columns: `{human_col}` and `{revised_col}`") + + with analysis_tab4: + # Translation comparison section + st.markdown(""" +
+            <div class="analysis-header">
+                <div class="analysis-title">
+                    🔍 Translation Comparison & Word Analysis for {selected_lang}
+                </div>
+            </div>
+ """.format(selected_lang=selected_lang), unsafe_allow_html=True) + + # Use the global language selection + comp_code = code + + # Get available translation columns for selected language + available_cols = [] + for col in df_translations.columns: + if col.startswith(comp_code) and col != 'english': + available_cols.append(col) + + if len(available_cols) >= 2: + comp_col1, comp_col2, comp_col3 = st.columns([1, 1, 1]) + + with comp_col1: + col1_selection = st.selectbox( + "First Translation:", + available_cols, + key="col1_select" + ) + + with comp_col2: + col2_selection = st.selectbox( + "Second Translation:", + [col for col in available_cols if col != col1_selection], + key="col2_select" + ) + + with comp_col3: + # Add spacing to align button with selectboxes + st.markdown('
', unsafe_allow_html=True) + analyze_clicked = st.button( + "🔍 Analyze", + type="primary", + use_container_width=True, + key="analyze_word_diff_btn" + ) + + if analyze_clicked: + # Perform word analysis with ALL available data + def get_word_differences(text1, text2): + # Handle missing data by using available text + if pd.isna(text1) and pd.isna(text2): + return set(), set(), set() + + # If one is missing, treat it as empty for comparison + words1 = set(str(text1).lower().split()) if pd.notna(text1) else set() + words2 = set(str(text2).lower().split()) if pd.notna(text2) else set() + + only_in_1 = words1 - words2 + only_in_2 = words2 - words1 + common = words1 & words2 + + return only_in_1, only_in_2, common + + # Analyze ALL rows with available data + unique_words_1 = [] + unique_words_2 = [] + common_words = [] + all_words_1 = [] # For frequency counting + all_words_2 = [] # For frequency counting + + # Process all rows, including those with missing revisions + for _, row in df_translations.iterrows(): + # Get text from columns, using original if revision is missing + text1 = row[col1_selection] if pd.notna(row[col1_selection]) else None + text2 = row[col2_selection] if pd.notna(row[col2_selection]) else None + + # Skip if both are missing + if text1 is None and text2 is None: + continue + + # Collect ALL words from each column for frequency analysis + if text1 is not None: + words_from_1 = str(text1).lower().split() + all_words_1.extend(words_from_1) + + if text2 is not None: + words_from_2 = str(text2).lower().split() + all_words_2.extend(words_from_2) + + # Only do comparison if both texts exist + if text1 is not None and text2 is not None: + only_1, only_2, common = get_word_differences(text1, text2) + unique_words_1.extend(list(only_1)) + unique_words_2.extend(list(only_2)) + common_words.extend(list(common)) + + from collections import Counter + + # Count frequencies from ALL words + all_freq_1 = Counter(all_words_1) # All words from column 1 + all_freq_2 = Counter(all_words_2) # All words from column 2 + unique_freq_1 = Counter(unique_words_1) # Only unique words + unique_freq_2 = Counter(unique_words_2) # Only unique words + common_freq = Counter(common_words) # Only common words + + # Display statistics + st.markdown('
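+<!-- the stats below come from get_word_differences: words are lowercased and whitespace-split;
+     unique/common counts only use rows where both columns have text -->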
+<div class="results-section">', unsafe_allow_html=True)
+
+                    col_result1, col_result2, col_result3, col_result4 = st.columns(4)
+
+                    with col_result1:
+                        st.markdown(f"""
+                        <div class="stats-card">
+                            <div class="stats-label">Unique to {col1_selection.replace('_', ' ').title()}</div>
+                            <div class="stats-value">{len(unique_freq_1)}</div>
+                            <div class="stats-note">unique words</div>
+                        </div>
+ """, unsafe_allow_html=True) + + with col_result2: + st.markdown(f""" +
+                        <div class="stats-card">
+                            <div class="stats-label">Unique to {col2_selection.replace('_', ' ').title()}</div>
+                            <div class="stats-value">{len(unique_freq_2)}</div>
+                            <div class="stats-note">unique words</div>
+                        </div>
+ """, unsafe_allow_html=True) + + with col_result3: + st.markdown(f""" +
+                        <div class="stats-card">
+                            <div class="stats-label">Common Words</div>
+                            <div class="stats-value">{len(common_freq)}</div>
+                            <div class="stats-note">shared words</div>
+                        </div>
+ """, unsafe_allow_html=True) + + with col_result4: + st.markdown(f""" +
+                        <div class="stats-card">
+                            <div class="stats-label">Total Vocabulary</div>
+                            <div class="stats-value">{len(set(all_words_1 + all_words_2))}</div>
+                            <div class="stats-note">total unique words</div>
+                        </div>
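+                        <!-- total vocabulary = distinct lowercased tokens pooled from both columns -->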
+ """, unsafe_allow_html=True) + + st.markdown('
+</div>', unsafe_allow_html=True)
+
+                    # Word Clouds Section
+                    st.markdown("""
+                    <div class="analysis-subheader">
+                        ☁️ Word Clouds Visualization
+                    </div>
+ """, unsafe_allow_html=True) + + # Generate word clouds using matplotlib and wordcloud + try: + # Show loading spinner while generating word clouds + with st.spinner("🎨 Generating word clouds... This may take a moment."): + import matplotlib.pyplot as plt + from wordcloud import WordCloud + import io + import base64 + + # Function to create word cloud image (optimized) + def create_wordcloud_image(word_freq, title, color_scheme='viridis'): + if not word_freq or len(word_freq) == 0: + return None + + try: + # Create word cloud with all frequency data, but limit max_words to 25 + wordcloud = WordCloud( + width=300, # Reduced size + height=200, # Reduced size + background_color='white', + colormap=color_scheme, + max_words=25, # Display top 25 words + relative_scaling=0.6, + random_state=42, + min_font_size=8, + max_font_size=60, + prefer_horizontal=0.9, + collocations=False # Avoid word combinations + ).generate_from_frequencies(word_freq) # Use ALL frequency data + + # Create matplotlib figure with smaller size + fig, ax = plt.subplots(figsize=(5, 3)) # Smaller figure + ax.imshow(wordcloud, interpolation='bilinear') + ax.axis('off') + ax.set_title(title, fontsize=10, fontweight='bold', pad=10) + + # Convert to base64 for HTML display + buffer = io.BytesIO() + plt.savefig(buffer, format='png', bbox_inches='tight', dpi=100, facecolor='white') # Lower DPI + buffer.seek(0) + image_base64 = base64.b64encode(buffer.getvalue()).decode() + plt.close(fig) # Important: close figure to free memory + + return image_base64 + except Exception as e: + st.warning(f"Error creating word cloud for {title}: {str(e)}") + return None + + # Create all word clouds in one row + cloud_col1, cloud_col2, cloud_col3 = st.columns(3) + + with cloud_col1: + if unique_freq_1 and len(unique_freq_1) > 0: + # Use ALL unique words but display top 25 in cloud + img1 = create_wordcloud_image( + dict(unique_freq_1), # Use ALL unique words for frequency + f"Unique: {col1_selection.replace('_', ' ').title()}", + 'Reds' + ) + if img1: + st.markdown(f''' +
+                                    <div class="wordcloud-card">
+                                        <img src="data:image/png;base64,{img1}" style="width:100%;" alt="word cloud"/>
+                                        <div class="wordcloud-note">
+                                            Showing top 25 of {len(unique_freq_1)} unique words
+                                        </div>
+                                    </div>
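+                                    <!-- create_wordcloud_image ranks over the full frequency dict but caps the
+                                         rendered cloud at max_words=25 -->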
+ ''', unsafe_allow_html=True) + else: + st.markdown(""" +
+                                    <div class="empty-state">
+                                        <div class="empty-icon">📝</div>
+                                        <div>No unique words</div>
+                                    </div>
+ """, unsafe_allow_html=True) + else: + st.markdown(""" +
+                                    <div class="empty-state">
+                                        <div class="empty-icon">📝</div>
+                                        <div>No unique words found</div>
+                                    </div>
+ """, unsafe_allow_html=True) + + with cloud_col2: + if unique_freq_2 and len(unique_freq_2) > 0: + # Use ALL unique words but display top 25 in cloud + img2 = create_wordcloud_image( + dict(unique_freq_2), # Use ALL unique words for frequency + f"Unique: {col2_selection.replace('_', ' ').title()}", + 'Greens' + ) + if img2: + st.markdown(f''' +
+                                    <div class="wordcloud-card">
+                                        <img src="data:image/png;base64,{img2}" style="width:100%;" alt="word cloud"/>
+                                        <div class="wordcloud-note">
+                                            Showing top 25 of {len(unique_freq_2)} unique words
+                                        </div>
+                                    </div>
+ ''', unsafe_allow_html=True) + else: + st.markdown(""" +
+                                    <div class="empty-state">
+                                        <div class="empty-icon">📝</div>
+                                        <div>No unique words</div>
+                                    </div>
+ """, unsafe_allow_html=True) + else: + st.markdown(""" +
+                                    <div class="empty-state">
+                                        <div class="empty-icon">📝</div>
+                                        <div>No unique words found</div>
+                                    </div>
+ """, unsafe_allow_html=True) + + with cloud_col3: + if common_freq and len(common_freq) > 0: + # Use ALL common words but display top 25 in cloud + img3 = create_wordcloud_image( + dict(common_freq), # Use ALL common words for frequency + "Common Words", + 'Blues' + ) + if img3: + st.markdown(f''' +
+                                    <div class="wordcloud-card">
+                                        <img src="data:image/png;base64,{img3}" style="width:100%;" alt="word cloud"/>
+                                        <div class="wordcloud-note">
+                                            Showing top 25 of {len(common_freq)} common words
+                                        </div>
+                                    </div>
+ ''', unsafe_allow_html=True) + else: + st.markdown(""" +
+                                    <div class="empty-state">
+                                        <div class="empty-icon">📝</div>
+                                        <div>No common words</div>
+                                    </div>
+ """, unsafe_allow_html=True) + else: + st.markdown(""" +
+                                    <div class="empty-state">
+                                        <div class="empty-icon">🤝</div>
+                                        <div>No common words found</div>
+                                    </div>
+ """, unsafe_allow_html=True) + + except ImportError: + st.warning("📦 WordCloud library not available. Install with: `pip install wordcloud`") + + # Fallback to top words lists + st.markdown("**📋 Top Unique Words (Fallback)**") + + fallback_col1, fallback_col2, fallback_col3 = st.columns(3) + + with fallback_col1: + st.markdown(f"**🔴 Unique to {col1_selection.replace('_', ' ').title()}**") + if unique_freq_1: + for word, count in unique_freq_1.most_common(10): + st.markdown(f"• {word} ({count})") + else: + st.markdown("*No unique words*") + + with fallback_col2: + st.markdown(f"**🟢 Unique to {col2_selection.replace('_', ' ').title()}**") + if unique_freq_2: + for word, count in unique_freq_2.most_common(10): + st.markdown(f"• {word} ({count})") + else: + st.markdown("*No unique words*") + + with fallback_col3: + st.markdown("**🔵 Common Words**") + if common_freq: + for word, count in common_freq.most_common(10): + st.markdown(f"• {word} ({count})") + else: + st.markdown("*No common words*") + + # Word frequency bar charts as additional analysis + st.markdown(""" +
+                    <div class="analysis-subheader">
+                        📊 Top Words Frequency Comparison
+                    </div>
+ """, unsafe_allow_html=True) + + freq_col1, freq_col2 = st.columns(2) + + with freq_col1: + if unique_freq_1: + top_words_1 = dict(unique_freq_1.most_common(10)) + fig_freq1 = px.bar( + x=list(top_words_1.values()), + y=list(top_words_1.keys()), + orientation='h', + title=f"Top Unique Words: {col1_selection.replace('_', ' ').title()}", + color=list(top_words_1.values()), + color_continuous_scale='Reds' + ) + fig_freq1.update_layout( + height=400, + xaxis_title="Frequency", + yaxis_title="Words", + font=dict(family="Inter", size=10) + ) + st.plotly_chart(fig_freq1, use_container_width=True) + + with freq_col2: + if unique_freq_2: + top_words_2 = dict(unique_freq_2.most_common(10)) + fig_freq2 = px.bar( + x=list(top_words_2.values()), + y=list(top_words_2.keys()), + orientation='h', + title=f"Top Unique Words: {col2_selection.replace('_', ' ').title()}", + color=list(top_words_2.values()), + color_continuous_scale='Greens' + ) + fig_freq2.update_layout( + height=400, + xaxis_title="Frequency", + yaxis_title="Words", + font=dict(family="Inter", size=10) + ) + st.plotly_chart(fig_freq2, use_container_width=True) + else: + st.warning("⚠️ Need at least 2 translation columns for comparison analysis.") + + else: + st.markdown(""" +
+        <div class="error-state">
+            <div class="error-title">❌ No Data Available</div>
+            <div class="error-text">
+                Please ensure translation data files are available in the data directory.
+            </div>
+        </div>
+ """, unsafe_allow_html=True) + + # Footer + st.markdown("---") + st.markdown(""" +
+    <div class="footer">
+        Built for DSFSI using Streamlit • Translation APIs: Gemini, GPT, NLLB (hosted locally) • Data Science for Social Impact
+    </div>
+ """, unsafe_allow_html=True) -""" -# Welcome to Streamlit! - -Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:. -If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community -forums](https://discuss.streamlit.io). - -In the meantime, below is an example of what you can do with just a few lines of code: -""" - -num_points = st.slider("Number of points in spiral", 1, 10000, 1100) -num_turns = st.slider("Number of turns in spiral", 1, 300, 31) - -indices = np.linspace(0, 1, num_points) -theta = 2 * np.pi * num_turns * indices -radius = indices - -x = radius * np.cos(theta) -y = radius * np.sin(theta) - -df = pd.DataFrame({ - "x": x, - "y": y, - "idx": indices, - "rand": np.random.randn(num_points), -}) - -st.altair_chart(alt.Chart(df, height=700, width=700) - .mark_point(filled=True) - .encode( - x=alt.X("x", axis=None), - y=alt.Y("y", axis=None), - color=alt.Color("idx", legend=None, scale=alt.Scale()), - size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])), - )) \ No newline at end of file +if __name__ == "__main__": + main() \ No newline at end of file