diff --git "a/src/streamlit_app.py" "b/src/streamlit_app.py"
--- "a/src/streamlit_app.py"
+++ "b/src/streamlit_app.py"
@@ -1,40 +1,2077 @@
-import altair as alt
-import numpy as np
-import pandas as pd
import streamlit as st
+import pandas as pd
+import plotly.express as px
+import plotly.graph_objects as go
+from plotly.subplots import make_subplots
+import os
+import time
+from PIL import Image
+
+# Only import APIs if available
+try:
+ from google import genai
+ GENAI_AVAILABLE = True
+except ImportError:
+ GENAI_AVAILABLE = False
+
+try:
+ from openai import OpenAI
+ OPENAI_AVAILABLE = True
+except ImportError:
+ OPENAI_AVAILABLE = False
+
+BASE_DIR = os.path.dirname(__file__)
+DATA_DIR = os.path.join(BASE_DIR, "data")
+
+# Page configuration
+st.set_page_config(
+ page_title="Translation Comparison Tool",
+ page_icon="🌐",
+ layout="wide",
+ initial_sidebar_state="collapsed"
+)
+
+# Custom CSS for Material Design with Tailwind-inspired styling
+st.markdown("""
+
+""", unsafe_allow_html=True)
+
+# Model configurations
+MODEL_CONFIG = {
+ 'Gemini': {
+ 'languages': ['Afrikaans', 'Northern Sotho', 'isiZulu'],
+ 'models': ['gemini-2.0-flash-exp', 'gemini-1.5-flash', 'gemini-1.5-pro'],
+ 'default_model': 'gemini-2.0-flash-exp'
+ },
+ 'GPT': {
+ 'languages': ['Afrikaans', 'Northern Sotho', 'isiZulu'],
+ 'models': ['gpt-4', 'gpt-4-turbo', 'gpt-3.5-turbo'],
+ 'default_model': 'gpt-4'
+ },
+ 'NLLB': {
+ 'languages': ['Northern Sotho', 'isiZulu'], # No Afrikaans model available
+ 'models': {
+ 'Northern Sotho': 'dsfsi/dcs-eng-nso-nllb-1.3B',
+ 'isiZulu': 'dsfsi/dcs-eng-zul-nllb-1.3B'
+ }
+ }
+}
+
+# Language code mappings
+LANGUAGE_CODES = {
+ 'Afrikaans': 'afr',
+ 'Northern Sotho': 'nso',
+ 'isiZulu': 'isizulu'
+}
+
+# Load logo
+def load_logo():
+ """Load logo with error handling"""
+ try:
+ if os.path.exists(f"{BASE_DIR}/logo.png"):
+ return Image.open(f"{BASE_DIR}/logo.png")
+ except Exception as e:
+ st.warning(f"Could not load logo: {str(e)}")
+ return None
+
+# Load and cache data
+@st.cache_data
+def load_translation_data():
+ """Load sample translation data"""
+ try:
+ sample_data = {
+ 'english': ['Hello world', 'How are you?', 'Good morning', 'Thank you', 'Welcome', 'Goodbye'],
+ 'afr': ['Hallo wêreld', 'Hoe gaan dit?', 'Goeie môre', 'Dankie', 'Welkom', 'Totsiens'],
+ 'afr_rev': ['Hallo wêreld', 'Hoe gaan dit met jou?', 'Goeie môre', 'Baie dankie', 'Welkom', 'Totsiens'],
+ 'nso': ['Dumela lefase', 'O kae?', 'Thobela', 'Ke a leboga', 'O amogetšwe', 'Šala gabotse'],
+ 'nso_rev': ['Dumela lefase', 'O phela bjang?', 'Thobela', 'Ke a leboga kudu', 'O amogetšwe', 'Šala gabotse'],
+ 'isizulu': ['Sawubona mhlaba', 'Unjani?', 'Sawubona', 'Ngiyabonga', 'Wamukelekile', 'Sala kahle'],
+ 'isizulu_rev': ['Sawubona mhlaba', 'Unjani wena?', 'Sawubona', 'Ngiyabonga kakhulu', 'Wamukelekile', 'Sala kahle'],
+ 'nso_mt_nllb': ['Dumela lefase', 'O kae?', 'Thobela', 'Ke a leboga', 'O amogetšwe', 'Šala gabotse'],
+ 'isizulu_mt_nllb': ['Sawubona mhlaba', 'Unjani?', 'Sawubona', 'Ngiyabonga', 'Wamukelekile', 'Sala kahle'],
+ 'afr_mt_gpt': ['Hallo wêreld', 'Hoe gaan dit?', 'Goeie môre', 'Dankie', 'Welkom', 'Totsiens'],
+ 'nso_mt_gpt': ['Dumela lefase', 'O kae?', 'Thobela', 'Ke a leboga', 'O amogetšwe', 'Šala gabotse'],
+ 'isizulu_mt_gpt': ['Sawubona mhlaba', 'Unjani?', 'Sawubona', 'Ngiyabonga', 'Wamukelekile', 'Sala kahle'],
+ 'afr_mt_gemini': ['Hallo wêreld', 'Hoe is dit?', 'Goeie môre', 'Baie dankie', 'Welkom', 'Totsiens'],
+ 'nso_mt_gemini': ['Dumela lefase', 'O phela bjang?', 'Thobela', 'Ke a leboga kudu', 'O amogetšwe', 'Šala gabotse'],
+ 'isizulu_mt_gemini': ['Sawubona mhlaba', 'Unjani wena?', 'Sawubona', 'Ngiyabonga kakhulu', 'Wamukelekile', 'Sala kahle']
+ }
+ return pd.DataFrame(sample_data)
+ except Exception as e:
+ st.error(f"Error loading data: {str(e)}")
+ return pd.DataFrame({'english': ['Sample text'], 'error': ['Data loading failed']})
+
+def translate_with_gemini(text, target_language, model_name="gemini-2.0-flash-exp", client=None):
+ """Translate text using Gemini API"""
+ try:
+ if not GENAI_AVAILABLE:
+ return "❌ Gemini library not installed"
+
+ if not client:
+ return "❌ Gemini API not configured. Please check your GEMINI_API_KEY."
+
+ lang_map = {
+ 'Afrikaans': 'Afrikaans',
+ 'Northern Sotho': 'Northern Sotho (Sepedi)',
+ 'isiZulu': 'isiZulu'
+ }
+
+ prompt = f"Translate the following English text to {lang_map.get(target_language, target_language)}: '{text}'. Provide only the translation without any explanations."
+
+ response = client.models.generate_content(
+ model=model_name, contents=prompt
+ )
+ return response.text.strip()
+ except Exception as e:
+ return f"❌ Error: {str(e)}"
+
+def translate_with_openai(text, target_language, model_name="gpt-4o", client=None):
+ """Translate text using OpenAI API with Chat Completions"""
+ try:
+ if not OPENAI_AVAILABLE:
+ return "❌ OpenAI library not installed"
+
+ if not client:
+ return "❌ OpenAI API not configured. Please check your OPENAI_API_KEY."
+
+ lang_map = {
+ 'Afrikaans': 'Afrikaans',
+ 'Northern Sotho': 'Northern Sotho (Sepedi)',
+ 'isiZulu': 'isiZulu'
+ }
+
+        # Use the Chat Completions API
+ response = client.chat.completions.create(
+ model=model_name,
+ messages=[
+ {"role": "system", "content": "You are a professional translator. Provide only the translation without any explanations."},
+ {"role": "user", "content": f"Translate the following text to {lang_map.get(target_language, target_language)}: {text}"}
+ ],
+ max_tokens=1000,
+ temperature=0.3 # Lower temperature for more consistent translations
+ )
+
+ return response.choices[0].message.content.strip()
+
+ except Exception as e:
+ return f"❌ Error: {str(e)}"
+
+@st.cache_resource
+def initialize_apis():
+ """Initialize API clients with proper error handling, supporting both local and HF Spaces."""
+ genai_client = None
+ openai_client = None
+
+    def get_secret(name):
+        """Fetch a secret from env first (Docker Spaces), then Streamlit secrets."""
+        value = os.environ.get(name)
+        if value:
+            return value
+        try:
+            # st.secrets raises if no secrets.toml exists (e.g. bare Docker)
+            return st.secrets[name] if name in st.secrets else None
+        except Exception:
+            return None
+
+ try:
+ # Gemini API
+ if GENAI_AVAILABLE:
+ try:
+ api_key = get_secret("GEMINI_API_KEY")
+ if api_key:
+ genai_client = genai.Client(api_key=api_key)
+ else:
+ st.warning("⚠️ Gemini API key not found")
+ except Exception as e:
+ st.error(f"❌ Gemini API error: {str(e)}")
+
+ # OpenAI API
+ if OPENAI_AVAILABLE:
+ try:
+ api_key = get_secret("OPENAI_API_KEY")
+ if api_key:
+                try:
+                    # New-style client (openai>=1.0)
+                    openai_client = OpenAI(api_key=api_key)
+                except TypeError:
+                    # Legacy fallback (openai<1.0). Note: the module object has no
+                    # .chat.completions, so translate_with_openai expects the
+                    # new-style client in practice.
+                    import openai
+                    openai.api_key = api_key
+                    openai_client = openai
+ else:
+ st.warning("⚠️ OpenAI API key not found")
+ except Exception as e:
+ st.error(f"❌ OpenAI API error: {str(e)}")
+
+ except Exception as e:
+ st.error(f"❌ API initialization error: {str(e)}")
+
+ return genai_client, openai_client
+
+def translate_with_nllb(text, target_language):
+ """Translate text using unified NLLB API"""
+ try:
+ import requests
+
+        # Single ngrok URL for the unified API. ngrok tunnels are ephemeral;
+        # replace this with your own endpoint when redeploying.
+        API_URL = "https://4c2faecc052a.ngrok-free.app"
+
+ # Map Streamlit language names to API format
+ lang_mapping = {
+ 'Northern Sotho': 'nso',
+ 'isiZulu': 'zul'
+ }
+
+ api_lang = lang_mapping.get(target_language, target_language.lower())
+
+ response = requests.post(
+ f"{API_URL}/translate_simple",
+ params={
+ "text": text,
+ "target_language": api_lang
+ },
+ timeout=30
+ )
+
+ if response.status_code == 200:
+ result = response.json()
+ return result.get(api_lang, '❌ Translation not found')
+ else:
+ return f"❌ API Error: {response.status_code}"
+
+ except Exception as e:
+ return f"❌ Error: {str(e)}"
+
+def create_language_tabs(available_languages, current_language, key_suffix=""):
+    """Create language tabs with proper styling.
+
+    (The original tab markup was stripped from this diff view; the HTML below
+    is a minimal reconstruction.)
+    """
+    tabs_html = '<div class="language-tabs">'
+    for lang in available_languages:
+        active = ' active' if lang == current_language else ''
+        tabs_html += f'<span class="language-tab{active}">{lang}</span>'
+    tabs_html += '</div>'
+    st.markdown(tabs_html, unsafe_allow_html=True)
+
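+# NOTE: the original scaffolding between the helpers above and the widgets
+# below (the `main()` definition, page header, provider/model selectors, and
+# tab/column setup) was lost from this diff view. The sketch below is a hedged
+# reconstruction: every name in it (`tab1`, `tab2`, `col_left`, `col_center`,
+# `col_right`, `selected_provider`, `selected_model`) is inferred from how it
+# is used later in this file, not copied from the original.
+def main():
+    # Page header
+    logo = load_logo()
+    if logo:
+        st.image(logo, width=120)
+    st.title("🌐 Translation Comparison Tool")
+
+    # API clients (created once per process via @st.cache_resource)
+    genai_client, openai_client = initialize_apis()
+
+    # Provider and model selection
+    selected_provider = st.selectbox("Translation Provider:", list(MODEL_CONFIG.keys()))
+    provider_cfg = MODEL_CONFIG[selected_provider]
+    available_languages = provider_cfg['languages']
+    if isinstance(provider_cfg.get('models'), list):
+        default_idx = provider_cfg['models'].index(provider_cfg['default_model'])
+        selected_model = st.selectbox("Model:", provider_cfg['models'], index=default_idx)
+    else:
+        selected_model = None  # NLLB resolves its model per language server-side
+
+    # Session-state defaults read by the widgets below
+    if 'target_language' not in st.session_state or st.session_state.target_language not in available_languages:
+        st.session_state.target_language = available_languages[0]
+    if 'translation_result' not in st.session_state:
+        st.session_state.translation_result = ""
+
+    tab1, tab2 = st.tabs(["🔄 Live Translation", "📊 Analysis"])
+
+    with tab1:
+        # Three-column layout: input | controls | output
+        col_left, col_center, col_right = st.columns([5, 2, 5])
+
+        # Left - source text input
+        with col_left: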
+ input_text = st.text_area(
+ "Input",
+ placeholder="Input text here",
+ height=350,
+ key="input_text_live",
+ label_visibility="collapsed"
+ )
+
+ # Center - Translate Button
+ with col_center:
+ # Add spacing to align button with text areas
+        st.markdown('<div style="height: 2rem;"></div>', unsafe_allow_html=True)
+
+ # Language selection buttons
+ lang_cols = st.columns(len(available_languages))
+ for i, lang in enumerate(available_languages):
+ with lang_cols[i]:
+ button_type = "primary" if lang == st.session_state.target_language else "secondary"
+ if st.button(
+ lang,
+ key=f"lang_btn_{lang}_live",
+ type=button_type,
+ use_container_width=True
+ ):
+ if st.session_state.target_language != lang: # Only update if different
+ st.session_state.target_language = lang
+ st.session_state.translation_result = "" # Clear previous result
+ st.rerun()
+
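+        # Translate trigger (reconstruction: `translate_clicked` is read below,
+        # but its definition was missing from this diff view)
+        translate_clicked = st.button("Translate", type="primary", use_container_width=True, key="translate_btn_live")
+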
+ # Translation logic
+ if translate_clicked and input_text:
+ with st.spinner("Translating..."):
+ target_lang = st.session_state.target_language
+
+ if selected_provider == 'Gemini':
+ result = translate_with_gemini(input_text, target_lang, selected_model, genai_client)
+
+ elif selected_provider == 'GPT':
+ result = translate_with_openai(input_text, target_lang, selected_model, openai_client)
+
+ elif selected_provider == 'NLLB':
+ result = translate_with_nllb(input_text, target_lang)
+
+ st.session_state.translation_result = result
+
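+    # Right - translation output (column context reconstructed; see note above)
+    with col_right: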
+ # Translation output area with proper labeling
+ st.text_area(
+ f"Translation ({st.session_state.target_language})", # Dynamic label
+ value=st.session_state.translation_result,
+ placeholder="Translation will appear here",
+ height=350,
+ key="translation_output_live_fixed", # Changed key to avoid conflicts
+ disabled=True,
+ label_visibility="collapsed"
+ )
+
+ # Support information
+ st.markdown("""
+
+ Available Models:
+ 🔮 Gemini: All languages (gemini-2.0-flash-exp, gemini-1.5-flash, gemini-1.5-pro)
+ 🧠 GPT: All languages (gpt-4, gpt-4-turbo, gpt-3.5-turbo)
+ 🤗 NLLB: Northern Sotho, isiZulu only (specialized models)
+
+ """, unsafe_allow_html=True)
+
+ with tab2:
+ # Load data from base directory automatically
+ @st.cache_data
+ def load_analysis_data():
+ """Load all analysis data from base directory"""
+ df_translations = None
+ df_bleu = None
+ df_chrf = None
+ df_comet = None
+
+ try:
+ # Try to load translations data
+ if os.path.exists(f"{DATA_DIR}/translations.tsv"):
+ df_translations = pd.read_csv(f"{DATA_DIR}/translations.tsv", sep="\t")
+
+ # Convert new CSV format to expected format for analysis
+ # New format: id,english,afr_human,afr_revised,nso_human,nso_revised,zul_human,zul_revised,afr_gemini,afr_gpt,nso_gemini,nso_gpt,nso_nllb,zul_gemini,zul_gpt,zul_nllb
+ # Expected format: english, afr_human, afr_revised, nso_human, nso_revised, isizulu_human, isizulu_revised, etc.
+
+ # Rename zul columns to isizulu for backward compatibility with analysis code
+ column_mapping = {
+ 'zul_human': 'isizulu_human',
+ 'zul_revised': 'isizulu_revised',
+ 'zul_gemini': 'isizulu_mt_gemini',
+ 'zul_gpt': 'isizulu_mt_gpt',
+ 'zul_nllb': 'isizulu_mt_nllb',
+ 'afr_gemini': 'afr_mt_gemini',
+ 'afr_gpt': 'afr_mt_gpt',
+ 'nso_gemini': 'nso_mt_gemini',
+ 'nso_gpt': 'nso_mt_gpt',
+ 'nso_nllb': 'nso_mt_nllb'
+ }
+
+ df_translations = df_translations.rename(columns=column_mapping)
+
+ elif os.path.exists(f"{DATA_DIR}/translation_data.csv"):
+ df_translations = pd.read_csv(f"{DATA_DIR}/translation_data.csv")
+ else:
+ print("No translation data found, using sample data")
+ df_translations = load_translation_data() # Fallback to sample data
+
+ # Try to load BLEU scores
+ if os.path.exists(f"{DATA_DIR}/bleu_scores.csv"):
+ df_bleu = pd.read_csv(f"{DATA_DIR}/bleu_scores.csv")
+
+                # Convert zul references to isizulu for compatibility; the
+                # 'language' column already uses the display name 'isiZulu',
+                # so only the comparison-pair ids need rewriting
+                df_bleu['comparison_pair'] = df_bleu['comparison_pair'].str.replace('zul_', 'isizulu_')
+
+ else:
+ # Sample BLEU data (using isizulu for compatibility with existing analysis code)
+ df_bleu = pd.DataFrame({
+ 'comparison_pair': ['afr_human_vs_afr_gemini', 'afr_human_vs_afr_gpt', 'afr_human_vs_afr_revised', 'nso_human_vs_nso_gemini', 'nso_human_vs_nso_gpt', 'nso_human_vs_nso_revised', 'nso_human_vs_nso_nllb', 'isizulu_human_vs_isizulu_gemini', 'isizulu_human_vs_isizulu_gpt', 'isizulu_human_vs_isizulu_revised', 'isizulu_human_vs_isizulu_nllb'],
+ 'bleu_score': [0.78, 0.72, 0.89, 0.65, 0.68, 0.85, 0.71, 0.71, 0.69, 0.87, 0.73],
+ 'language': ['Afrikaans', 'Afrikaans', 'Afrikaans', 'Northern Sotho', 'Northern Sotho', 'Northern Sotho', 'Northern Sotho', 'isiZulu', 'isiZulu', 'isiZulu', 'isiZulu']
+ })
+
+ # Try to load COMET scores
+ if os.path.exists(f"{DATA_DIR}/comet_scores.csv"):
+ df_comet = pd.read_csv(f"{DATA_DIR}/comet_scores.csv")
+ else:
+ # Sample COMET data
+ df_comet = pd.DataFrame({
+ 'comparison_pair': ['afr_human_vs_afr_gemini', 'afr_human_vs_afr_gpt', 'afr_human_vs_afr_revised', 'nso_human_vs_nso_gemini', 'nso_human_vs_nso_gpt', 'nso_human_vs_nso_revised', 'isizulu_human_vs_isizulu_gemini', 'isizulu_human_vs_isizulu_gpt', 'isizulu_human_vs_isizulu_revised'],
+ 'comet_score': [0.82, 0.79, 0.92, 0.71, 0.74, 0.88, 0.76, 0.73, 0.90],
+ 'language': ['Afrikaans', 'Afrikaans', 'Afrikaans', 'Northern Sotho', 'Northern Sotho', 'Northern Sotho', 'isiZulu', 'isiZulu', 'isiZulu']
+ })
+
+ # Try to load CHRF scores
+ if os.path.exists(f"{DATA_DIR}/chrf_scores.csv"):
+ df_chrf = pd.read_csv(f"{DATA_DIR}/chrf_scores.csv")
+ else:
+ # Sample CHRF data
+ df_chrf = pd.DataFrame({
+ 'comparison_pair': ['afr_human_vs_afr_gemini', 'afr_human_vs_afr_gpt', 'afr_human_vs_afr_revised', 'nso_human_vs_nso_gemini', 'nso_human_vs_nso_gpt', 'nso_human_vs_nso_revised', 'isizulu_human_vs_isizulu_gemini', 'isizulu_human_vs_isizulu_gpt', 'isizulu_human_vs_isizulu_revised'],
+ 'chrf_score': [0.75, 0.70, 0.88, 0.60, 0.65, 0.80, 0.68, 0.66, 0.85],
+ 'language': ['Afrikaans', 'Afrikaans', 'Afrikaans', 'Northern Sotho', 'Northern Sotho', 'Northern Sotho', 'isiZulu', 'isiZulu', 'isiZulu']
+ })
+
+ return df_translations, df_bleu, df_comet, df_chrf
+
+ except Exception as e:
+ st.error(f"Error loading data: {str(e)}")
+ return None, None, None, None
+
+ # Load all data
+ df_translations, df_bleu, df_comet, df_chrf = load_analysis_data()
+
+ if df_translations is not None:
+ # Language selection in columns
+ lang_col1, lang_col2 = st.columns([2, 10])
+ with lang_col1:
+            st.markdown('<p style="font-weight: 600; padding-top: 0.5rem;">Select Language:</p>', unsafe_allow_html=True)
+ with lang_col2:
+ languages = ['Afrikaans', 'Northern Sotho', 'isiZulu']
+ selected_lang = st.selectbox(
+ "Select Language for Analysis:",
+ languages,
+ key="global_lang_select",
+ label_visibility="collapsed"
+ )
+
+ # Get language code
+ lang_codes = {'Afrikaans': 'afr', 'Northern Sotho': 'nso', 'isiZulu': 'isizulu'}
+ code = lang_codes[selected_lang]
+
+ # Create analysis tabs
+ analysis_tab1, analysis_tab2, analysis_tab3, analysis_tab4 = st.tabs(["Sample Translations", "📊 Quality Metrics", "🔄 Revision Analysis", "🔍 Word Comparison"])
+
+ with analysis_tab1:
+ # Translation Samples Tab
+ st.markdown("""
+
+
+ 📝 Translation Samples for {selected_lang}
+
+
+ """.format(selected_lang=selected_lang), unsafe_allow_html=True)
+
+ # Use the global language selection
+ samples_code = code
+
+ # Show sample translations for the selected language
+ display_cols = ['english'] + [col for col in df_translations.columns if col.startswith(samples_code)]
+
+ if display_cols and len(display_cols) > 1: # Need at least english + 1 translation column
+ # Control panel
+ control_col1, control_col2, control_col3, control_col4 = st.columns([1, 7, 1, 2])
+
+ with control_col1:
+                    st.markdown('<p style="font-weight: 600; padding-top: 0.5rem;">Samples per page:</p>', unsafe_allow_html=True)
+ with control_col2:
+ page_size = st.selectbox(
+ "Samples per page:",
+ [10, 25, 50, 100],
+ index=0,
+ key="page_size_select",
+ label_visibility="collapsed"
+ )
+
+ # Initialize session state for pagination
+ if 'current_page' not in st.session_state:
+ st.session_state.current_page = 1
+
+ # Filter data and calculate pagination
+ available_data = df_translations[display_cols].dropna(subset=[col for col in display_cols if col != 'english'], how='all')
+ total_samples = len(available_data)
+ total_pages = max(1, (total_samples + page_size - 1) // page_size) # Ceiling division
+
+ # Ensure current page is valid
+ if st.session_state.current_page > total_pages:
+ st.session_state.current_page = 1
+
+ # Calculate start and end indices
+ start_idx = (st.session_state.current_page - 1) * page_size
+ end_idx = min(start_idx + page_size, total_samples)
+
+ # Get current page data
+ current_page_data = available_data.iloc[start_idx:end_idx]
+
+ with control_col3:
+                    st.markdown('<p style="font-weight: 600; padding-top: 0.5rem;">Page:</p>', unsafe_allow_html=True)
+ with control_col4:
+ # Page navigation
+ nav_col1, nav_col2, nav_col3, nav_col4, nav_col5 = st.columns([1, 1, 2, 1, 1])
+
+ with nav_col1:
+ if st.button("⏮️", key="first_page", help="First page", disabled=(st.session_state.current_page == 1)):
+ st.session_state.current_page = 1
+ st.rerun()
+
+ with nav_col2:
+ if st.button("◀️", key="prev_page", help="Previous page", disabled=(st.session_state.current_page == 1)):
+ st.session_state.current_page -= 1
+ st.rerun()
+
+ with nav_col3:
+                        st.markdown(f'<div style="text-align: center; padding-top: 0.4rem;">{st.session_state.current_page} / {total_pages}</div>', unsafe_allow_html=True)
+
+ with nav_col4:
+ if st.button("▶️", key="next_page", help="Next page", disabled=(st.session_state.current_page == total_pages)):
+ st.session_state.current_page += 1
+ st.rerun()
+
+ with nav_col5:
+ if st.button("⏭️", key="last_page", help="Last page", disabled=(st.session_state.current_page == total_pages)):
+ st.session_state.current_page = total_pages
+ st.rerun()
+
+                # Statistics cards (the card markup below is a minimal
+                # reconstruction; the original HTML was stripped from this diff)
+                stats_col1, stats_col2, stats_col3, stats_col4 = st.columns(4)
+
+                with stats_col1:
+                    st.markdown(f"""
+                    <div class="stat-card"><div class="stat-label">Showing</div>
+                    <div class="stat-value">{len(current_page_data)}</div></div>
+                    """, unsafe_allow_html=True)
+
+                with stats_col2:
+                    available_systems = len([col for col in display_cols if col != 'english'])
+                    st.markdown(f"""
+                    <div class="stat-card"><div class="stat-label">Translation Systems</div>
+                    <div class="stat-value">{available_systems}</div></div>
+                    """, unsafe_allow_html=True)
+
+                with stats_col3:
+                    st.markdown(f"""
+                    <div class="stat-card"><div class="stat-label">Total Available</div>
+                    <div class="stat-value">{total_samples}</div></div>
+                    """, unsafe_allow_html=True)
+
+                with stats_col4:
+                    st.markdown(f"""
+                    <div class="stat-card"><div class="stat-label">Current Page</div>
+                    <div class="stat-value">{st.session_state.current_page}/{total_pages}</div></div>
+                    """, unsafe_allow_html=True)
+
+ # Display the samples table
+ st.markdown("### Translation Examples")
+
+ if len(current_page_data) > 0:
+ # Create a styled dataframe with better column names
+ display_df = current_page_data.copy()
+
+ # Rename columns for better display
+ column_rename = {
+ 'english': 'English (Source)',
+ }
+
+ # Add human-readable names for translation columns
+ for col in display_df.columns:
+ if col.startswith(samples_code):
+ if '_human' in col:
+ column_rename[col] = f'{selected_lang} (Human)'
+ elif '_revised' in col:
+ column_rename[col] = f'{selected_lang} (Revised)'
+ elif '_mt_gemini' in col or '_gemini' in col:
+ column_rename[col] = f'{selected_lang} (Gemini)'
+ elif '_mt_gpt' in col or '_gpt' in col:
+ column_rename[col] = f'{selected_lang} (GPT)'
+ elif '_mt_nllb' in col or '_nllb' in col:
+ column_rename[col] = f'{selected_lang} (NLLB)'
+ else:
+ # Generic fallback
+ clean_name = col.replace(f'{samples_code}_', '').replace('_', ' ').title()
+ column_rename[col] = f'{selected_lang} ({clean_name})'
+
+ display_df = display_df.rename(columns=column_rename)
+
+ # Add row numbers based on actual position in full dataset
+ display_df.index = range(start_idx + 1, end_idx + 1)
+ display_df.index.name = 'Sample #'
+
+ st.dataframe(
+ display_df,
+ use_container_width=True,
+ height=min(600, 50 + len(display_df) * 35), # Dynamic height based on content
+ column_config={
+ col: st.column_config.TextColumn(col, width="medium")
+ for col in display_df.columns
+ }
+ )
+
+ # Page info summary
+ st.markdown(f"""
+
+ 📄 Showing samples {start_idx + 1} to {end_idx} of {total_samples} total samples • Page {st.session_state.current_page} of {total_pages}
+
+ """, unsafe_allow_html=True)
+
+ # Quick jump to page
+ if total_pages > 5: # Only show quick jump for datasets with many pages
+ st.markdown("### Quick Navigation")
+ jump_col1, jump_col2, jump_col3 = st.columns([1, 2, 1])
+
+ with jump_col2:
+ target_page = st.number_input(
+ f"Jump to page (1-{total_pages}):",
+ min_value=1,
+ max_value=total_pages,
+ value=st.session_state.current_page,
+ key="page_jump"
+ )
+
+ if st.button("🔗 Go to Page", use_container_width=True):
+ if target_page != st.session_state.current_page:
+ st.session_state.current_page = target_page
+ st.rerun()
+
+ else:
+ st.warning("⚠️ No translation samples found for the current page.")
+
+ else:
+ st.warning(f"⚠️ No translation data available for {selected_lang}. Expected columns starting with '{samples_code}_'")
+
+ # Debug information
+ available_columns = [col for col in df_translations.columns if col.startswith(samples_code)]
+ if available_columns:
+ st.info(f"🔍 Found columns: {', '.join(available_columns)}")
+ else:
+ all_lang_columns = [col for col in df_translations.columns if any(col.startswith(prefix) for prefix in ['afr_', 'nso_', 'isizulu_'])]
+ if all_lang_columns:
+ st.info(f"💡 Available language columns: {', '.join(all_lang_columns[:10])}{'...' if len(all_lang_columns) > 10 else ''}")
+
+ with analysis_tab2:
+ st.markdown("""
+
+
+ 📈 Quality Metrics for {selected_lang}
+
+
+ """.format(selected_lang=selected_lang), unsafe_allow_html=True)
+
+ # Get language code
+ lang_codes = {'Afrikaans': 'afr', 'Northern Sotho': 'nso', 'isiZulu': 'isizulu'}
+ code = lang_codes[selected_lang]
+
+ # Score visualizations
+ if df_bleu is not None and df_chrf is not None and df_comet is not None:
+ # Filter scores for selected language
+ lang_bleu = df_bleu[df_bleu['language'] == selected_lang] if 'language' in df_bleu.columns else df_bleu
+ lang_chrf = df_chrf[df_chrf['language'] == selected_lang] if 'language' in df_chrf.columns else df_chrf
+ lang_comet = df_comet[df_comet['language'] == selected_lang] if 'language' in df_comet.columns else df_comet
+
+ # Check if we have domain-level data
+ has_domain_data = ('domain' in lang_bleu.columns and 'domain' in lang_chrf.columns and
+ 'domain' in lang_comet.columns and
+ len(lang_bleu[lang_bleu['domain'] != 'Overall']) > 0)
+
+ if has_domain_data:
+ # Add domain filter
+ available_domains = sorted(lang_bleu['domain'].unique())
+ domain_options = ['Overall'] + [d for d in available_domains if d != 'Overall']
+
+ selected_domain = st.selectbox(
+ "📍 Select Domain for Analysis:",
+ domain_options,
+ key=f"domain_selector_{selected_lang}"
+ )
+
+ # Filter data based on selected domain
+ if selected_domain == 'Overall':
+ display_bleu = lang_bleu[lang_bleu['domain'] == 'Overall']
+ display_chrf = lang_chrf[lang_chrf['domain'] == 'Overall']
+ display_comet = lang_comet[lang_comet['domain'] == 'Overall']
+ chart_title_suffix = " - Overall"
+ else:
+ display_bleu = lang_bleu[lang_bleu['domain'] == selected_domain]
+ display_chrf = lang_chrf[lang_chrf['domain'] == selected_domain]
+ display_comet = lang_comet[lang_comet['domain'] == selected_domain]
+ chart_title_suffix = f" - {selected_domain}"
+ else:
+ # Use all data if no domain column
+ display_bleu = lang_bleu
+ display_chrf = lang_chrf
+ display_comet = lang_comet
+ chart_title_suffix = ""
+
+ # Create score charts
+ if len(display_bleu) > 0 and len(display_chrf) > 0 and len(display_comet) > 0:
+ chart_col1, chart_col2, chart_col3 = st.columns(3)
+
+ with chart_col1:
+ # chrF Score Chart
+ fig_chrf = px.bar(
+ display_chrf,
+ x='comparison_pair',
+ y='chrf_score',
+ title=f'chrF Scores - {selected_lang}{chart_title_suffix}',
+ color='chrf_score',
+ color_continuous_scale='oranges'
+ )
+ fig_chrf.update_layout(
+ xaxis_title="Translation Pairs",
+ yaxis_title="chrF Score",
+ xaxis_tickangle=-45,
+ height=400,
+ font=dict(family="Inter", size=12)
+ )
+ st.plotly_chart(fig_chrf, use_container_width=True)
+
+ with chart_col2:
+ # BLEU Score Chart
+ fig_bleu = px.bar(
+ display_bleu,
+ x='comparison_pair',
+ y='bleu_score',
+ title=f'BLEU Scores - {selected_lang}{chart_title_suffix}',
+ color='bleu_score',
+ color_continuous_scale='blues'
+ )
+ fig_bleu.update_layout(
+ xaxis_title="Translation Pairs",
+ yaxis_title="BLEU Score",
+ xaxis_tickangle=-45,
+ height=400,
+ font=dict(family="Inter", size=12)
+ )
+ st.plotly_chart(fig_bleu, use_container_width=True)
+
+ with chart_col3:
+ # COMET Score Chart
+ fig_comet = px.bar(
+ display_comet,
+ x='comparison_pair',
+ y='comet_score',
+ title=f'COMET Scores - {selected_lang}{chart_title_suffix}',
+ color='comet_score',
+ color_continuous_scale='greens'
+ )
+ fig_comet.update_layout(
+ xaxis_title="Translation Pairs",
+ yaxis_title="COMET Score",
+ xaxis_tickangle=-45,
+ height=400,
+ font=dict(family="Inter", size=12)
+ )
+ st.plotly_chart(fig_comet, use_container_width=True)
+
+ # PRIMARY SPIDER CHART - Domain Performance when available, Model Performance otherwise
+ if has_domain_data:
+ st.markdown(f"""
+
+ 🕸️ Domain Performance Spider Charts - {selected_lang}
+
+ """, unsafe_allow_html=True)
+
+ # Filter out "Overall" so only domain-level values are shown
+ domain_bleu = lang_bleu[lang_bleu['domain'] != 'Overall']
+ domain_chrf = lang_chrf[lang_chrf['domain'] != 'Overall']
+ domain_comet = lang_comet[lang_comet['domain'] != 'Overall']
+
+ # Pivot all metrics
+ pivot_bleu = domain_bleu.pivot(
+ index='comparison_pair',
+ columns='domain',
+ values='bleu_score'
+ ).fillna(0)
+
+ pivot_chrf = domain_chrf.pivot(
+ index='comparison_pair',
+ columns='domain',
+ values='chrf_score'
+ ).fillna(0)
+
+ pivot_comet = domain_comet.pivot(
+ index='comparison_pair',
+ columns='domain',
+ values='comet_score'
+ ).fillna(0)
+
+ # Ensure domains are in the same order for all metrics
+ domains = sorted(set(pivot_bleu.columns) | set(pivot_chrf.columns) | set(pivot_comet.columns))
+ pivot_bleu = pivot_bleu.reindex(columns=domains, fill_value=0)
+ pivot_chrf = pivot_chrf.reindex(columns=domains, fill_value=0)
+ pivot_comet = pivot_comet.reindex(columns=domains, fill_value=0)
+
+ # Define distinct colors with reduced opacity
+ distinct_colors = [
+ 'rgba(255, 99, 132, 0.4)', # Red
+ 'rgba(54, 162, 235, 0.4)', # Blue
+ 'rgba(99, 255, 132, 0.4)', # Green
+ 'rgba(75, 192, 192, 0.4)', # Teal
+ 'rgba(255, 205, 86, 0.4)', # Yellow
+ 'rgba(153, 102, 255, 0.4)', # Purple
+ 'rgba(255, 159, 64, 0.4)', # Orange
+ 'rgba(199, 199, 199, 0.4)', # Grey
+ 'rgba(83, 102, 255, 0.4)', # Indigo
+ 'rgba(255, 99, 255, 0.4)', # Magenta
+ ]
+
+ # Border colors (same colors but full opacity for borders)
+ border_colors = [
+ 'rgba(255, 99, 132, 1.0)', # Red
+ 'rgba(54, 162, 235, 1.0)', # Blue
+ 'rgba(99, 255, 132, 1.0)', # Green
+ 'rgba(75, 192, 192, 1.0)', # Teal
+ 'rgba(255, 205, 86, 1.0)', # Yellow
+ 'rgba(153, 102, 255, 1.0)', # Purple
+ 'rgba(255, 159, 64, 1.0)', # Orange
+ 'rgba(199, 199, 199, 1.0)', # Grey
+ 'rgba(83, 102, 255, 1.0)', # Indigo
+ 'rgba(255, 99, 255, 1.0)', # Magenta
+ ]
+
+ # Layout for three side-by-side spider charts
+ spider_col1, spider_col2, spider_col3 = st.columns(3)
+
+ # ---------------- CHRF SPIDER ----------------
+ with spider_col1:
+ fig_chrf_spider = go.Figure()
+ for i, (model_name, row) in enumerate(pivot_chrf.iterrows()):
+ color_idx = i % len(distinct_colors)
+ fig_chrf_spider.add_trace(go.Scatterpolar(
+ r=row.tolist() + [row.tolist()[0]], # close loop
+ theta=domains + [domains[0]],
+ fill='toself',
+ name=model_name.split('_')[-1].upper(),
+ fillcolor=distinct_colors[color_idx],
+ line=dict(color=border_colors[color_idx], width=2),
+ opacity=0.7,
+ showlegend=False # Hide legend on first chart
+ ))
+ fig_chrf_spider.update_layout(
+ polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
+ showlegend=False,
+ title=dict(text=f"Domain Performance (chrF) - {selected_lang}"),
+ height=450
+ )
+ st.plotly_chart(fig_chrf_spider, use_container_width=True)
+
+ # ---------------- BLEU SPIDER ----------------
+ with spider_col2:
+ fig_bleu_spider = go.Figure()
+ for i, (model_name, row) in enumerate(pivot_bleu.iterrows()):
+ color_idx = i % len(distinct_colors)
+ fig_bleu_spider.add_trace(go.Scatterpolar(
+ r=row.tolist() + [row.tolist()[0]], # close loop
+ theta=domains + [domains[0]],
+ fill='toself',
+ name=model_name.split('_')[-1].upper(),
+ fillcolor=distinct_colors[color_idx],
+ line=dict(color=border_colors[color_idx], width=2),
+ opacity=0.7,
+ showlegend=True # Show legend on middle chart
+ ))
+ fig_bleu_spider.update_layout(
+ polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
+ showlegend=True,
+ title=dict(text=f"Domain Performance (BLEU) - {selected_lang}"),
+ height=450,
+ legend=dict(
+ orientation="h",
+ yanchor="bottom",
+ y=-0.3,
+ xanchor="center",
+ x=0.5
+ )
+ )
+ st.plotly_chart(fig_bleu_spider, use_container_width=True)
+
+ # ---------------- COMET SPIDER ----------------
+ with spider_col3:
+ fig_comet_spider = go.Figure()
+ for i, (model_name, row) in enumerate(pivot_comet.iterrows()):
+ color_idx = i % len(distinct_colors)
+ fig_comet_spider.add_trace(go.Scatterpolar(
+ r=row.tolist() + [row.tolist()[0]], # close loop
+ theta=domains + [domains[0]],
+ fill='toself',
+ name=model_name.split('_')[-1].upper(),
+ fillcolor=distinct_colors[color_idx],
+ line=dict(color=border_colors[color_idx], width=2),
+ opacity=0.7,
+ showlegend=False # Hide legend on last chart
+ ))
+ fig_comet_spider.update_layout(
+ polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
+ showlegend=False,
+ title=dict(text=f"Domain Performance (COMET) - {selected_lang}"),
+ height=450
+ )
+ st.plotly_chart(fig_comet_spider, use_container_width=True)
+
+ # # Overall Performance Summary
+ # st.markdown("""
+ #
+ # 📋 Overall Performance Summary
+ #
+ # """, unsafe_allow_html=True)
+
+ # # Create overall summary table
+ # if len(display_bleu) > 0 and len(display_chrf) > 0 and len(display_comet) > 0:
+ # # Merge all three metrics
+ # merged_scores = pd.merge(display_bleu, display_chrf, on='comparison_pair', suffixes=('_bleu', '_chrf'))
+ # merged_scores = pd.merge(merged_scores, display_comet, on='comparison_pair')
+ # merged_scores['model'] = merged_scores['comparison_pair'].apply(lambda x: x.split('_')[-1].upper())
+
+ # summary_data = []
+ # for _, row in merged_scores.iterrows():
+ # summary_data.append({
+ # 'Model': row['model'],
+ # 'BLEU Score': f"{row['bleu_score']:.3f}",
+ # 'chrF Score': f"{row['chrf_score']:.3f}",
+ # 'COMET Score': f"{row['comet_score']:.3f}",
+ # 'Average': f"{(row['bleu_score'] + row['chrf_score'] + row['comet_score']) / 3:.3f}"
+ # })
+
+ # summary_df = pd.DataFrame(summary_data)
+
+ # # Only sort if dataframe has data and 'Average' column exists
+ # if len(summary_df) > 0 and 'Average' in summary_df.columns:
+ # summary_df = summary_df.sort_values('Average', ascending=False)
+
+ # # Style the dataframe
+ # st.dataframe(
+ # summary_df,
+ # use_container_width=True,
+ # hide_index=True,
+ # column_config={
+ # "Model": st.column_config.TextColumn("Model", width="medium"),
+ # "BLEU Score": st.column_config.NumberColumn("BLEU Score", format="%.3f"),
+ # "chrF Score": st.column_config.NumberColumn("chrF Score", format="%.3f"),
+ # "COMET Score": st.column_config.NumberColumn("COMET Score", format="%.3f"),
+ # "Average": st.column_config.NumberColumn("Average", format="%.3f")
+ # }
+ # )
+
+ with analysis_tab3:
+ # Revision Analysis Tab
+ st.markdown("""
+
+
+ ✏️ Human Translation Revision Analysis for {selected_lang}
+
+
+ """.format(selected_lang=selected_lang), unsafe_allow_html=True)
+
+ # Use the global language selection
+ rev_code = code
+
+ # Check for revision columns
+ human_col = f"{rev_code}_human"
+ revised_col = f"{rev_code}_revised"
+
+ if human_col in df_translations.columns and revised_col in df_translations.columns:
+ # Get all rows with human translations for this language
+ df_lang_data = df_translations[[human_col, revised_col]].copy()
+
+ # Remove rows where human translation is missing (can't analyze revisions without original)
+ df_lang_data = df_lang_data[df_lang_data[human_col].notna()].copy()
+
+ total_human_translations = len(df_lang_data)
+
+ if total_human_translations == 0:
+ st.warning(f"⚠️ No human translations found for {selected_lang}")
+ else:
+ # Calculate revision statistics
+ # For missing revised translations, we assume no revision was made (same as original)
+ df_lang_data[revised_col] = df_lang_data[revised_col].fillna(df_lang_data[human_col])
+
+ # Count actual changes
+ revisions_made = sum(df_lang_data[human_col] != df_lang_data[revised_col])
+ revision_rate = (revisions_made / total_human_translations) * 100
+
+ # Count how many had revision data available
+ revisions_available = sum(df_translations[revised_col].notna())
+
+ # Calculate revision types
+ def categorize_revision(original, revised):
+ if pd.isna(original) or pd.isna(revised):
+ return "Missing Data"
+ if str(original).strip() == str(revised).strip():
+ return "No Change"
+
+ orig_words = str(original).lower().split()
+ rev_words = str(revised).lower().split()
+
+ if len(rev_words) > len(orig_words):
+ return "Expansion"
+ elif len(rev_words) < len(orig_words):
+ return "Reduction"
+ else:
+ return "Modification"
+
+ df_lang_data['revision_type'] = df_lang_data.apply(
+ lambda row: categorize_revision(row[human_col], row[revised_col]), axis=1
+ )
+
+ # Revision statistics cards
+ rev_col1, rev_col2, rev_col3, rev_col4 = st.columns(4)
+
+                    with rev_col1:
+                        st.markdown(f"""
+                        <div class="stat-card"><div class="stat-label">Human Translations</div>
+                        <div class="stat-value">{total_human_translations}</div></div>
+                        """, unsafe_allow_html=True)
+
+                    with rev_col2:
+                        st.markdown(f"""
+                        <div class="stat-card"><div class="stat-label">Revisions Available</div>
+                        <div class="stat-value">{revisions_available}</div></div>
+                        """, unsafe_allow_html=True)
+
+                    with rev_col3:
+                        st.markdown(f"""
+                        <div class="stat-card"><div class="stat-label">Changes Made</div>
+                        <div class="stat-value">{revisions_made}</div></div>
+                        """, unsafe_allow_html=True)
+
+                    with rev_col4:
+                        st.markdown(f"""
+                        <div class="stat-card"><div class="stat-label">Revision Rate</div>
+                        <div class="stat-value">{revision_rate:.1f}%</div></div>
+                        """, unsafe_allow_html=True)
+
+ # Revision type analysis
+ st.markdown("""
+
+ 📈 Revision Pattern Analysis
+
+ """, unsafe_allow_html=True)
+
+ revision_counts = df_lang_data['revision_type'].value_counts()
+
+ if len(revision_counts) > 0:
+ # Create revision type charts
+ rev_chart_col1, rev_chart_col2 = st.columns(2)
+
+ with rev_chart_col1:
+ # Pie chart of revision types
+ fig_pie = px.pie(
+ values=revision_counts.values,
+ names=revision_counts.index,
+ title=f"Revision Types Distribution",
+ color_discrete_sequence=px.colors.qualitative.Set3
+ )
+ fig_pie.update_layout(height=400, font=dict(family="Inter", size=12))
+ st.plotly_chart(fig_pie, use_container_width=True)
+
+ with rev_chart_col2:
+ # Bar chart of revision types
+ fig_bar = px.bar(
+ x=revision_counts.values,
+ y=revision_counts.index,
+ orientation='h',
+ title=f"Revision Frequency",
+ color=revision_counts.values,
+ color_continuous_scale='viridis'
+ )
+ fig_bar.update_layout(
+ height=400,
+ xaxis_title="Count",
+ yaxis_title="Revision Type",
+ font=dict(family="Inter", size=12)
+ )
+ st.plotly_chart(fig_bar, use_container_width=True)
+
+ # Word-level revision analysis
+ st.markdown("""
+
+ 🔤 Word-Level Changes Analysis
+
+ """, unsafe_allow_html=True)
+
+ # Calculate word changes only for actual revisions
+ words_added = []
+ words_removed = []
+
+ changed_revisions = df_lang_data[df_lang_data['revision_type'] != 'No Change']
+
+ for _, row in changed_revisions.iterrows():
+ if pd.notna(row[human_col]) and pd.notna(row[revised_col]):
+ orig_words = set(str(row[human_col]).lower().split())
+ rev_words = set(str(row[revised_col]).lower().split())
+
+ added = rev_words - orig_words
+ removed = orig_words - rev_words
+
+ words_added.extend(list(added))
+ words_removed.extend(list(removed))
+
+ from collections import Counter
+ added_counts = Counter(words_added)
+ removed_counts = Counter(words_removed)
+
+ word_analysis_col1, word_analysis_col2 = st.columns(2)
+
+ with word_analysis_col1:
+ st.markdown("**🟢 Most Added Words**")
+ if added_counts:
+ top_added = dict(added_counts.most_common(15))
+
+ # Create horizontal bar chart for added words
+ fig_added = px.bar(
+ x=list(top_added.values()),
+ y=list(top_added.keys()),
+ orientation='h',
+ title="Most Frequently Added Words",
+ color=list(top_added.values()),
+ color_continuous_scale='Greens'
+ )
+ fig_added.update_layout(
+ height=400,
+ xaxis_title="Frequency",
+ yaxis_title="Words",
+ font=dict(family="Inter", size=10)
+ )
+ st.plotly_chart(fig_added, use_container_width=True)
+ else:
+ st.markdown("*No words added in revisions*")
+
+ with word_analysis_col2:
+ st.markdown("**🔴 Most Removed Words**")
+ if removed_counts:
+ top_removed = dict(removed_counts.most_common(15))
+
+ # Create horizontal bar chart for removed words
+ fig_removed = px.bar(
+ x=list(top_removed.values()),
+ y=list(top_removed.keys()),
+ orientation='h',
+ title="Most Frequently Removed Words",
+ color=list(top_removed.values()),
+ color_continuous_scale='Reds'
+ )
+ fig_removed.update_layout(
+ height=400,
+ xaxis_title="Frequency",
+ yaxis_title="Words",
+ font=dict(family="Inter", size=10)
+ )
+ st.plotly_chart(fig_removed, use_container_width=True)
+ else:
+ st.markdown("*No words removed in revisions*")
+
+ # Revision examples
+ st.markdown("""
+
+ 📝 Revision Examples
+
+ """, unsafe_allow_html=True)
+
+ # Show examples of different types of revisions
+ revision_examples = changed_revisions.head(10)
+ if len(revision_examples) > 0:
+
+ # Create tabs for different revision types
+ available_types = revision_examples['revision_type'].unique()
+ if len(available_types) > 1:
+ type_tabs = st.tabs([f"{rtype} ({len(revision_examples[revision_examples['revision_type'] == rtype])})"
+ for rtype in available_types])
+
+ for i, rtype in enumerate(available_types):
+ with type_tabs[i]:
+ type_examples = revision_examples[revision_examples['revision_type'] == rtype].head(5)
+ for idx, row in type_examples.iterrows():
+ st.markdown(f"""
+
+
Original:
+
{row[human_col]}
+
Revised:
+
{row[revised_col]}
+
Type: {row['revision_type']}
+
+ """, unsafe_allow_html=True)
+ else:
+ # Single type, show directly
+ for idx, row in revision_examples.iterrows():
+ st.markdown(f"""
+
+
Original:
+
{row[human_col]}
+
Revised:
+
{row[revised_col]}
+
Type: {row['revision_type']}
+
+ """, unsafe_allow_html=True)
+ else:
+ st.info(f"No revisions found for {selected_lang}.")
+ else:
+ st.info(f"No revision data available for analysis.")
+
+ else:
+ st.warning(f"⚠️ Revision columns not found for {selected_lang}. Expected columns: `{human_col}` and `{revised_col}`")
+
+ with analysis_tab4:
+ # Translation comparison section
+ st.markdown("""
+
+
+ 🔍 Translation Comparison & Word Analysis for {selected_lang}
+
+
+ """.format(selected_lang=selected_lang), unsafe_allow_html=True)
+
+ # Use the global language selection
+ comp_code = code
+
+ # Get available translation columns for selected language
+ available_cols = []
+ for col in df_translations.columns:
+ if col.startswith(comp_code) and col != 'english':
+ available_cols.append(col)
+
+ if len(available_cols) >= 2:
+ comp_col1, comp_col2, comp_col3 = st.columns([1, 1, 1])
+
+ with comp_col1:
+ col1_selection = st.selectbox(
+ "First Translation:",
+ available_cols,
+ key="col1_select"
+ )
+
+ with comp_col2:
+ col2_selection = st.selectbox(
+ "Second Translation:",
+ [col for col in available_cols if col != col1_selection],
+ key="col2_select"
+ )
+
+ with comp_col3:
+ # Add spacing to align button with selectboxes
+                    st.markdown('<div style="height: 1.8rem;"></div>', unsafe_allow_html=True)
+ analyze_clicked = st.button(
+ "🔍 Analyze",
+ type="primary",
+ use_container_width=True,
+ key="analyze_word_diff_btn"
+ )
+
+ if analyze_clicked:
+ # Perform word analysis with ALL available data
+ def get_word_differences(text1, text2):
+ # Handle missing data by using available text
+ if pd.isna(text1) and pd.isna(text2):
+ return set(), set(), set()
+
+ # If one is missing, treat it as empty for comparison
+ words1 = set(str(text1).lower().split()) if pd.notna(text1) else set()
+ words2 = set(str(text2).lower().split()) if pd.notna(text2) else set()
+
+ only_in_1 = words1 - words2
+ only_in_2 = words2 - words1
+ common = words1 & words2
+
+ return only_in_1, only_in_2, common
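+                        # e.g. get_word_differences("sala kahle", "sala kahle kakhulu")
+                        # -> (set(), {"kakhulu"}, {"sala", "kahle"})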
+
+ # Analyze ALL rows with available data
+ unique_words_1 = []
+ unique_words_2 = []
+ common_words = []
+ all_words_1 = [] # For frequency counting
+ all_words_2 = [] # For frequency counting
+
+ # Process all rows, including those with missing revisions
+ for _, row in df_translations.iterrows():
+ # Get text from columns, using original if revision is missing
+ text1 = row[col1_selection] if pd.notna(row[col1_selection]) else None
+ text2 = row[col2_selection] if pd.notna(row[col2_selection]) else None
+
+ # Skip if both are missing
+ if text1 is None and text2 is None:
+ continue
+
+ # Collect ALL words from each column for frequency analysis
+ if text1 is not None:
+ words_from_1 = str(text1).lower().split()
+ all_words_1.extend(words_from_1)
+
+ if text2 is not None:
+ words_from_2 = str(text2).lower().split()
+ all_words_2.extend(words_from_2)
+
+ # Only do comparison if both texts exist
+ if text1 is not None and text2 is not None:
+ only_1, only_2, common = get_word_differences(text1, text2)
+ unique_words_1.extend(list(only_1))
+ unique_words_2.extend(list(only_2))
+ common_words.extend(list(common))
+
+ from collections import Counter
+
+ # Count frequencies from ALL words
+ all_freq_1 = Counter(all_words_1) # All words from column 1
+ all_freq_2 = Counter(all_words_2) # All words from column 2
+ unique_freq_1 = Counter(unique_words_1) # Only unique words
+ unique_freq_2 = Counter(unique_words_2) # Only unique words
+ common_freq = Counter(common_words) # Only common words
+
+ # Display statistics
+                    st.markdown('<div class="results-section">', unsafe_allow_html=True)  # opening wrapper (reconstructed)
+
+ col_result1, col_result2, col_result3, col_result4 = st.columns(4)
+
+                    with col_result1:
+                        st.markdown(f"""
+                        <div class="stat-card"><div class="stat-label">Unique to {col1_selection.replace('_', ' ').title()}</div>
+                        <div class="stat-value">{len(unique_freq_1)}</div>
+                        <div class="stat-label">unique words</div></div>
+                        """, unsafe_allow_html=True)
+
+                    with col_result2:
+                        st.markdown(f"""
+                        <div class="stat-card"><div class="stat-label">Unique to {col2_selection.replace('_', ' ').title()}</div>
+                        <div class="stat-value">{len(unique_freq_2)}</div>
+                        <div class="stat-label">unique words</div></div>
+                        """, unsafe_allow_html=True)
+
+                    with col_result3:
+                        st.markdown(f"""
+                        <div class="stat-card"><div class="stat-label">Common Words</div>
+                        <div class="stat-value">{len(common_freq)}</div>
+                        <div class="stat-label">shared words</div></div>
+                        """, unsafe_allow_html=True)
+
+                    with col_result4:
+                        st.markdown(f"""
+                        <div class="stat-card"><div class="stat-label">Total Vocabulary</div>
+                        <div class="stat-value">{len(set(all_words_1 + all_words_2))}</div>
+                        <div class="stat-label">total unique words</div></div>
+                        """, unsafe_allow_html=True)
+
+                    st.markdown('</div>', unsafe_allow_html=True)  # closing wrapper (reconstructed)
+
+ # Word Clouds Section
+ st.markdown("""
+
+ ☁️ Word Clouds Visualization
+
+ """, unsafe_allow_html=True)
+
+ # Generate word clouds using matplotlib and wordcloud
+ try:
+ # Show loading spinner while generating word clouds
+ with st.spinner("🎨 Generating word clouds... This may take a moment."):
+ import matplotlib.pyplot as plt
+ from wordcloud import WordCloud
+ import io
+ import base64
+
+ # Function to create word cloud image (optimized)
+ def create_wordcloud_image(word_freq, title, color_scheme='viridis'):
+ if not word_freq or len(word_freq) == 0:
+ return None
+
+ try:
+ # Create word cloud with all frequency data, but limit max_words to 25
+ wordcloud = WordCloud(
+ width=300, # Reduced size
+ height=200, # Reduced size
+ background_color='white',
+ colormap=color_scheme,
+ max_words=25, # Display top 25 words
+ relative_scaling=0.6,
+ random_state=42,
+ min_font_size=8,
+ max_font_size=60,
+ prefer_horizontal=0.9,
+ collocations=False # Avoid word combinations
+ ).generate_from_frequencies(word_freq) # Use ALL frequency data
+
+ # Create matplotlib figure with smaller size
+ fig, ax = plt.subplots(figsize=(5, 3)) # Smaller figure
+ ax.imshow(wordcloud, interpolation='bilinear')
+ ax.axis('off')
+ ax.set_title(title, fontsize=10, fontweight='bold', pad=10)
+
+ # Convert to base64 for HTML display
+ buffer = io.BytesIO()
+ plt.savefig(buffer, format='png', bbox_inches='tight', dpi=100, facecolor='white') # Lower DPI
+ buffer.seek(0)
+ image_base64 = base64.b64encode(buffer.getvalue()).decode()
+ plt.close(fig) # Important: close figure to free memory
+
+ return image_base64
+ except Exception as e:
+ st.warning(f"Error creating word cloud for {title}: {str(e)}")
+ return None
+
+ # Create all word clouds in one row
+ cloud_col1, cloud_col2, cloud_col3 = st.columns(3)
+
+ with cloud_col1:
+ if unique_freq_1 and len(unique_freq_1) > 0:
+ # Use ALL unique words but display top 25 in cloud
+ img1 = create_wordcloud_image(
+ dict(unique_freq_1), # Use ALL unique words for frequency
+ f"Unique: {col1_selection.replace('_', ' ').title()}",
+ 'Reds'
+ )
+ if img1:
+                                        st.markdown(f'''
+                                        <div style="text-align: center;">
+                                        <img src="data:image/png;base64,{img1}" style="max-width: 100%;" alt="Word cloud"/>
+                                        <p style="font-size: 0.8rem; color: #64748b;">Showing top 25 of {len(unique_freq_1)} unique words</p>
+                                        </div>
+                                        ''', unsafe_allow_html=True)
+                                    else:
+                                        st.markdown('<div class="stat-card">⚠️ Word cloud could not be rendered</div>', unsafe_allow_html=True)
+                                else:
+                                    st.markdown("""
+                                    <div class="stat-card"><div style="font-size: 1.5rem;">📝</div>
+                                    <div class="stat-label">No unique words found</div></div>
+                                    """, unsafe_allow_html=True)
+
+ with cloud_col2:
+ if unique_freq_2 and len(unique_freq_2) > 0:
+ # Use ALL unique words but display top 25 in cloud
+ img2 = create_wordcloud_image(
+ dict(unique_freq_2), # Use ALL unique words for frequency
+ f"Unique: {col2_selection.replace('_', ' ').title()}",
+ 'Greens'
+ )
+ if img2:
+                                        st.markdown(f'''
+                                        <div style="text-align: center;">
+                                        <img src="data:image/png;base64,{img2}" style="max-width: 100%;" alt="Word cloud"/>
+                                        <p style="font-size: 0.8rem; color: #64748b;">Showing top 25 of {len(unique_freq_2)} unique words</p>
+                                        </div>
+                                        ''', unsafe_allow_html=True)
+                                    else:
+                                        st.markdown('<div class="stat-card">⚠️ Word cloud could not be rendered</div>', unsafe_allow_html=True)
+                                else:
+                                    st.markdown("""
+                                    <div class="stat-card"><div style="font-size: 1.5rem;">📝</div>
+                                    <div class="stat-label">No unique words found</div></div>
+                                    """, unsafe_allow_html=True)
+
+ with cloud_col3:
+ if common_freq and len(common_freq) > 0:
+ # Use ALL common words but display top 25 in cloud
+ img3 = create_wordcloud_image(
+ dict(common_freq), # Use ALL common words for frequency
+ "Common Words",
+ 'Blues'
+ )
+ if img3:
+                                        st.markdown(f'''
+                                        <div style="text-align: center;">
+                                        <img src="data:image/png;base64,{img3}" style="max-width: 100%;" alt="Word cloud"/>
+                                        <p style="font-size: 0.8rem; color: #64748b;">Showing top 25 of {len(common_freq)} common words</p>
+                                        </div>
+                                        ''', unsafe_allow_html=True)
+                                    else:
+                                        st.markdown('<div class="stat-card">⚠️ Word cloud could not be rendered</div>', unsafe_allow_html=True)
+                                else:
+                                    st.markdown("""
+                                    <div class="stat-card"><div style="font-size: 1.5rem;">🤝</div>
+                                    <div class="stat-label">No common words found</div></div>
+                                    """, unsafe_allow_html=True)
+
+ except ImportError:
+ st.warning("📦 WordCloud library not available. Install with: `pip install wordcloud`")
+
+ # Fallback to top words lists
+ st.markdown("**📋 Top Unique Words (Fallback)**")
+
+ fallback_col1, fallback_col2, fallback_col3 = st.columns(3)
+
+ with fallback_col1:
+ st.markdown(f"**🔴 Unique to {col1_selection.replace('_', ' ').title()}**")
+ if unique_freq_1:
+ for word, count in unique_freq_1.most_common(10):
+ st.markdown(f"• {word} ({count})")
+ else:
+ st.markdown("*No unique words*")
+
+ with fallback_col2:
+ st.markdown(f"**🟢 Unique to {col2_selection.replace('_', ' ').title()}**")
+ if unique_freq_2:
+ for word, count in unique_freq_2.most_common(10):
+ st.markdown(f"• {word} ({count})")
+ else:
+ st.markdown("*No unique words*")
+
+ with fallback_col3:
+ st.markdown("**🔵 Common Words**")
+ if common_freq:
+ for word, count in common_freq.most_common(10):
+ st.markdown(f"• {word} ({count})")
+ else:
+ st.markdown("*No common words*")
+
+ # Word frequency bar charts as additional analysis
+ st.markdown("""
+
+ 📊 Top Words Frequency Comparison
+
+ """, unsafe_allow_html=True)
+
+ freq_col1, freq_col2 = st.columns(2)
+
+ with freq_col1:
+ if unique_freq_1:
+ top_words_1 = dict(unique_freq_1.most_common(10))
+ fig_freq1 = px.bar(
+ x=list(top_words_1.values()),
+ y=list(top_words_1.keys()),
+ orientation='h',
+ title=f"Top Unique Words: {col1_selection.replace('_', ' ').title()}",
+ color=list(top_words_1.values()),
+ color_continuous_scale='Reds'
+ )
+ fig_freq1.update_layout(
+ height=400,
+ xaxis_title="Frequency",
+ yaxis_title="Words",
+ font=dict(family="Inter", size=10)
+ )
+ st.plotly_chart(fig_freq1, use_container_width=True)
+
+ with freq_col2:
+ if unique_freq_2:
+ top_words_2 = dict(unique_freq_2.most_common(10))
+ fig_freq2 = px.bar(
+ x=list(top_words_2.values()),
+ y=list(top_words_2.keys()),
+ orientation='h',
+ title=f"Top Unique Words: {col2_selection.replace('_', ' ').title()}",
+ color=list(top_words_2.values()),
+ color_continuous_scale='Greens'
+ )
+ fig_freq2.update_layout(
+ height=400,
+ xaxis_title="Frequency",
+ yaxis_title="Words",
+ font=dict(family="Inter", size=10)
+ )
+ st.plotly_chart(fig_freq2, use_container_width=True)
+ else:
+ st.warning("⚠️ Need at least 2 translation columns for comparison analysis.")
+
+ else:
+ st.markdown("""
+
+
❌ No Data Available
+
+ Please ensure translation data files are available in the data directory.
+
+
+ """, unsafe_allow_html=True)
+
+ # Footer
+ st.markdown("---")
+ st.markdown("""
+
+ Built for DSFSI using Streamlit • Translation APIs: Gemini, GPT, NLLB (hosted locally) • Data Science for Social Impact
+
+ """, unsafe_allow_html=True)
-"""
-# Welcome to Streamlit!
-
-Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
-
-In the meantime, below is an example of what you can do with just a few lines of code:
-"""
-
-num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
-num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-
-indices = np.linspace(0, 1, num_points)
-theta = 2 * np.pi * num_turns * indices
-radius = indices
-
-x = radius * np.cos(theta)
-y = radius * np.sin(theta)
-
-df = pd.DataFrame({
- "x": x,
- "y": y,
- "idx": indices,
- "rand": np.random.randn(num_points),
-})
-
-st.altair_chart(alt.Chart(df, height=700, width=700)
- .mark_point(filled=True)
- .encode(
- x=alt.X("x", axis=None),
- y=alt.Y("y", axis=None),
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
- ))
\ No newline at end of file
+if __name__ == "__main__":
+ main()
\ No newline at end of file