dsfsi-lid-space / app.py
vukosi's picture
Update app.py
590ee2f verified
# coding=utf-8
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import altair as alt
from transformers import pipeline
import fasttext
from huggingface_hub import hf_hub_download
import json
import os
import re
import string
import base64
from typing import List, Tuple, Dict, Optional
import logging
# Configure page (must be the first Streamlit command executed by the script).
st.set_page_config(
    page_title="South African Language Identification",
    # South African flag emoji; the previous literal was the same bytes
    # mis-decoded into Greek/Latin mojibake, which is not a valid emoji icon.
    page_icon="🇿🇦",
    layout="wide",
    initial_sidebar_state="expanded",
)
# Custom CSS for better styling: header banner, sidebar model card,
# result container, and the gradient metric cards used in the results view.
_CUSTOM_CSS = """
<style>
.main-header {
text-align: center;
padding: 1rem 0;
background: linear-gradient(90deg, #ff6b35, #f7931e);
color: white;
border-radius: 10px;
margin-bottom: 2rem;
}
.model-card {
background: #f8f9fa;
padding: 1rem;
border-radius: 8px;
border-left: 4px solid #ff6b35;
margin: 1rem 0;
}
.result-container {
background: white;
padding: 1.5rem;
border-radius: 10px;
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
margin: 1rem 0;
}
.metric-card {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 1rem;
border-radius: 8px;
text-align: center;
}
</style>
"""

# The stylesheet is raw HTML, so Streamlit must be told not to escape it.
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
# Constants and Configuration
#
# Registry of selectable models, keyed by a short identifier.  Every entry
# carries a display ``name``, the Hugging Face ``model_id`` and a one-line
# ``description``; ``recommended`` is optional and only set on the default.
MODEL_CONFIGS = {
    "za-bert": dict(
        name="ZA-BERT",
        model_id="dsfsi/za-lid-bert",
        description="Lightweight BERT-based model trained on South African languages",
        recommended=True,
    ),
    "xlmr-large": dict(
        name="XLM-R Large",
        model_id="dsfsi/za-xlmrlarge-lid",
        description="XLM-RoBERTa Large model fine-tuned for SA languages",
    ),
    "serengeti": dict(
        name="Serengeti",
        model_id="dsfsi/za-serengeti-lid",
        description="Afri-centric model with superior performance",
    ),
    "afriberta": dict(
        name="AfriBERTa",
        model_id="dsfsi/za-afriberta-lid",
        description="African-focused BERT model",
    ),
    "afro-xlmr": dict(
        name="Afro-XLM-R",
        model_id="dsfsi/za-afro-xlmr-base-lid",
        description="African-centric XLM-RoBERTa model",
    ),
    "afrolm": dict(
        name="AfroLM",
        model_id="dsfsi/za-afrolm-lid",
        description="African language model",
    ),
}
# Utility Functions

# Built-in code -> display-name mapping used when the JSON asset is unusable.
_FALLBACK_LANGUAGE_NAMES = {
    "afr": "Afrikaans",
    "eng": "English",
    "nso": "Northern Sotho",
    "sot": "Sesotho",
    "ssw": "Siswati",
    "tsn": "Setswana",
    "tso": "Xitsonga",
    "ven": "Tshivenda",
    "xho": "isiXhosa",
    "zul": "isiZulu",
    "nbl": "isiNdebele",
    "und": "Undetermined",
}


@st.cache_data
def load_language_names() -> Dict[str, str]:
    """Load the language-code to language-name mapping.

    Reads ``assets/language_names.json`` as UTF-8 (explicitly, so the result
    does not depend on the platform's default encoding).  If the file is
    missing, unreadable, not valid JSON, or does not hold a JSON object, a
    built-in mapping for the South African languages is returned instead of
    letting the app crash at start-up.

    Returns:
        A dict mapping language codes to display names.
    """
    try:
        with open("assets/language_names.json", "r", encoding="utf-8") as f:
            names = json.load(f)
    except (OSError, ValueError):
        # OSError covers missing/unreadable files; ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError.
        return dict(_FALLBACK_LANGUAGE_NAMES)
    if not isinstance(names, dict):
        return dict(_FALLBACK_LANGUAGE_NAMES)
    return names
@st.cache_resource
def _load_pipeline(model_id: str):
    """Build and cache the text-classification pipeline for ``model_id``.

    Errors are deliberately allowed to propagate: the cache only stores
    return values, so a failed load is not remembered and can be retried.
    """
    return pipeline("text-classification", model=model_id)


def load_model(model_key: str):
    """Load (or fetch from cache) the model registered under ``model_key``.

    Only successfully built pipelines are cached.  Previously the ``None``
    failure result was returned from inside the cached function, so a single
    transient failure (e.g. a network error during download) was cached and
    every later attempt for that model failed without retrying.

    Args:
        model_key: A key of ``MODEL_CONFIGS``.

    Returns:
        The loaded pipeline, or ``None`` if loading failed (an error message
        is shown in the app in that case).
    """
    try:
        config = MODEL_CONFIGS[model_key]
        return _load_pipeline(config["model_id"])
    except Exception as e:
        st.error(f"Error loading model {model_key}: {str(e)}")
        return None
# Characters blanked out before classification: colon, bullet (U+2022),
# hash, braces, pipe and ASCII digits.  The bullet is written as an escape
# so it cannot be mis-encoded; the earlier mojibake form of it also blanked
# out unrelated characters such as the typographic apostrophe.
_STRIP_TABLE = {ord(ch): " " for ch in ":\u2022#{|}" + string.digits}

# Any run of whitespace (spaces, tabs, newlines) collapses to one space.
_WHITESPACE_RE = re.compile(r"\s+")


def preprocess_text(text: str) -> str:
    """Clean and normalize input text before language identification.

    Replaces the characters in ``_STRIP_TABLE`` with spaces, collapses all
    whitespace runs (including newlines) to a single space and trims the ends.

    Args:
        text: Raw user text; ``None`` or blank input is accepted.

    Returns:
        The cleaned text, or ``""`` when nothing is left to classify.
    """
    if not text or not text.strip():
        return ""
    cleaned = text.translate(_STRIP_TABLE)
    return _WHITESPACE_RE.sub(" ", cleaned).strip()
def get_language_name(label: str, lang_names: Dict[str, str]) -> str:
    """Resolve a model label to a human-readable language name.

    The language code is the part of ``label`` before the first underscore
    (the whole label when there is none).  If the code is not present in
    ``lang_names`` the original label is returned unchanged.
    """
    iso_code, _, _ = label.partition('_')
    return lang_names.get(iso_code, label)
def predict_language(text: str, model, lang_names: Dict[str, str]) -> Tuple[str, float, str]:
    """Predict the language of ``text`` with a text-classification pipeline.

    Args:
        text: Raw input text; ``None`` or blank input yields "Undetermined".
        model: The loaded pipeline (as returned by ``load_model``), or a
            falsy value when no model is available.
        lang_names: Mapping from language code to display name.

    Returns:
        A ``(label, confidence, language_name)`` tuple.  When nothing can be
        predicted the result is ``("und", 0.0, "Undetermined")``; when the
        model raises, an error is shown in the app and
        ``("und", 0.0, "Error")`` is returned.
    """
    undetermined = ("und", 0.0, "Undetermined")
    if not model or not text or not text.strip():
        return undetermined
    try:
        processed_text = preprocess_text(text)
        if not processed_text:
            return undetermined
        # Ask the tokenizer to truncate: without this, inputs longer than the
        # model's maximum sequence length make the pipeline raise instead of
        # classifying the leading part of the text.
        result = model(processed_text, truncation=True, max_length=512)
        if isinstance(result, list) and result:
            prediction = result[0]
            label = prediction['label']
            confidence = prediction['score']
            language_name = get_language_name(label, lang_names)
            return label, confidence, language_name
        return undetermined
    except Exception as e:
        st.error(f"Prediction error: {str(e)}")
        return "und", 0.0, "Error"
def create_confidence_plot(language: str, confidence: float) -> plt.Figure:
    """Create a horizontal-bar visualization of a prediction's confidence.

    The figure is built with the object-oriented Matplotlib API rather than
    ``plt.subplots``: pyplot keeps every figure it creates in a global
    registry until it is explicitly closed, which leaks memory in a
    long-running server process and is not safe across concurrent sessions.

    Args:
        language: Display name shown in the title.
        confidence: Score in ``[0, 1]``; drawn as the filled part of the bar.

    Returns:
        A standalone Matplotlib figure, ready to be rendered by the caller.
    """
    # Local import keeps the block self-contained; same class as plt.Figure.
    from matplotlib.figure import Figure

    fig = Figure(figsize=(10, 2))
    ax = fig.subplots()

    # Colors
    primary_color = "#ff6b35"
    bg_color = "#f8f9fa"
    text_color = "#2c3e50"

    # Filled part of the bar, then the remainder up to 1.0 as a faint track.
    ax.barh([0], [confidence], color=primary_color, height=0.6, alpha=0.8)
    ax.barh([0], [1 - confidence], left=[confidence], color=bg_color, height=0.6, alpha=0.3)

    # Styling
    ax.set_xlim(0, 1)
    ax.set_ylim(-0.5, 0.5)
    ax.set_xlabel("Confidence Score", fontsize=12, color=text_color)
    ax.set_title(
        f"Language: {language} (Confidence: {confidence:.3f})",
        fontsize=14, fontweight='bold', color=text_color, pad=20,
    )

    # Remove y-axis and spines; only the bottom (score) axis stays visible.
    ax.set_yticks([])
    for side in ('top', 'right', 'left'):
        ax.spines[side].set_visible(False)

    # Percentage label centered inside the filled part of the bar.
    ax.text(
        confidence / 2, 0, f"{confidence:.1%}",
        ha='center', va='center', fontweight='bold', color='white',
    )

    fig.tight_layout()
    return fig
# Markdown for the "Research Paper" section.  Emoji are stored as real
# characters; they were previously mis-decoded (mojibake) and rendered as
# Greek/Latin gibberish in the UI.
_PAPER_SUMMARY_MD = """
**"From N-grams to Pre-trained Multilingual Models For Language Identification"**
*Authors: Thapelo Andrew Sindane, Vukosi Marivate*
Published in: Proceedings of the 4th International Conference on Natural Language Processing for Digital Humanities (2024)
This research investigates N-gram models and large pre-trained multilingual models for Language Identification
across 11 South African languages, showing that Serengeti performs best across all model types.
"""

_PAPER_LINKS_MD = """
**Links:**
- [📖 Paper](https://aclanthology.org/2024.nlp4dh-1.22/)
- [🤗 HuggingFace](https://huggingface.co/dsfsi)
- [💻 GitHub](https://github.com/dsfsi/za-lid)
"""


def render_paper_info():
    """Render the paper summary and its external links in two columns."""
    st.markdown("### 📄 Research Paper")
    # Summary gets twice the width of the links column.
    col1, col2 = st.columns([2, 1])
    with col1:
        st.markdown(_PAPER_SUMMARY_MD)
    with col2:
        st.markdown(_PAPER_LINKS_MD)
# BibTeX entry for the paper.  The editor names "Hämäläinen" and "Öhman"
# were previously mis-decoded (mojibake), which produced a wrong citation
# for anyone copying it.
_BIBTEX_CITATION = """@inproceedings{sindane-marivate-2024-n,
title = "From N-grams to Pre-trained Multilingual Models For Language Identification",
author = "Sindane, Thapelo Andrew and Marivate, Vukosi",
editor = "Hämäläinen, Mika and Öhman, Emily and Miyagawa, So and Alnajjar, Khalid and Bizzoni, Yuri",
booktitle = "Proceedings of the 4th International Conference on Natural Language Processing for Digital Humanities",
month = nov,
year = "2024",
address = "Miami, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.nlp4dh-1.22/",
doi = "10.18653/v1/2024.nlp4dh-1.22",
pages = "229--239"
}"""


def render_citation():
    """Render the paper's BibTeX citation as a copyable code block."""
    st.code(_BIBTEX_CITATION, language='bibtex')
# Session-state key shared by the single-text input and its "Clear" callback.
_SINGLE_TEXT_KEY = "single_text_input"

# Page banner; styled by the ``main-header`` CSS class.
_HEADER_HTML = """
<div class="main-header">
<h1>🇿🇦 South African Language Identification</h1>
<p>Multilingual Language Detection for South African Languages</p>
</div>
"""

# Sidebar list of supported languages.  The first entry keeps the original
# emoji tag sequence exactly as it decodes from the source bytes.
_SUPPORTED_LANGUAGES = (
    "\U0001F3F4\U000E007A\U000E0061\U000E007A\U000E0061\U000E007F Afrikaans",
    "🇬🇧 English",
    "🌍 Northern Sotho",
    "🌍 Sesotho",
    "🌍 Siswati",
    "🌍 Setswana",
    "🌍 Xitsonga",
    "🌍 Tshivenda",
    "🌍 isiXhosa",
    "🌍 isiZulu",
    "🌍 isiNdebele",
)

_ACKNOWLEDGMENTS_MD = """
### 🏛️ Acknowledgments
This work is part of the Data Science for Social Impact Research Group at the University of Pretoria.
**Contact:**
- 📧 Email: [email protected]
- 🐦 Twitter: [@VukosiiM](https://twitter.com/VukosiiM)
- 🌐 Website: [dsfsi.github.io](https://dsfsi.github.io)
"""


def _clear_single_text() -> None:
    """Button callback: empty the single-text input before the script reruns."""
    st.session_state[_SINGLE_TEXT_KEY] = ""


def _metric_card_html(value: str, caption: str) -> str:
    """Return the HTML of one ``metric-card`` showing ``value`` over ``caption``."""
    from html import escape

    # Values come from model labels / the language-name file and are inserted
    # into raw HTML, so escape them.
    return (
        '<div class="metric-card">\n'
        f"<h3>{escape(str(value))}</h3>\n"
        f"<p>{escape(caption)}</p>\n"
        "</div>"
    )


def _render_sidebar() -> str:
    """Render the sidebar and return the key of the selected model."""
    with st.sidebar:
        st.header("⚙️ Model Configuration")
        selected_model = st.selectbox(
            "Choose Model:",
            options=list(MODEL_CONFIGS.keys()),
            format_func=lambda x: f"{'⭐ ' if MODEL_CONFIGS[x].get('recommended') else ''}{MODEL_CONFIGS[x]['name']}",
            index=0,
            help="Select the language identification model",
        )

        # Short description card for the selected model.
        model_config = MODEL_CONFIGS[selected_model]
        st.markdown(
            '<div class="model-card">\n'
            f"<h4>{model_config['name']}</h4>\n"
            f"<p>{model_config['description']}</p>\n"
            "</div>",
            unsafe_allow_html=True,
        )

        st.subheader("📋 Supported Languages")
        for lang in _SUPPORTED_LANGUAGES:
            st.write(f"• {lang}")
    return selected_model


def _render_single_text_tab(selected_model: str, lang_names: Dict[str, str]) -> None:
    """Render the single-text tab: input box, buttons and prediction results."""
    st.header("Single Text Analysis")
    user_text = st.text_area(
        "Enter text to identify language:",
        placeholder="Type or paste your text here...",
        height=100,
        help="Enter text in any South African language",
        key=_SINGLE_TEXT_KEY,
    )
    col1, col2, _ = st.columns([1, 1, 2])
    with col1:
        analyze_button = st.button("🔍 Analyze", type="primary", use_container_width=True)
    with col2:
        # A plain rerun keeps the widget's value, so clearing has to go
        # through session state in a callback.
        st.button("🗑️ Clear", use_container_width=True, on_click=_clear_single_text)

    if not analyze_button:
        return
    if not user_text.strip():
        st.warning("Please enter some text to analyze.")
        return

    with st.spinner("Analyzing language..."):
        model = load_model(selected_model)
        if not model:
            st.error("Failed to load the model. Please try again.")
            return
        label, confidence, language_name = predict_language(user_text, model, lang_names)

    st.markdown("### 📊 Results")
    cards = (
        (language_name, "Detected Language"),
        (f"{confidence:.1%}", "Confidence"),
        (label, "Language Code"),
    )
    for column, (value, caption) in zip(st.columns(3), cards):
        with column:
            st.markdown(_metric_card_html(value, caption), unsafe_allow_html=True)

    st.markdown("### 📈 Confidence Visualization")
    fig = create_confidence_plot(language_name, confidence)
    st.pyplot(fig)
    # Release the figure if pyplot is tracking it, so repeated analyses do
    # not accumulate open figures (no-op for untracked figures).
    plt.close(fig)


def _read_uploaded_texts(uploaded_file) -> Optional[List[str]]:
    """Extract the texts to classify from an uploaded ``.txt`` or ``.csv`` file.

    Returns:
        The list of texts, or ``None`` after showing an error when a CSV has
        no ``text`` column.
    """
    # Compare case-insensitively so "DATA.CSV" is not parsed as plain text.
    if uploaded_file.name.lower().endswith('.csv'):
        df = pd.read_csv(uploaded_file)
        if 'text' not in df.columns:
            st.error("CSV file must contain a 'text' column")
            return None
        # Empty cells become "" (reported as Undetermined) instead of the
        # literal string "nan" being classified; row count is preserved.
        return df['text'].fillna('').astype(str).tolist()
    # utf-8-sig drops a leading byte-order mark if the editor wrote one.
    content = uploaded_file.getvalue().decode('utf-8-sig')
    return [line.strip() for line in content.split('\n') if line.strip()]


def _render_bulk_results(results_df) -> None:
    """Show the bulk results table, summary charts and CSV download."""
    st.markdown("### 📊 Analysis Results")
    st.dataframe(results_df, use_container_width=True)

    col1, col2 = st.columns(2)
    with col1:
        st.markdown("### 📈 Language Distribution")
        st.bar_chart(results_df['Language'].value_counts())
    with col2:
        st.markdown("### 📊 Average Confidence by Language")
        avg_conf = results_df.groupby('Language')['Confidence'].mean().sort_values(ascending=False)
        st.bar_chart(avg_conf)

    st.download_button(
        label="📥 Download Results (CSV)",
        data=results_df.to_csv(index=False),
        file_name="language_identification_results.csv",
        mime="text/csv",
    )


def _render_bulk_tab(selected_model: str, lang_names: Dict[str, str]) -> None:
    """Render the bulk tab: file upload, per-line prediction and summary."""
    st.header("Bulk Text Analysis")
    uploaded_file = st.file_uploader(
        "Upload a text file",
        type=['txt', 'csv'],
        help="Upload a .txt file with one sentence per line, or a CSV file with a 'text' column",
    )
    if not uploaded_file:
        return
    try:
        texts = _read_uploaded_texts(uploaded_file)
        if texts is None:
            # Return instead of st.stop(): stopping the whole script would
            # also prevent the tabs rendered after this one from appearing.
            return
        if not texts:
            st.warning("The uploaded file contains no text to analyze.")
            return
        st.success(f"Loaded {len(texts)} texts for analysis")

        if not st.button("🚀 Analyze All", type="primary"):
            return
        model = load_model(selected_model)
        if not model:
            st.error("Failed to load the model.")
            return

        results = []
        progress_bar = st.progress(0)
        total = len(texts)
        for done, text in enumerate(texts, start=1):
            label, confidence, language_name = predict_language(text, model, lang_names)
            results.append({
                # Keep the table readable: show at most 100 characters.
                'Text': text[:100] + '...' if len(text) > 100 else text,
                'Language': language_name,
                'Code': label,
                'Confidence': confidence,
            })
            progress_bar.progress(done / total)

        _render_bulk_results(pd.DataFrame(results))
    except Exception as e:
        st.error(f"Error processing file: {str(e)}")


def _render_about_tab() -> None:
    """Render the About tab: paper info, citation and acknowledgments."""
    render_paper_info()
    st.markdown("---")
    st.markdown("### 📖 Citation")
    render_citation()
    st.markdown("---")
    st.markdown(_ACKNOWLEDGMENTS_MD)


def main():
    """Entry point: render the header, sidebar and the three app tabs."""
    st.markdown(_HEADER_HTML, unsafe_allow_html=True)

    lang_names = load_language_names()
    selected_model = _render_sidebar()

    tab1, tab2, tab3 = st.tabs(["🔍 Single Text", "📁 Bulk Analysis", "📄 About"])
    with tab1:
        _render_single_text_tab(selected_model, lang_names)
    with tab2:
        _render_bulk_tab(selected_model, lang_names)
    with tab3:
        _render_about_tab()


if __name__ == "__main__":
    main()