Spaces:
Running
Running
# coding=utf-8 | |
import base64
import html
import json
import logging
import os
import re
import string
from typing import List, Tuple, Dict, Optional

import altair as alt
import fasttext
import matplotlib.pyplot as plt
import pandas as pd
import streamlit as st
from huggingface_hub import hf_hub_download
from transformers import pipeline
# Configure page | |
# Browser-tab title/icon, layout width and initial sidebar state for the app.
_PAGE_SETTINGS = {
    "page_title": "South African Language Identification",
    "page_icon": "πΏπ¦",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_PAGE_SETTINGS)
# Custom CSS for better styling | |
# Stylesheet for the header banner, model card, result container and metric
# cards; injected as raw HTML so the class names are available to later markup.
_CUSTOM_CSS = """
<style>
.main-header {
text-align: center;
padding: 1rem 0;
background: linear-gradient(90deg, #ff6b35, #f7931e);
color: white;
border-radius: 10px;
margin-bottom: 2rem;
}
.model-card {
background: #f8f9fa;
padding: 1rem;
border-radius: 8px;
border-left: 4px solid #ff6b35;
margin: 1rem 0;
}
.result-container {
background: white;
padding: 1.5rem;
border-radius: 10px;
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
margin: 1rem 0;
}
.metric-card {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 1rem;
border-radius: 8px;
text-align: center;
}
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
# Constants and Configuration | |
# One row per selectable model: (key, display name, Hub model id, description).
# Row order is the order in which the models are offered to the user.
_MODEL_ROWS = (
    ("za-bert", "ZA-BERT", "dsfsi/za-lid-bert",
     "Lightweight BERT-based model trained on South African languages"),
    ("xlmr-large", "XLM-R Large", "dsfsi/za-xlmrlarge-lid",
     "XLM-RoBERTa Large model fine-tuned for SA languages"),
    ("serengeti", "Serengeti", "dsfsi/za-serengeti-lid",
     "Afri-centric model with superior performance"),
    ("afriberta", "AfriBERTa", "dsfsi/za-afriberta-lid",
     "African-focused BERT model"),
    ("afro-xlmr", "Afro-XLM-R", "dsfsi/za-afro-xlmr-base-lid",
     "African-centric XLM-RoBERTa model"),
    ("afrolm", "AfroLM", "dsfsi/za-afrolm-lid",
     "African language model"),
)

# Mapping of model key -> {"name", "model_id", "description"[, "recommended"]}.
MODEL_CONFIGS = {
    key: {"name": name, "model_id": model_id, "description": description}
    for key, name, model_id, description in _MODEL_ROWS
}
# Only the default model carries the "recommended" flag.
MODEL_CONFIGS["za-bert"]["recommended"] = True
# Utility Functions | |
def load_language_names() -> Dict[str, str]:
    """Return the mapping from language code to display name.

    The mapping is read from ``assets/language_names.json``. When that file
    does not exist, a built-in table covering the South African languages
    (plus ``"und"`` for undetermined) is returned instead.

    Returns:
        Dict of language code -> human-readable language name.
    """
    try:
        # Read as UTF-8 explicitly: JSON is UTF-8 by specification, and the
        # platform default encoding may differ and corrupt non-ASCII names.
        with open("assets/language_names.json", "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        # Fallback mapping for common South African languages
        return {
            "afr": "Afrikaans",
            "eng": "English",
            "nso": "Northern Sotho",
            "sot": "Sesotho",
            "ssw": "Siswati",
            "tsn": "Setswana",
            "tso": "Xitsonga",
            "ven": "Tshivenda",
            "xho": "isiXhosa",
            "zul": "isiZulu",
            "nbl": "isiNdebele",
            "und": "Undetermined",
        }
# Keep at most two pipelines alive so switching between large checkpoints does
# not accumulate every model in memory.
@st.cache_resource(max_entries=2)
def _load_pipeline(model_id: str):
    """Build the text-classification pipeline for ``model_id``.

    Wrapped in ``st.cache_resource`` so the pipeline is constructed once per
    model id and reused on later script reruns. Exceptions propagate to the
    caller and are therefore not stored in the cache.
    """
    return pipeline("text-classification", model=model_id)


def load_model(model_key: str):
    """Load (and cache) the classification pipeline for a configured model.

    Args:
        model_key: A key of ``MODEL_CONFIGS``.

    Returns:
        The pipeline for the configured ``model_id``, or ``None`` when the
        key is unknown or the pipeline cannot be built. In the failure case
        the error is reported in the UI via ``st.error``.
    """
    try:
        config = MODEL_CONFIGS[model_key]
        return _load_pipeline(config["model_id"])
    except Exception as e:
        st.error(f"Error loading model {model_key}: {str(e)}")
        return None
# Characters blanked out before classification: colon, bullet (U+2022), hash,
# braces, pipe and every ASCII digit. The bullet is written as an escape so a
# mis-decoded source file cannot silently turn it into other characters.
_NOISE_TABLE = str.maketrans({ch: " " for ch in ":\u2022#{|}" + string.digits})
_WHITESPACE_RUN = re.compile(r"\s+")


def preprocess_text(text: str) -> str:
    """Clean and normalise input text before language identification.

    Noise characters (``:``, the bullet U+2022, ``#``, ``{``, ``|``, ``}``
    and ASCII digits) are replaced by spaces, then every run of whitespace
    (including newlines) is collapsed to a single space and the result is
    stripped.

    Args:
        text: Raw user text; may be empty or ``None``.

    Returns:
        The cleaned text, or ``""`` when the input is empty or whitespace-only.
    """
    if not text or not text.strip():
        return ""
    # Newlines need no separate handling: the whitespace collapse covers them.
    cleaned = text.translate(_NOISE_TABLE)
    return _WHITESPACE_RUN.sub(" ", cleaned).strip()
def get_language_name(label: str, lang_names: Dict[str, str]) -> str:
    """Translate a model label into a display name.

    Only the part of ``label`` before its first underscore is used as the
    lookup code; a label without an underscore is looked up as a whole. When
    the code is not in ``lang_names`` the original ``label`` is returned.
    """
    code_part, _, _ = label.partition("_")
    return lang_names.get(code_part, label)
def predict_language(text: str, model, lang_names: Dict[str, str]) -> Tuple[str, float, str]:
    """Predict the language of ``text`` with the given classification model.

    Args:
        text: Raw input text; it is cleaned with ``preprocess_text`` first.
        model: Callable classifier returning a list of dicts with ``"label"``
            and ``"score"`` keys (the pipeline produced by ``load_model``).
        lang_names: Mapping of language code -> display name.

    Returns:
        ``(label, confidence, language_name)`` for the first prediction.
        ``("und", 0.0, "Undetermined")`` is returned when there is no model,
        no usable text, or no prediction; ``("und", 0.0, "Error")`` when the
        model raises, in which case the error is also shown via ``st.error``.
    """
    undetermined = ("und", 0.0, "Undetermined")
    if not model or not text.strip():
        return undetermined
    try:
        processed_text = preprocess_text(text)
        if not processed_text:
            return undetermined
        # Ask the tokenizer to truncate: inputs longer than the model's
        # maximum sequence length otherwise make the forward pass fail.
        result = model(processed_text, truncation=True)
        if isinstance(result, list) and result:
            top = result[0]
            label = top['label']
            return label, top['score'], get_language_name(label, lang_names)
        return undetermined
    except Exception as e:
        st.error(f"Prediction error: {str(e)}")
        return "und", 0.0, "Error"
def create_confidence_plot(language: str, confidence: float) -> plt.Figure:
    """Draw a single horizontal bar showing the prediction confidence.

    The filled part of the bar spans ``0..confidence`` and the remainder up
    to 1 is drawn as a faint track. The title carries the language name and
    the confidence to three decimals; the percentage is printed in the
    middle of the filled part.

    Returns:
        The matplotlib figure holding the chart.
    """
    accent, track, ink = "#ff6b35", "#f8f9fa", "#2c3e50"
    bar_height = 0.6

    fig, ax = plt.subplots(figsize=(10, 2))

    # Filled portion, then the remaining track stacked to its right.
    ax.barh([0], [confidence], color=accent, height=bar_height, alpha=0.8)
    ax.barh([0], [1 - confidence], left=[confidence], color=track,
            height=bar_height, alpha=0.3)

    ax.set_xlim(0, 1)
    ax.set_ylim(-0.5, 0.5)
    ax.set_xlabel("Confidence Score", fontsize=12, color=ink)
    ax.set_title(
        f"Language: {language} (Confidence: {confidence:.3f})",
        fontsize=14,
        fontweight='bold',
        color=ink,
        pad=20,
    )

    # Only the bottom axis stays visible.
    ax.set_yticks([])
    for side in ('top', 'right', 'left'):
        ax.spines[side].set_visible(False)

    ax.text(
        confidence / 2,
        0,
        f"{confidence:.1%}",
        ha='center',
        va='center',
        fontweight='bold',
        color='white',
    )

    plt.tight_layout()
    return fig
def render_paper_info():
    """Show the research-paper summary next to a list of related links.

    The summary is placed in the wider left column and the links in the
    narrower right column (2:1 split).
    """
    summary_md = """
**"From N-grams to Pre-trained Multilingual Models For Language Identification"**
*Authors: Thapelo Andrew Sindane, Vukosi Marivate*
Published in: Proceedings of the 4th International Conference on Natural Language Processing for Digital Humanities (2024)
This research investigates N-gram models and large pre-trained multilingual models for Language Identification
across 11 South African languages, showing that Serengeti performs best across all model types.
"""
    links_md = """
**Links:**
- [π Paper](https://aclanthology.org/2024.nlp4dh-1.22/)
- [π€ HuggingFace](https://huggingface.co/dsfsi)
- [π» GitHub](https://github.com/dsfsi/za-lid)
"""
    st.markdown("### π Research Paper")
    summary_col, links_col = st.columns([2, 1])
    with summary_col:
        st.markdown(summary_md)
    with links_col:
        st.markdown(links_md)
def render_citation():
    """Display the paper's BibTeX entry as a code block.

    The editor names are written with their proper accented characters
    (``Hämäläinen``, ``Öhman``) rather than as mis-decoded byte sequences.
    """
    citation = """@inproceedings{sindane-marivate-2024-n,
title = "From N-grams to Pre-trained Multilingual Models For Language Identification",
author = "Sindane, Thapelo Andrew and Marivate, Vukosi",
editor = "Hämäläinen, Mika and Öhman, Emily and Miyagawa, So and Alnajjar, Khalid and Bizzoni, Yuri",
booktitle = "Proceedings of the 4th International Conference on Natural Language Processing for Digital Humanities",
month = nov,
year = "2024",
address = "Miami, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.nlp4dh-1.22/",
doi = "10.18653/v1/2024.nlp4dh-1.22",
pages = "229--239"
}"""
    st.code(citation, language='bibtex')
# NOTE(review): several icon/glyph strings in this block look mis-encoded
# (UTF-8 decoded as a single-byte Greek code page). They are kept exactly as
# found because the lost bytes cannot be recovered from here; restore them
# from the original file.

# Session-state key of the single-text input, shared with the Clear callback.
_SINGLE_TEXT_KEY = "single_text_input"


def _model_option_label(model_key: str) -> str:
    """Return the select-box label for a model, prefixed when recommended."""
    config = MODEL_CONFIGS[model_key]
    prefix = 'β ' if config.get('recommended') else ''
    return f"{prefix}{config['name']}"


def _render_sidebar() -> str:
    """Draw the sidebar (model picker, model card, language list).

    Returns:
        The ``MODEL_CONFIGS`` key of the selected model.
    """
    with st.sidebar:
        st.header("βοΈ Model Configuration")
        selected_model = st.selectbox(
            "Choose Model:",
            options=list(MODEL_CONFIGS.keys()),
            format_func=_model_option_label,
            index=0,
            help="Select the language identification model"
        )
        model_config = MODEL_CONFIGS[selected_model]
        st.markdown(f"""
<div class="model-card">
<h4>{model_config['name']}</h4>
<p>{model_config['description']}</p>
</div>
""", unsafe_allow_html=True)
        st.subheader("π Supported Languages")
        supported_langs = [
            "π΄σ Ίσ ‘σ Ίσ ‘σ Ώ Afrikaans", "π¬π§ English", "π Northern Sotho",
            "π Sesotho", "π Siswati", "π Setswana",
            "π Xitsonga", "π Tshivenda", "π isiXhosa",
            "π isiZulu", "π isiNdebele"
        ]
        for lang in supported_langs:
            st.write(f"β’ {lang}")
    return selected_model


def _metric_card(value: str, caption: str) -> None:
    """Render one metric card; ``value`` is HTML-escaped before insertion."""
    # The value can originate from model labels or the language-names file,
    # so it is escaped before being placed into raw HTML.
    st.markdown(f"""
<div class="metric-card">
<h3>{html.escape(value)}</h3>
<p>{caption}</p>
</div>
""", unsafe_allow_html=True)


def _reset_single_text() -> None:
    """Button callback: empty the single-text input before the next rerun."""
    st.session_state[_SINGLE_TEXT_KEY] = ""


def _render_single_tab(selected_model: str, lang_names: Dict[str, str]) -> None:
    """Render the single-text tab: input box, buttons and prediction result."""
    st.header("Single Text Analysis")
    user_text = st.text_area(
        "Enter text to identify language:",
        placeholder="Type or paste your text here...",
        height=100,
        help="Enter text in any South African language",
        key=_SINGLE_TEXT_KEY,
    )
    analyze_col, clear_col, _ = st.columns([1, 1, 2])
    with analyze_col:
        analyze_button = st.button("π Analyze", type="primary", use_container_width=True)
    with clear_col:
        # A bare rerun keeps the widget's value; resetting the keyed state in
        # a callback is what actually clears the text area.
        st.button("ποΈ Clear", use_container_width=True, on_click=_reset_single_text)

    if not analyze_button:
        return
    if not user_text.strip():
        st.warning("Please enter some text to analyze.")
        return

    with st.spinner("Analyzing language..."):
        model = load_model(selected_model)
        if not model:
            st.error("Failed to load the model. Please try again.")
            return
        label, confidence, language_name = predict_language(user_text, model, lang_names)

        st.markdown("### π Results")
        cards = (
            (language_name, "Detected Language"),
            (f"{confidence:.1%}", "Confidence"),
            (label, "Language Code"),
        )
        for column, (value, caption) in zip(st.columns(3), cards):
            with column:
                _metric_card(value, caption)

        st.markdown("### π Confidence Visualization")
        fig = create_confidence_plot(language_name, confidence)
        st.pyplot(fig)
        # Figures made through pyplot stay registered until closed; release
        # this one so repeated analyses do not accumulate figures.
        plt.close(fig)


def _read_uploaded_texts(uploaded_file) -> Optional[List[str]]:
    """Extract the texts to analyse from an uploaded ``.csv`` or text file.

    Returns:
        The list of texts, or ``None`` when a CSV lacks the ``text`` column
        (an error message is shown in that case).
    """
    if uploaded_file.name.lower().endswith('.csv'):
        df = pd.read_csv(uploaded_file)
        if 'text' not in df.columns:
            st.error("CSV file must contain a 'text' column")
            return None
        # Drop missing cells first so they are not classified as "nan".
        return df['text'].dropna().astype(str).tolist()
    # utf-8-sig also accepts plain UTF-8 and strips a leading BOM if present.
    content = uploaded_file.read().decode('utf-8-sig')
    return [line.strip() for line in content.split('\n') if line.strip()]


def _render_bulk_tab(selected_model: str, lang_names: Dict[str, str]) -> None:
    """Render the bulk tab: upload, per-line prediction, charts and download."""
    st.header("Bulk Text Analysis")
    uploaded_file = st.file_uploader(
        "Upload a text file",
        type=['txt', 'csv'],
        help="Upload a .txt file with one sentence per line, or a CSV file with a 'text' column"
    )
    if not uploaded_file:
        return
    try:
        texts = _read_uploaded_texts(uploaded_file)
        if texts is None:
            # Return instead of stopping the script so later tabs still render.
            return
        if not texts:
            st.warning("The uploaded file contains no text to analyze.")
            return
        st.success(f"Loaded {len(texts)} texts for analysis")
        if not st.button("π Analyze All", type="primary"):
            return
        model = load_model(selected_model)
        if not model:
            st.error("Failed to load the model.")
            return

        results = []
        total = len(texts)
        progress_bar = st.progress(0)
        for done, text in enumerate(texts, start=1):
            label, confidence, language_name = predict_language(text, model, lang_names)
            results.append({
                'Text': text[:100] + '...' if len(text) > 100 else text,
                'Language': language_name,
                'Code': label,
                'Confidence': confidence
            })
            progress_bar.progress(done / total)

        results_df = pd.DataFrame(results)
        st.markdown("### π Analysis Results")
        st.dataframe(results_df, use_container_width=True)

        counts_col, conf_col = st.columns(2)
        with counts_col:
            st.markdown("### π Language Distribution")
            st.bar_chart(results_df['Language'].value_counts())
        with conf_col:
            st.markdown("### π Average Confidence by Language")
            avg_conf = results_df.groupby('Language')['Confidence'].mean().sort_values(ascending=False)
            st.bar_chart(avg_conf)

        st.download_button(
            label="π₯ Download Results (CSV)",
            data=results_df.to_csv(index=False),
            file_name="language_identification_results.csv",
            mime="text/csv"
        )
    except Exception as e:
        # UI boundary: report any parsing/analysis failure instead of crashing.
        st.error(f"Error processing file: {str(e)}")


def _render_about_tab() -> None:
    """Render the About tab: paper info, citation and acknowledgments."""
    render_paper_info()
    st.markdown("---")
    st.markdown("### π Citation")
    render_citation()
    st.markdown("---")
    # NOTE(review): the e-mail entry below looks like an obfuscation
    # placeholder rather than a real address — confirm against the original.
    st.markdown("""
### ποΈ Acknowledgments
This work is part of the Data Science for Social Impact Research Group at the University of Pretoria.
**Contact:**
- π§ Email: [email protected]
- π¦ Twitter: [@VukosiiM](https://twitter.com/VukosiiM)
- π Website: [dsfsi.github.io](https://dsfsi.github.io)
""")


def main():
    """Build the page: header banner, sidebar and the three content tabs."""
    st.markdown("""
<div class="main-header">
<h1>πΏπ¦ South African Language Identification</h1>
<p>Multilingual Language Detection for South African Languages</p>
</div>
""", unsafe_allow_html=True)

    lang_names = load_language_names()
    selected_model = _render_sidebar()

    tab1, tab2, tab3 = st.tabs(["π Single Text", "π Bulk Analysis", "π About"])
    with tab1:
        _render_single_tab(selected_model, lang_names)
    with tab2:
        _render_bulk_tab(selected_model, lang_names)
    with tab3:
        _render_about_tab()
# Script entry point: build the page when this module is executed directly.
if __name__ == "__main__":
    main()