nso-en-translation

Sleeping

App Files Files Community

vukosi committed on Jun 15

Commit

aae664e

verified ·

1 Parent(s): 92737f3

Update app.py

Browse files

Files changed (1) hide show

app.py +346 -62

app.py CHANGED Viewed

@@ -1,81 +1,365 @@
 import gradio as gr
 from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
 model_name = "dsfsi/nso-en-m2m100-gov"
 tokenizer = M2M100Tokenizer.from_pretrained(model_name)
 model = M2M100ForConditionalGeneration.from_pretrained(model_name)
 tokenizer.src_lang = "ns"
 model.config.forced_bos_token_id = tokenizer.get_lang_id("en")
-def translate(inp):
-    inputs = tokenizer(inp, return_tensors="pt")
-    translated_tokens = model.generate(**inputs, max_length=512, forced_bos_token_id=tokenizer.get_lang_id("en"))
-    translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
-    return translated_text
-with gr.Blocks() as demo:
-    with gr.Row():
-        with gr.Column(scale=1):
-            pass
-        with gr.Column(scale=4, min_width=1000):
-            gr.Image("logo_transparent_small.png", elem_id="logo", show_label=False, width=500)
-            gr.Markdown("""
-            <h1 style='text-align: center;'>Northern Sotho to English Translation</h1>
-            <p style='text-align: center;'>This space provides a translation service from Northern Sotho to English using the M2M100 model, fine-tuned for low-resource languages. It supports researchers, linguists, and users working with Northern Sotho texts.</p>
-            """)
-        with gr.Column(scale=1):
-            pass
-    with gr.Column(variant="panel"):
-        inp_text = gr.Textbox(lines=5, placeholder="Enter Northern Sotho text (maximum 5 lines)", label="Input", elem_id="centered-input")
-        output_text = gr.Textbox(label="Output", elem_id="centered-output")
-        translate_button = gr.Button("Translate", elem_id="centered-button")
-        translate_button.click(translate, inputs=inp_text, outputs=output_text)
-    gr.Markdown("""
-    <div style='text-align: center;'>
-        <a href='https://github.com/dsfsi/nso-en-m2m100-gov' target='_blank'>GitHub</a> |
-        <a href='https://docs.google.com/forms/d/e/1FAIpQLSf7S36dyAUPx2egmXbFpnTBuzoRulhL5Elu-N1eoMhaO7v10w/viewform' target='_blank'>Feedback Form</a> |
-        <a href='https://arxiv.org/abs/2303.03750' target='_blank'>Arxiv</a>
-    </div>
-    <br/>
-    """)
-    with gr.Accordion("More Information", open=False):
-        gr.Markdown("""
-        <h4 style="text-align: center;">Model Description</h4>
-        <p style='text-align: center;'>This is a variant of the M2M100 model, fine-tuned on a multilingual dataset to support translation from Northern Sotho (Sepedi) to English. The model was trained with a focus on improving translation accuracy for low-resource languages.</p>
-        """)
-        gr.Markdown("""
-        <h4 style="text-align: center;">Authors</h4>
-        <div style='text-align: center;'>
-            Vukosi Marivate, Matimba Shingange, Richard Lastrucci,
-            Isheanesu Joseph Dzingirai, Jenalea Rajab
         </div>
         """)
         gr.Markdown("""
-        <h4 style="text-align: center;">Citation</h4>
-        <pre style="text-align: center; white-space: pre-wrap;">
         @inproceedings{lastrucci-etal-2023-preparing,
-            title = "Preparing the Vuk{'}uzenzele and {ZA}-gov-multilingual {S}outh {A}frican multilingual corpora",
             author = "Richard Lastrucci and Isheanesu Dzingirai and Jenalea Rajab
                       and Andani Madodonga and Matimba Shingange and Daniel Njini and Vukosi Marivate",
             booktitle = "Proceedings of the Fourth workshop on Resources for African Indigenous Languages (RAIL 2023)",
-            month = may,
-            year = "2023",
-            address = "Dubrovnik, Croatia",
-            publisher = "Association for Computational Linguistics",
-            url = "https://aclanthology.org/2023.rail-1.3",
-            pages = "18--25"
         }
-        </pre>
-        """)
-        gr.Markdown("""
-        <h4 style="text-align: center;">DOI</h4>
-        <div style='text-align: center;'>
-            <a href="https://doi.org/10.48550/arXiv.2303.03750" target="_blank">10.48550/arXiv.2303.03750</a>
-        </div>
         """)
-demo.launch()

 import gradio as gr
+import torch
 from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
+import pandas as pd
+import time
+import re
+import tempfile
+import os
+import uuid
+# Model loading: runs once at import time; the tokenizer and model below are
+# module-level globals used by the translation function.
 model_name = "dsfsi/nso-en-m2m100-gov"
 tokenizer = M2M100Tokenizer.from_pretrained(model_name)
 model = M2M100ForConditionalGeneration.from_pretrained(model_name)
+# Source-language tag for tokenization ("ns" is presumably the model's code
+# for Northern Sotho; confirm against the model card).
 tokenizer.src_lang = "ns"
+# Default generation to start with the English language token.
 model.config.forced_bos_token_id = tokenizer.get_lang_id("en")
+# Translation function (single)
+def translate_nso_en(text):
+    """Translate Northern Sotho (Sepedi) text to English.
+
+    Args:
+        text: Source text. ``None``, empty, or whitespace-only input is
+            treated as "nothing to translate".
+
+    Returns:
+        The decoded English translation, or a prompt asking the user for
+        input when there is nothing to translate.
+    """
+    # Guard against None as well as blank strings so a missing value cannot
+    # raise AttributeError on ``.strip()``.
+    if text is None or not text.strip():
+        return "Please enter Northern Sotho (Sepedi) text."
+    # Inputs longer than 512 tokens are truncated rather than rejected.
+    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
+    translated_tokens = model.generate(
+        **inputs,
+        max_length=512,
+        forced_bos_token_id=tokenizer.get_lang_id("en")
+    )
+    # generate() returns a batch; only one sequence was submitted.
+    return tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
+# Linguistic analysis
+def calculate_metrics(text):
+    """Compute surface-level statistics for a piece of text.
+
+    Words are whitespace-separated tokens; sentences are the non-blank
+    pieces left after splitting on runs of ``.``, ``!`` or ``?``.
+
+    Args:
+        text: The string to measure.
+
+    Returns:
+        dict with the keys ``char_count``, ``word_count``,
+        ``sentence_count``, ``unique_words``, ``avg_word_length`` and
+        ``lexical_diversity``. The last two are ``0`` when the text
+        contains no words.
+    """
+    tokens = text.split()
+    n_tokens = len(tokens)
+    n_distinct = len(set(tokens))
+    sentences = [piece for piece in re.split(r'[.!?]+', text) if piece.strip()]
+
+    if n_tokens:
+        mean_len = sum(map(len, tokens)) / n_tokens
+        diversity = n_distinct / n_tokens
+    else:
+        # No words at all: report zero instead of dividing by zero.
+        mean_len = 0
+        diversity = 0
+
+    return dict(
+        char_count=len(text),
+        word_count=n_tokens,
+        sentence_count=len(sentences),
+        unique_words=n_distinct,
+        avg_word_length=mean_len,
+        lexical_diversity=diversity,
+    )
+def create_metrics_table(src_metrics, tgt_metrics):
+    """Lay out source and target metrics side by side as a DataFrame.
+
+    Args:
+        src_metrics: Metrics dict for the source text; missing keys count as 0.
+        tgt_metrics: Metrics dict for the translated text; missing keys count as 0.
+
+    Returns:
+        A DataFrame with the columns ``Metric``, ``Source Text`` and
+        ``Target Text``. Counts are kept as-is; average word length and
+        lexical diversity are rendered as strings with 1 and 3 decimals.
+    """
+    # (row label, metrics key, format spec -- None keeps the raw value)
+    layout = (
+        ('Words', 'word_count', None),
+        ('Characters', 'char_count', None),
+        ('Sentences', 'sentence_count', None),
+        ('Unique Words', 'unique_words', None),
+        ('Avg Word Length', 'avg_word_length', '.1f'),
+        ('Lexical Diversity', 'lexical_diversity', '.3f'),
+    )
+
+    def column(metrics):
+        cells = []
+        for _, key, spec in layout:
+            value = metrics.get(key, 0)
+            cells.append(value if spec is None else format(value, spec))
+        return cells
+
+    return pd.DataFrame({
+        'Metric': [label for label, _, _ in layout],
+        'Source Text': column(src_metrics),
+        'Target Text': column(tgt_metrics),
+    })
+def translate_and_analyze(text):
+    """Translate text and build a comparative linguistic report.
+
+    Args:
+        text: Northern Sotho (Sepedi) source text. ``None`` or blank input
+            short-circuits with placeholder outputs.
+
+    Returns:
+        ``(translation, report, table)``: the English translation, a
+        Markdown report comparing source and target metrics, and the same
+        metrics as a DataFrame.
+    """
+    if text is None or not text.strip():
+        return "Please enter Northern Sotho (Sepedi) text.", "No analysis available.", create_metrics_table({}, {})
+
+    # perf_counter is monotonic, so the measured duration cannot be skewed by
+    # system clock adjustments the way a time.time() difference can.
+    start = time.perf_counter()
+    translated = translate_nso_en(text)
+    src_metrics = calculate_metrics(text)
+    tgt_metrics = calculate_metrics(translated)
+    elapsed = time.perf_counter() - start
+
+    def row(label, key, spec='', floor=1):
+        """Format one Markdown table row: source, target and target/source ratio."""
+        src = src_metrics.get(key, 0)
+        tgt = tgt_metrics.get(key, 0)
+        # Clamp the denominator so a zero-valued source metric cannot divide by zero.
+        ratio = tgt / max(src, floor)
+        return f"| {label} | {src:{spec}} | {tgt:{spec}} | {ratio:.2f} |"
+
+    rows = "\n".join([
+        row("Word Count", 'word_count'),
+        row("Character Count", 'char_count'),
+        row("Sentence Count", 'sentence_count'),
+        row("Avg Word Length", 'avg_word_length', '.1f'),
+        row("Lexical Diversity", 'lexical_diversity', '.3f', floor=0.001),
+    ])
+    report = (
+        "## 📊 Linguistic Analysis Report\n"
+        "### Translation Details\n"
+        f"- **Processing Time**: {elapsed:.2f} seconds\n"
+        "### Text Complexity Metrics\n"
+        "| Metric | Source | Target | Ratio |\n"
+        "|--------|--------|--------|-------|\n"
+        f"{rows}\n"
+    )
+    table = create_metrics_table(src_metrics, tgt_metrics)
+    return translated, report, table
+# Batch processing
+def secure_batch_processing(file_obj):
+    """Translate the entries of an uploaded ``.txt`` or ``.csv`` file.
+
+    The upload is copied into a fresh temporary directory, read from there,
+    and the directory is removed again before returning, whatever the outcome.
+
+    Args:
+        file_obj: The upload, either a filesystem path (``str`` or
+            ``os.PathLike``) or an object exposing the path as ``.name``.
+            ``None`` means nothing was uploaded.
+
+    Returns:
+        ``(summary, results)``: a status message and a DataFrame with the
+        columns ``Index``, ``Original`` and ``Translation``. On any failure
+        the message describes the problem and the DataFrame is empty.
+    """
+    import shutil  # local import: not among this module's top-level imports
+
+    max_file_bytes = 5 * 1024 * 1024  # the upload widget advertises "Max 5MB"
+    max_batch_size = 10
+    max_text_chars = 1000
+    preview_chars = 100
+
+    if file_obj is None:
+        return "Please upload a file.", pd.DataFrame()
+
+    def preview(value):
+        """Shorten a string for display in the results table."""
+        return value[:preview_chars] + '...' if len(value) > preview_chars else value
+
+    temp_dir = None
+    try:
+        # Accept a plain path as well as a wrapper object carrying ``.name``.
+        if isinstance(file_obj, (str, os.PathLike)):
+            source_path = os.fspath(file_obj)
+        else:
+            source_path = file_obj.name
+
+        file_ext = os.path.splitext(source_path)[1].lower()
+        if file_ext not in ('.txt', '.csv'):
+            return "Only .txt and .csv files are supported.", pd.DataFrame()
+        # Enforce the size limit promised in the UI before copying or parsing.
+        if os.path.getsize(source_path) > max_file_bytes:
+            return "The uploaded file is larger than the 5MB limit.", pd.DataFrame()
+
+        session_id = str(uuid.uuid4())
+        temp_dir = tempfile.mkdtemp(prefix=f"translation_{session_id}_")
+        temp_file_path = os.path.join(temp_dir, f"upload_{session_id}{file_ext}")
+        shutil.copy2(source_path, temp_file_path)
+
+        if file_ext == '.csv':
+            # NOTE(review): read_csv treats the first row as a header, so that
+            # row is never translated -- confirm this is intended.
+            df = pd.read_csv(temp_file_path)
+            if df.empty:
+                return "The uploaded CSV file is empty.", pd.DataFrame()
+            texts = df.iloc[:, 0].dropna().astype(str).tolist()
+        else:
+            # utf-8-sig drops a leading byte-order mark so it cannot end up
+            # inside the first entry; BOM-less UTF-8 is read unchanged.
+            with open(temp_file_path, 'r', encoding='utf-8-sig') as f:
+                texts = [line.strip() for line in f if line.strip()]
+
+        if not texts:
+            return "No text found in the uploaded file.", pd.DataFrame()
+
+        warning_msg = ""
+        if len(texts) > max_batch_size:
+            texts = texts[:max_batch_size]
+            warning_msg = f"Processing limited to first {max_batch_size} entries for performance."
+
+        results = []
+        for i, text in enumerate(texts, start=1):
+            if not text.strip():
+                continue
+            if len(text) > max_text_chars:
+                text = text[:max_text_chars] + "..."
+            translated = translate_nso_en(text)
+            results.append({
+                'Index': i,
+                'Original': preview(text),
+                'Translation': preview(translated),
+            })
+
+        if not results:
+            return "No valid text entries found to translate.", pd.DataFrame()
+
+        summary = f"Successfully processed {len(results)} text entries."
+        if warning_msg:
+            summary = f"{summary} {warning_msg}"
+        return summary, pd.DataFrame(results)
+    except Exception as e:
+        # UI boundary: surface the failure as a message instead of raising.
+        return f"Error processing file: {str(e)}", pd.DataFrame()
+    finally:
+        # Always remove the temporary copy, on success and on failure alike.
+        if temp_dir and os.path.exists(temp_dir):
+            try:
+                shutil.rmtree(temp_dir)
+            except Exception as e:
+                print(f"Warning: Could not clean up temporary directory: {e}")
+# Sample Northern Sotho (Sepedi) sentences offered as clickable examples in
+# the UI. Each entry is a single-element list: one value for the one input box.
+EXAMPLES = [
+    ["Leina la ka ke Vukosi."],
+    ["Ke leboga thušo ya gago."],
+    ["Re a go amogela mo Pretoria."],
+    ["Go tloga ka letšatši la lehono, dilo di tlo kaonafala."],
+    ["O swanetše go hwetša thušo ge go kgonega."],
+    ["Ngwana o ya sekolong letšatšing le lengwe le le lengwe."]
+]
+# Research tools
+def detailed_analysis(text):
+    """Return a JSON-friendly analysis of a piece of text.
+
+    Args:
+        text: Text to analyse. ``None`` or blank input yields an empty dict.
+
+    Returns:
+        ``{}`` when there is nothing to analyse; otherwise a dict with
+        ``basic_metrics`` (the output of ``calculate_metrics``),
+        ``text_length`` (character count) and ``analysis_completed`` (``True``).
+    """
+    # Treat a missing value like blank input instead of failing on ``.strip()``.
+    if text is None or not text.strip():
+        return {}
+    return {
+        "basic_metrics": calculate_metrics(text),
+        "text_length": len(text),
+        "analysis_completed": True
+    }
+def create_gradio_interface():
+    """Build the Gradio Blocks UI for the translation tool.
+
+    The page has a header, three tabs (translation with analysis, batch file
+    processing, and text analysis) and a footer with model information and
+    the citation. Each tab's button is wired to the matching handler:
+    ``translate_and_analyze``, ``secure_batch_processing`` and
+    ``detailed_analysis``.
+
+    Returns:
+        The constructed ``gr.Blocks`` app; it is not launched here.
+    """
+    with gr.Blocks(
+        title="🔬 Northern Sotho-English Linguistic Translation Tool",
+        theme=gr.themes.Soft(),
+        css="""
+        .gradio-container {font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;}
+        .main-header {text-align: center; padding: 2rem 0;}
+        .dsfsi-logo {text-align: center; margin-bottom: 1rem;}
+        .dsfsi-logo img {max-width: 300px; height: auto;}
+        .metric-table {font-size: 0.9em;}
+        """
+    ) as demo:
+        # Page header: logo (loaded from a remote URL) and title banner.
+        gr.HTML("""
+        <div class="dsfsi-logo">
+            <img src="https://www.dsfsi.co.za/images/logo_transparent_expanded.png" alt="DSFSI Logo" />
+        </div>
+        <div class="main-header">
+            <h1>🔬 Northern Sotho-English Linguistic Translation Tool</h1>
+            <p style="font-size: 1.1em; color: #666; max-width: 800px; margin: 0 auto;">
+                AI-powered translation system for Northern Sotho (Sepedi) to English with detailed linguistic analysis, designed for linguists, researchers, and language documentation projects.
+            </p>
         </div>
         """)
+        with gr.Tabs():
+            # Tab 1: single-text translation plus report and metrics table.
+            with gr.Tab("🌐 Translation & Analysis"):
+                gr.Markdown("""
+                ### Real-time Translation with Linguistic Analysis
+                Translate from Northern Sotho (Sepedi) to English and get detailed linguistic insights.
+                """)
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        input_text = gr.Textbox(
+                            label="Northern Sotho (Sepedi) Input",
+                            placeholder="Enter text to translate...",
+                            lines=4,
+                            max_lines=10
+                        )
+                        translate_btn = gr.Button("🔄 Translate & Analyze", variant="primary", size="lg")
+                    with gr.Column(scale=1):
+                        output_text = gr.Textbox(
+                            label="Translation (English)",
+                            lines=4,
+                            interactive=False
+                        )
+                gr.Markdown("### 📚 Example Translations")
+                gr.Examples(
+                    examples=EXAMPLES,
+                    inputs=[input_text],
+                    label="Click an example to try it:"
+                )
+                with gr.Accordion("📊 Detailed Linguistic Analysis", open=False):
+                    analysis_output = gr.Markdown(label="Analysis Report")
+                with gr.Accordion("📈 Metrics Table", open=False):
+                    metrics_table = gr.Dataframe(
+                        label="Comparative Metrics",
+                        headers=["Metric", "Source Text", "Target Text"],
+                        interactive=False
+                    )
+                # One click fills the translation, the report and the table.
+                translate_btn.click(
+                    fn=translate_and_analyze,
+                    inputs=input_text,
+                    outputs=[output_text, analysis_output, metrics_table]
+                )
+            # Tab 2: batch translation of an uploaded .txt/.csv file.
+            with gr.Tab("📁 Batch Processing"):
+                gr.Markdown("""
+                ### Secure Corpus Analysis & Batch Translation
+                Upload text or CSV files for batch translation and analysis. Files are processed securely and temporarily.
+                """)
+                with gr.Row():
+                    with gr.Column():
+                        file_upload = gr.File(
+                            label="Upload File (Max 5MB)",
+                            file_types=[".txt", ".csv"],
+                            type="filepath",
+                            file_count="single"
+                        )
+                        batch_btn = gr.Button("🔄 Process Batch", variant="primary")
+                        gr.Markdown("""
+                        **Supported formats:**
+                        - `.txt` files: One text per line
+                        - `.csv` files: Text in first column
+                        - **Security limits**: Max 10 entries, 1000 chars per text
+                        - **Privacy**: Files are deleted after processing
+                        """)
+                    with gr.Column():
+                        batch_summary = gr.Textbox(
+                            label="Processing Summary",
+                            lines=3,
+                            interactive=False
+                        )
+                        batch_results = gr.Dataframe(
+                            label="Translation Results",
+                            interactive=False,
+                            wrap=True
+                        )
+                # The handler returns (summary message, results DataFrame).
+                batch_btn.click(
+                    fn=secure_batch_processing,
+                    inputs=file_upload,
+                    outputs=[batch_summary, batch_results]
+                )
+            # Tab 3: metrics-only analysis of arbitrary text, shown as JSON.
+            with gr.Tab("🔬 Research Tools"):
+                gr.Markdown("""
+                ### Advanced Linguistic Analysis Tools
+                Analyze text for linguistic features.
+                """)
+                with gr.Row():
+                    with gr.Column():
+                        research_text = gr.Textbox(
+                            label="Text for Analysis",
+                            lines=6,
+                            placeholder="Enter Northern Sotho (Sepedi) or English text...",
+                            max_lines=15
+                        )
+                        analyze_btn = gr.Button("🔍 Analyze Text", variant="primary")
+                    with gr.Column():
+                        research_output = gr.JSON(
+                            label="Detailed Analysis Results"
+                        )
+                analyze_btn.click(
+                    fn=detailed_analysis,
+                    inputs=research_text,
+                    outputs=research_output
+                )
+                gr.Markdown("""
+                ### 🗣️ About Northern Sotho (Sepedi) Language
+                **Northern Sotho (Sepedi)** is a Bantu language spoken by millions of people, primarily in:
+                - 🇿🇦 **South Africa** – Official language
+                **Key Linguistic Features:**
+                - **Language Family**: Niger-Congo → Bantu → Sotho-Tswana
+                - **Script**: Latin alphabet
+                - **Characteristics**: Agglutinative, noun-class system
+                - **ISO Code**: nso (ISO 639-2/3)
+                """)
+        # Footer shown below the tabs: model info, privacy notes, citation, links.
         gr.Markdown("""
+        ---
+        ### 📚 Model Information & Citation
+        **Model Used:** [`dsfsi/nso-en-m2m100-gov`](https://huggingface.co/dsfsi/nso-en-m2m100-gov)
+        Based on Meta's M2M100, fine-tuned specifically for Northern Sotho-English by the **Data Science for Social Impact Research Group**.
+        **Training Data:** Vuk'uzenzele and ZA-gov-multilingual South African corpora.
+        ### 🔒 Privacy & Security
+        - No conversation history stored
+        - Uploaded files deleted after processing
+        - All processing in isolated temporary environments
+        - No user data persistence
+        ### 🙏 Acknowledgments
+        We thank **Thapelo Sindani** and **Zion Nia Van Wyk** for their assistance in creating this space.
+        ### 📖 Citation
+        ```bibtex
         @inproceedings{lastrucci-etal-2023-preparing,
+            title = "Preparing the Vuk'uzenzele and ZA-gov-multilingual South African multilingual corpora",
             author = "Richard Lastrucci and Isheanesu Dzingirai and Jenalea Rajab
                       and Andani Madodonga and Matimba Shingange and Daniel Njini and Vukosi Marivate",
             booktitle = "Proceedings of the Fourth workshop on Resources for African Indigenous Languages (RAIL 2023)",
+            pages = "18--25",
+            year = "2023"
         }
+        ```
+        **Links**:
+        - [DSFSI](https://www.dsfsi.co.za/)
+        - [Model](https://huggingface.co/dsfsi/nso-en-m2m100-gov)
+        - [Vuk'uzenzele Data](https://github.com/dsfsi/vukuzenzele-nlp)
+        - [ZA-gov Data](https://github.com/dsfsi/gov-za-multilingual)
+        - [Research Feedback](https://docs.google.com/forms/d/e/1FAIpQLSf7S36dyAUPx2egmXbFpnTBuzoRulhL5Elu-N1eoMhaO7v10w/viewform)
+        ---
+        **Built for the African NLP community**
         """)
+    # Hand the app back to the caller, which decides how to launch it.
+    return demo
+# Script entry point: build the UI and serve it.
+if __name__ == "__main__":
+    demo = create_gradio_interface()
+    demo.launch(
+        # NOTE(review): share=True asks for a public share link; presumably
+        # unnecessary when the hosting platform already exposes the app -- confirm.
+        share=True,
+        server_name="0.0.0.0",  # listen on all network interfaces
+        server_port=7860,
+        show_error=True
+    )