vukosi committed on
Commit
aae664e
·
verified ·
1 Parent(s): 92737f3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +346 -62
app.py CHANGED
@@ -1,81 +1,365 @@
1
  import gradio as gr
 
2
  from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
 
 
 
 
 
 
3
 
 
4
  model_name = "dsfsi/nso-en-m2m100-gov"
5
  tokenizer = M2M100Tokenizer.from_pretrained(model_name)
6
  model = M2M100ForConditionalGeneration.from_pretrained(model_name)
7
-
8
  tokenizer.src_lang = "ns"
9
  model.config.forced_bos_token_id = tokenizer.get_lang_id("en")
10
 
11
- def translate(inp):
12
- inputs = tokenizer(inp, return_tensors="pt")
13
- translated_tokens = model.generate(**inputs, max_length=512, forced_bos_token_id=tokenizer.get_lang_id("en"))
14
- translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
15
- return translated_text
16
-
17
- with gr.Blocks() as demo:
18
- with gr.Row():
19
- with gr.Column(scale=1):
20
- pass
21
- with gr.Column(scale=4, min_width=1000):
22
- gr.Image("logo_transparent_small.png", elem_id="logo", show_label=False, width=500)
23
- gr.Markdown("""
24
- <h1 style='text-align: center;'>Northern Sotho to English Translation</h1>
25
- <p style='text-align: center;'>This space provides a translation service from Northern Sotho to English using the M2M100 model, fine-tuned for low-resource languages. It supports researchers, linguists, and users working with Northern Sotho texts.</p>
26
- """)
27
- with gr.Column(scale=1):
28
- pass
29
-
30
- with gr.Column(variant="panel"):
31
- inp_text = gr.Textbox(lines=5, placeholder="Enter Northern Sotho text (maximum 5 lines)", label="Input", elem_id="centered-input")
32
- output_text = gr.Textbox(label="Output", elem_id="centered-output")
33
- translate_button = gr.Button("Translate", elem_id="centered-button")
34
- translate_button.click(translate, inputs=inp_text, outputs=output_text)
35
-
36
- gr.Markdown("""
37
- <div style='text-align: center;'>
38
- <a href='https://github.com/dsfsi/nso-en-m2m100-gov' target='_blank'>GitHub</a> |
39
- <a href='https://docs.google.com/forms/d/e/1FAIpQLSf7S36dyAUPx2egmXbFpnTBuzoRulhL5Elu-N1eoMhaO7v10w/viewform' target='_blank'>Feedback Form</a> |
40
- <a href='https://arxiv.org/abs/2303.03750' target='_blank'>Arxiv</a>
41
- </div>
42
- <br/>
43
- """)
44
-
45
- with gr.Accordion("More Information", open=False):
46
- gr.Markdown("""
47
- <h4 style="text-align: center;">Model Description</h4>
48
- <p style='text-align: center;'>This is a variant of the M2M100 model, fine-tuned on a multilingual dataset to support translation from Northern Sotho (Sepedi) to English. The model was trained with a focus on improving translation accuracy for low-resource languages.</p>
49
- """)
50
- gr.Markdown("""
51
- <h4 style="text-align: center;">Authors</h4>
52
- <div style='text-align: center;'>
53
- Vukosi Marivate, Matimba Shingange, Richard Lastrucci,
54
- Isheanesu Joseph Dzingirai, Jenalea Rajab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  </div>
56
  """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  gr.Markdown("""
58
- <h4 style="text-align: center;">Citation</h4>
59
- <pre style="text-align: center; white-space: pre-wrap;">
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  @inproceedings{lastrucci-etal-2023-preparing,
61
- title = "Preparing the Vuk{'}uzenzele and {ZA}-gov-multilingual {S}outh {A}frican multilingual corpora",
62
  author = "Richard Lastrucci and Isheanesu Dzingirai and Jenalea Rajab
63
  and Andani Madodonga and Matimba Shingange and Daniel Njini and Vukosi Marivate",
64
  booktitle = "Proceedings of the Fourth workshop on Resources for African Indigenous Languages (RAIL 2023)",
65
- month = may,
66
- year = "2023",
67
- address = "Dubrovnik, Croatia",
68
- publisher = "Association for Computational Linguistics",
69
- url = "https://aclanthology.org/2023.rail-1.3",
70
- pages = "18--25"
71
  }
72
- </pre>
73
- """)
74
- gr.Markdown("""
75
- <h4 style="text-align: center;">DOI</h4>
76
- <div style='text-align: center;'>
77
- <a href="https://doi.org/10.48550/arXiv.2303.03750" target="_blank">10.48550/arXiv.2303.03750</a>
78
- </div>
 
 
79
  """)
 
80
 
81
- demo.launch()
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
+ import torch
3
  from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
4
+ import pandas as pd
5
+ import time
6
+ import re
7
+ import tempfile
8
+ import os
9
+ import uuid
10
 
11
# Model loading: the checkpoint is fetched once at import time and the
# resulting objects are shared by every request handler below.
model_name = "dsfsi/nso-en-m2m100-gov"

# Tokenizer, with its source-language code set to "ns".
tokenizer = M2M100Tokenizer.from_pretrained(model_name)
tokenizer.src_lang = "ns"

# Model, configured so generated sequences begin with the "en" language token.
model = M2M100ForConditionalGeneration.from_pretrained(model_name)
model.config.forced_bos_token_id = tokenizer.get_lang_id("en")
17
 
18
+ # Translation function (single)
19
def translate_nso_en(text):
    """Translate a Northern Sotho (Sepedi) string into English.

    Args:
        text: Source text. ``None``, empty and whitespace-only values are
            all treated as "no input".

    Returns:
        The decoded English translation, or a prompt message when there is
        nothing to translate.
    """
    # Check for None before calling .strip(): a missing value would otherwise
    # raise AttributeError instead of producing the prompt message.
    if text is None or not text.strip():
        return "Please enter Northern Sotho (Sepedi) text."

    # Inputs longer than 512 tokens are truncated by the tokenizer.
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
    translated_tokens = model.generate(
        **inputs,
        max_length=512,
        forced_bos_token_id=tokenizer.get_lang_id("en"),
    )
    # Only one sequence was submitted, so decode the first result.
    return tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
29
+
30
+ # Linguistic analysis
31
def calculate_metrics(text):
    """Compute simple surface statistics for ``text``.

    Words are whitespace-separated tokens. Sentences are the non-blank
    pieces obtained by splitting on runs of ``.``, ``!`` and ``?``.

    Args:
        text: The string to analyse.

    Returns:
        A dict with the keys ``char_count``, ``word_count``,
        ``sentence_count``, ``unique_words``, ``avg_word_length`` and
        ``lexical_diversity``. The two ratios are ``0`` for text that
        contains no words.
    """
    words = text.split()
    word_count = len(words)

    # Count distinct words on normalised tokens: surrounding punctuation is
    # stripped and case is folded, so "The", "the" and "the." are one word.
    # Comparing raw tokens would overstate lexical diversity.
    normalised = (re.sub(r'^\W+|\W+$', '', word).casefold() for word in words)
    unique_words = len({token for token in normalised if token})

    sentence_count = sum(1 for part in re.split(r'[.!?]+', text) if part.strip())

    # Average length is taken over the raw tokens, attached punctuation included.
    avg_word_length = sum(map(len, words)) / word_count if word_count else 0
    lexical_div = unique_words / word_count if word_count else 0

    return {
        'char_count': len(text),
        'word_count': word_count,
        'sentence_count': sentence_count,
        'unique_words': unique_words,
        'avg_word_length': avg_word_length,
        'lexical_diversity': lexical_div,
    }
47
+
48
def create_metrics_table(src_metrics, tgt_metrics):
    """Arrange source and target metrics side by side in a DataFrame.

    Args:
        src_metrics: Metrics dict for the source text. Missing keys count as 0.
        tgt_metrics: Metrics dict for the translated text. Missing keys count as 0.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Metric``, ``Source Text``
        and ``Target Text`` and one row per metric.
    """
    def as_column(metrics):
        # Counts stay numeric; the two ratios are pre-formatted as strings.
        counts = [
            metrics.get(key, 0)
            for key in ('word_count', 'char_count', 'sentence_count', 'unique_words')
        ]
        ratios = [
            f"{metrics.get('avg_word_length', 0):.1f}",
            f"{metrics.get('lexical_diversity', 0):.3f}",
        ]
        return counts + ratios

    labels = [
        'Words',
        'Characters',
        'Sentences',
        'Unique Words',
        'Avg Word Length',
        'Lexical Diversity',
    ]
    table = {
        'Metric': labels,
        'Source Text': as_column(src_metrics),
        'Target Text': as_column(tgt_metrics),
    }
    return pd.DataFrame(table)
69
+
70
def translate_and_analyze(text):
    """Translate ``text`` and build a comparative report of both texts.

    Args:
        text: Northern Sotho (Sepedi) input. ``None``, empty and
            whitespace-only values are treated as "no input".

    Returns:
        A ``(translation, report, table)`` tuple: the English translation,
        a Markdown report comparing source and target metrics, and the
        metrics table. With no input, the first two are placeholder
        messages and the table is built from empty metrics.
    """
    # Check for None before .strip() so a missing value yields the prompt
    # instead of an AttributeError.
    if text is None or not text.strip():
        return "Please enter Northern Sotho (Sepedi) text.", "No analysis available.", create_metrics_table({}, {})

    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by wall-clock adjustments.
    start = time.perf_counter()
    translated = translate_nso_en(text)
    src_metrics = calculate_metrics(text)
    tgt_metrics = calculate_metrics(translated)
    elapsed = time.perf_counter() - start

    def table_row(label, key, fmt, floor):
        # One Markdown row. The divisor is clamped to `floor` so an empty
        # or zero-valued source metric cannot cause a division by zero.
        src = src_metrics.get(key, 0)
        tgt = tgt_metrics.get(key, 0)
        ratio = tgt / max(src, floor)
        return f"| {label} | {src:{fmt}} | {tgt:{fmt}} | {ratio:.2f} |"

    report_lines = [
        "## 📊 Linguistic Analysis Report",
        "",
        "### Translation Details",
        f"- **Processing Time**: {elapsed:.2f} seconds",
        "",
        "### Text Complexity Metrics",
        "| Metric | Source | Target | Ratio |",
        "|--------|--------|--------|-------|",
        table_row("Word Count", 'word_count', "", 1),
        table_row("Character Count", 'char_count', "", 1),
        table_row("Sentence Count", 'sentence_count', "", 1),
        table_row("Avg Word Length", 'avg_word_length', ".1f", 1),
        table_row("Lexical Diversity", 'lexical_diversity', ".3f", 0.001),
    ]
    report = "\n".join(report_lines) + "\n"

    table = create_metrics_table(src_metrics, tgt_metrics)
    return translated, report, table
94
+
95
+ # Batch processing
96
def secure_batch_processing(file_obj):
    """Translate up to ten text entries from an uploaded ``.txt`` or ``.csv`` file.

    ``.txt`` files contribute one entry per non-blank line. ``.csv`` files
    contribute the non-null values of their first column, as parsed by
    ``pandas.read_csv`` with its default header handling. The upload is
    copied into a private temporary directory that is removed before
    returning.

    Args:
        file_obj: The uploaded file. May be ``None`` (nothing uploaded), a
            filesystem path (``str`` or ``os.PathLike``), or an object
            whose ``.name`` attribute holds the path.

    Returns:
        A ``(summary, results)`` tuple: a status message and a
        ``pandas.DataFrame`` with ``Index``, ``Original`` and
        ``Translation`` columns. The frame is empty when nothing was
        translated or an error occurred.
    """
    # Function-scope import: shutil is not in the module's import block.
    import shutil

    max_batch_size = 10                # entries translated per upload
    max_text_chars = 1000              # characters kept per entry
    max_file_bytes = 5 * 1024 * 1024   # the upload widget is labelled "Max 5MB"
    preview_chars = 100                # characters shown per result cell

    if file_obj is None:
        return "Please upload a file.", pd.DataFrame()

    temp_dir = None
    try:
        # The upload widget is configured with type="filepath", so the
        # callback receives a plain path string that has no .name
        # attribute. Objects exposing .name are still accepted.
        if isinstance(file_obj, (str, os.PathLike)):
            source_path = os.fspath(file_obj)
        else:
            source_path = file_obj.name

        file_ext = os.path.splitext(source_path)[1].lower()
        if file_ext not in ('.txt', '.csv'):
            return "Only .txt and .csv files are supported.", pd.DataFrame()
        if os.path.getsize(source_path) > max_file_bytes:
            return "The uploaded file exceeds the 5MB limit.", pd.DataFrame()

        session_id = str(uuid.uuid4())
        temp_dir = tempfile.mkdtemp(prefix=f"translation_{session_id}_")
        temp_file_path = os.path.join(temp_dir, f"upload_{session_id}{file_ext}")
        shutil.copy2(source_path, temp_file_path)

        if file_ext == '.csv':
            df = pd.read_csv(temp_file_path)
            if df.empty:
                return "The uploaded CSV file is empty.", pd.DataFrame()
            texts = df.iloc[:, 0].dropna().astype(str).tolist()
        else:
            # utf-8-sig reads plain UTF-8 too and drops a leading BOM, which
            # would otherwise end up inside the first entry.
            with open(temp_file_path, 'r', encoding='utf-8-sig') as f:
                texts = [line.strip() for line in f.read().split('\n') if line.strip()]

        if not texts:
            return "No text found in the uploaded file.", pd.DataFrame()

        warning_msg = ""
        if len(texts) > max_batch_size:
            texts = texts[:max_batch_size]
            warning_msg = f"Processing limited to first {max_batch_size} entries for performance."

        results = []
        for position, text in enumerate(texts, start=1):
            # CSV cells can be whitespace-only; skip them but keep numbering
            # aligned with the entry's position in the file.
            if not text.strip():
                continue
            if len(text) > max_text_chars:
                text = text[:max_text_chars] + "..."
            translated = translate_nso_en(text)
            results.append({
                'Index': position,
                'Original': text[:preview_chars] + '...' if len(text) > preview_chars else text,
                'Translation': translated[:preview_chars] + '...' if len(translated) > preview_chars else translated,
            })

        if not results:
            return "No valid text entries found to translate.", pd.DataFrame()

        summary = f"Successfully processed {len(results)} text entries."
        if warning_msg:
            summary = f"{summary} {warning_msg}"
        return summary, pd.DataFrame(results)
    except Exception as e:
        # UI boundary: report the failure to the user instead of raising.
        return f"Error processing file: {str(e)}", pd.DataFrame()
    finally:
        # Best-effort cleanup; a failure here must not mask the result.
        if temp_dir and os.path.exists(temp_dir):
            try:
                shutil.rmtree(temp_dir)
            except Exception as e:
                print(f"Warning: Could not clean up temporary directory: {e}")
155
+
156
+ # Examples
157
# Sample Northern Sotho sentences offered as click-to-fill inputs.
# Each entry is a one-element list: one value for the single input textbox.
# The letter "š" is written directly in UTF-8; mis-decoded bytes here would
# be shown to users and fed to the model verbatim.
EXAMPLES = [
    ["Leina la ka ke Vukosi."],
    ["Ke leboga thušo ya gago."],
    ["Re a go amogela mo Pretoria."],
    ["Go tloga ka letšatši la lehono, dilo di tlo kaonafala."],
    ["O swanetše go hwetša thušo ge go kgonega."],
    ["Ngwana o ya sekolong letšatšing le lengwe le le lengwe."],
]
165
+
166
+ # Research tools
167
def detailed_analysis(text):
    """Return a JSON-serialisable analysis of ``text``.

    Args:
        text: The string to analyse. ``None``, empty and whitespace-only
            values are treated as "no input".

    Returns:
        An empty dict when there is no input. Otherwise a dict with
        ``basic_metrics`` (the result of ``calculate_metrics``),
        ``text_length`` and ``analysis_completed``.
    """
    # Check for None before .strip() so a missing value yields an empty
    # result instead of an AttributeError.
    if text is None or not text.strip():
        return {}
    return {
        "basic_metrics": calculate_metrics(text),
        "text_length": len(text),
        "analysis_completed": True,
    }
176
+
177
def create_gradio_interface():
    """Build the Gradio Blocks application and return it without launching.

    The layout is a header, three tabs (single-text translation with
    analysis, batch file translation, and a text-analysis tool) and a
    footer with model information, privacy notes and the citation.

    All emoji, dashes and arrows in the labels are written as proper
    UTF-8 characters so they render as intended in the browser.

    Returns:
        The assembled ``gr.Blocks`` instance.
    """
    with gr.Blocks(
        title="🔬 Northern Sotho-English Linguistic Translation Tool",
        theme=gr.themes.Soft(),
        css="""
        .gradio-container {font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;}
        .main-header {text-align: center; padding: 2rem 0;}
        .dsfsi-logo {text-align: center; margin-bottom: 1rem;}
        .dsfsi-logo img {max-width: 300px; height: auto;}
        .metric-table {font-size: 0.9em;}
        """,
    ) as demo:

        # Page header: logo, title and one-paragraph description.
        gr.HTML("""
        <div class="dsfsi-logo">
            <img src="https://www.dsfsi.co.za/images/logo_transparent_expanded.png" alt="DSFSI Logo" />
        </div>
        <div class="main-header">
            <h1>🔬 Northern Sotho-English Linguistic Translation Tool</h1>
            <p style="font-size: 1.1em; color: #666; max-width: 800px; margin: 0 auto;">
                AI-powered translation system for Northern Sotho (Sepedi) to English with detailed linguistic analysis, designed for linguists, researchers, and language documentation projects.
            </p>
        </div>
        """)

        with gr.Tabs():
            # Tab 1: translate one text and show its analysis.
            with gr.Tab("🌐 Translation & Analysis"):
                gr.Markdown("""
                ### Real-time Translation with Linguistic Analysis
                Translate from Northern Sotho (Sepedi) to English and get detailed linguistic insights.
                """)
                with gr.Row():
                    with gr.Column(scale=1):
                        input_text = gr.Textbox(
                            label="Northern Sotho (Sepedi) Input",
                            placeholder="Enter text to translate...",
                            lines=4,
                            max_lines=10,
                        )
                        translate_btn = gr.Button("🔄 Translate & Analyze", variant="primary", size="lg")
                    with gr.Column(scale=1):
                        output_text = gr.Textbox(
                            label="Translation (English)",
                            lines=4,
                            interactive=False,
                        )
                gr.Markdown("### 📚 Example Translations")
                gr.Examples(
                    examples=EXAMPLES,
                    inputs=[input_text],
                    label="Click an example to try it:",
                )
                with gr.Accordion("📊 Detailed Linguistic Analysis", open=False):
                    analysis_output = gr.Markdown(label="Analysis Report")
                with gr.Accordion("📈 Metrics Table", open=False):
                    metrics_table = gr.Dataframe(
                        label="Comparative Metrics",
                        headers=["Metric", "Source Text", "Target Text"],
                        interactive=False,
                    )
                # The handler returns (translation, report, table) in this order.
                translate_btn.click(
                    fn=translate_and_analyze,
                    inputs=input_text,
                    outputs=[output_text, analysis_output, metrics_table],
                )

            # Tab 2: translate the entries of an uploaded file.
            with gr.Tab("📁 Batch Processing"):
                gr.Markdown("""
                ### Secure Corpus Analysis & Batch Translation
                Upload text or CSV files for batch translation and analysis. Files are processed securely and temporarily.
                """)
                with gr.Row():
                    with gr.Column():
                        file_upload = gr.File(
                            label="Upload File (Max 5MB)",
                            file_types=[".txt", ".csv"],
                            type="filepath",
                            file_count="single",
                        )
                        batch_btn = gr.Button("🔄 Process Batch", variant="primary")
                        gr.Markdown("""
                        **Supported formats:**
                        - `.txt` files: One text per line
                        - `.csv` files: Text in first column
                        - **Security limits**: Max 10 entries, 1000 chars per text
                        - **Privacy**: Files are deleted after processing
                        """)
                    with gr.Column():
                        batch_summary = gr.Textbox(
                            label="Processing Summary",
                            lines=3,
                            interactive=False,
                        )
                        batch_results = gr.Dataframe(
                            label="Translation Results",
                            interactive=False,
                            wrap=True,
                        )
                # The handler returns (summary, results) in this order.
                batch_btn.click(
                    fn=secure_batch_processing,
                    inputs=file_upload,
                    outputs=[batch_summary, batch_results],
                )

            # Tab 3: metrics for arbitrary text, without translating it.
            with gr.Tab("🔬 Research Tools"):
                gr.Markdown("""
                ### Advanced Linguistic Analysis Tools
                Analyze text for linguistic features.
                """)
                with gr.Row():
                    with gr.Column():
                        research_text = gr.Textbox(
                            label="Text for Analysis",
                            lines=6,
                            placeholder="Enter Northern Sotho (Sepedi) or English text...",
                            max_lines=15,
                        )
                        analyze_btn = gr.Button("🔍 Analyze Text", variant="primary")
                    with gr.Column():
                        research_output = gr.JSON(
                            label="Detailed Analysis Results"
                        )
                analyze_btn.click(
                    fn=detailed_analysis,
                    inputs=research_text,
                    outputs=research_output,
                )
                gr.Markdown("""
                ### 🗣️ About Northern Sotho (Sepedi) Language

                **Northern Sotho (Sepedi)** is a Bantu language spoken by millions of people, primarily in:
                - 🇿🇦 **South Africa** – Official language

                **Key Linguistic Features:**
                - **Language Family**: Niger-Congo → Bantu → Sotho-Tswana
                - **Script**: Latin alphabet
                - **Characteristics**: Agglutinative, noun-class system
                - **ISO Code**: nso (ISO 639-2/3)
                """)

        # Footer shown below the tabs.
        gr.Markdown("""
        ---
        ### 📚 Model Information & Citation

        **Model Used:** [`dsfsi/nso-en-m2m100-gov`](https://huggingface.co/dsfsi/nso-en-m2m100-gov)

        Based on Meta's M2M100, fine-tuned specifically for Northern Sotho-English by the **Data Science for Social Impact Research Group**.

        **Training Data:** Vuk'uzenzele and ZA-gov-multilingual South African corpora.

        ### 🔒 Privacy & Security
        - No conversation history stored
        - Uploaded files deleted after processing
        - All processing in isolated temporary environments
        - No user data persistence

        ### 🙏 Acknowledgments
        We thank **Thapelo Sindani** and **Zion Nia Van Wyk** for their assistance in creating this space.

        ### 📖 Citation
        ```bibtex
        @inproceedings{lastrucci-etal-2023-preparing,
            title = "Preparing the Vuk'uzenzele and ZA-gov-multilingual South African multilingual corpora",
            author = "Richard Lastrucci and Isheanesu Dzingirai and Jenalea Rajab
                and Andani Madodonga and Matimba Shingange and Daniel Njini and Vukosi Marivate",
            booktitle = "Proceedings of the Fourth workshop on Resources for African Indigenous Languages (RAIL 2023)",
            pages = "18--25",
            year = "2023"
        }
        ```
        **Links**:
        - [DSFSI](https://www.dsfsi.co.za/)
        - [Model](https://huggingface.co/dsfsi/nso-en-m2m100-gov)
        - [Vuk'uzenzele Data](https://github.com/dsfsi/vukuzenzele-nlp)
        - [ZA-gov Data](https://github.com/dsfsi/gov-za-multilingual)
        - [Research Feedback](https://docs.google.com/forms/d/e/1FAIpQLSf7S36dyAUPx2egmXbFpnTBuzoRulhL5Elu-N1eoMhaO7v10w/viewform)
        ---
        **Built for the African NLP community**
        """)
    return demo
357
 
358
if __name__ == "__main__":
    # Launch settings, collected in one place.
    # NOTE(review): share=True requests a public share link and
    # server_name="0.0.0.0" listens on all interfaces -- confirm both are
    # wanted wherever this script is run.
    launch_options = {
        "share": True,
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
    }
    demo = create_gradio_interface()
    demo.launch(**launch_options)