import gradio as gr
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
import os
import torch
import spaces
from datasets import Dataset, load_dataset, concatenate_datasets
import time
import datetime

# Define model paths
MODEL_PATHS = {
    "Terjman-Nano-v2": "BounharAbdelaziz/Terjman-Nano-v2.0",
    "Terjman-Large-v2": "BounharAbdelaziz/Terjman-Large-v2.0",
    "Terjman-Ultra-v2": "BounharAbdelaziz/Terjman-Ultra-v2.0",
    "Terjman-Supreme-v2": "BounharAbdelaziz/Terjman-Supreme-v2.0"
}

# Load environment tokens (raises KeyError if the TOKEN secret is unset)
TOKEN = os.environ['TOKEN']

# Dataset configuration
DATASET_REPO = "BounharAbdelaziz/terjman-v2-live-translations"
# Number of translations to collect before pushing
BATCH_SIZE = 10  
# Time in seconds between pushes (1 hour)
UPDATE_INTERVAL = 3600

# Initialize dataset tracking
translations_buffer = []
last_push_time = time.time()
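# The buffer is flushed to the Hub by push_to_hf_dataset() once BATCH_SIZE
# entries are queued or UPDATE_INTERVAL has elapsed, with a final flush at
# shutdown via the atexit hook registered in __main__.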

def preload_models():
    """ Preload models and tokenizers """
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    print(f"[INFO] Using device: {device}")
    
    # Load Nano and Large models
    nano_large_models = {}
    for model_name in ["Terjman-Nano-v2", "Terjman-Large-v2"]:
        print(f"[INFO] Loading {model_name}...")
        translator = pipeline(
            "translation",
            model=MODEL_PATHS[model_name],
            token=TOKEN,
            device=device if device.startswith("cuda") else -1
        )
        nano_large_models[model_name] = translator
    
    # Load Ultra and Supreme models (multilingual seq2seq checkpoints, so the
    # pipeline needs explicit source and target language codes)
    ultra_supreme_models = {}
    for model_name in ["Terjman-Ultra-v2", "Terjman-Supreme-v2"]:
        print(f"[INFO] Loading {model_name}...")
        model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATHS[model_name], token=TOKEN).to(device)
        tokenizer = AutoTokenizer.from_pretrained(MODEL_PATHS[model_name], token=TOKEN)
        translator = pipeline(
            "translation",
            model=model,
            tokenizer=tokenizer,
            device=device if device.startswith("cuda") else -1,
            src_lang="eng_Latn",  # FLORES-200 code: English (Latin script)
            tgt_lang="ary_Arab"   # FLORES-200 code: Moroccan Arabic / Darija (Arabic script)
        )
        ultra_supreme_models[model_name] = translator
    
    return nano_large_models, ultra_supreme_models

def push_to_hf_dataset():
    """ Save translations in HF dataset for monitoring, preserving previous data """
    global translations_buffer, last_push_time
    
    if not translations_buffer:
        return
    
    try:
        print(f"[INFO] Pushing {len(translations_buffer)} translations to Hugging Face dataset...")
        
        # Create dataset from buffer
        new_data = Dataset.from_dict({
            "source_text": [item["source_text"] for item in translations_buffer],
            "translated_text": [item["translated_text"] for item in translations_buffer],
            "model_used": [item["model_used"] for item in translations_buffer],
            "timestamp": [item["timestamp"] for item in translations_buffer],
            "user_id": [item["user_id"] for item in translations_buffer]  # Include user ID
        })
        
        # Try to load existing dataset
        try:
            existing_dataset = load_dataset(DATASET_REPO, split="live_translations", token=TOKEN)
            print(f"[INFO] Loaded existing dataset with {len(existing_dataset)} entries")
            
            # Concatenate existing data with new data
            combined_dataset = concatenate_datasets([existing_dataset, new_data])
            print(f"[INFO] Combined dataset now has {len(combined_dataset)} entries")
        except Exception as e:
            print(f"[INFO] No existing dataset found or error loading: {str(e)}")
            print(f"[INFO] Creating new dataset")
            combined_dataset = new_data
        
        # Push to hub
        combined_dataset.push_to_hub(
            DATASET_REPO, 
            token=TOKEN,
            split="live_translations",
            private=True,
        )
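        # Note: each push re-uploads the full combined split (existing rows
        # plus the new batch), which keeps the dataset self-contained on the Hub.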
        
        # Clear buffer and reset timer
        translations_buffer = []
        last_push_time = time.time()
        print("[INFO] Successfully pushed translations to Hugging Face dataset")
    
    except Exception as e:
        print(f"[ERROR] Failed to push dataset to Hugging Face: {str(e)}")
        
# Requests GPU hardware for the duration of the call on Hugging Face ZeroGPU Spaces
@spaces.GPU
def translate_nano_large(text, model_name):
    """ Translation function for Nano and Large models """
    translator = nano_large_models[model_name]
    translated = translator(
        text,
        max_length=512,
        num_beams=4,
        no_repeat_ngram_size=3,
        early_stopping=True,
        do_sample=False,
        pad_token_id=translator.tokenizer.pad_token_id,
        bos_token_id=translator.tokenizer.bos_token_id,
        eos_token_id=translator.tokenizer.eos_token_id,
    )
    return translated[0]["translation_text"]

@spaces.GPU
def translate_ultra_supreme(text, model_name):
    """ Translation function for Ultra and Supreme models """
    translator = ultra_supreme_models[model_name]
    translation = translator(text)[0]['translation_text']
    return translation

def translate_text(text, model_choice, request: gr.Request):
    """ Main translation function """
    global translations_buffer, last_push_time
    
    # Skip empty text
    if not text or text.strip() == "":
        return "Please enter text to translate."
    
    # Get the user ID (if logged in)
    user_id = "anonymous"
    if request and hasattr(request, "username") and request.username:
        user_id = request.username
    
    # Perform translation
    if model_choice in ["Terjman-Nano-v2", "Terjman-Large-v2"]:
        translation = translate_nano_large(text, model_choice)
    elif model_choice in ["Terjman-Ultra-v2", "Terjman-Supreme-v2"]:
        translation = translate_ultra_supreme(text, model_choice)
    else:
        return "Invalid model selection."
    
    # Add to buffer
    translations_buffer.append({
        "source_text": text,
        "translated_text": translation,
        "model_used": model_choice,
        "timestamp": datetime.datetime.now().isoformat(),
        "user_id": user_id  # Add the user ID to the dataset
    })
    
    # Check if it's time to push to HF
    current_time = time.time()
    if len(translations_buffer) >= BATCH_SIZE or (current_time - last_push_time) >= UPDATE_INTERVAL:
        push_to_hf_dataset()
    
    return translation

def gradio_app():
    with gr.Blocks() as app:
        gr.Markdown("# 🇲🇦 Terjman-v2")
        gr.Markdown("Choose a model and enter the English text you want to translate to Moroccan Darija.")
        
        model_choice = gr.Dropdown(
            label="Select Model",
            choices=["Terjman-Nano-v2", "Terjman-Large-v2", "Terjman-Ultra-v2", "Terjman-Supreme-v2"],
            value="Terjman-Ultra-v2"
        )
        
        input_text = gr.Textbox(label="Input Text", placeholder="Enter text to translate...", lines=3)
        output_text = gr.Textbox(label="Translated Text", interactive=False, lines=3)
        
        translate_button = gr.Button("Translate")
        
        # Link input and output. Gradio injects the request when a handler
        # parameter is type-hinted as gr.Request; constructing gr.Request()
        # manually would yield an empty request with no username.
        def translate_and_update_status(text, model, request: gr.Request):
            """Wrapper function to handle translation and update status."""
            translation = translate_text(text, model, request)
            return translation
        
        translate_button.click(
            fn=translate_and_update_status,
            inputs=[input_text, model_choice],
            outputs=[output_text]
        )
    
    return app

# Run the app
if __name__ == "__main__":
    # Register shutdown handler to save remaining translations
    import atexit
    atexit.register(push_to_hf_dataset)

    # Preload all models
    nano_large_models, ultra_supreme_models = preload_models()
    
    # Launch the app
    app = gradio_app()
    app.launch()
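
# Local usage (a sketch, assuming a GPU machine or CPU fallback):
#   export TOKEN=<hf_token>   # needs read access to the models and write access to DATASET_REPO
#   python app.py
# The @spaces.GPU decorator is designed to be a no-op outside ZeroGPU Spaces.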