import gradio as gr
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
import os
import torch
import spaces
from datasets import Dataset, load_dataset, concatenate_datasets
import time
import datetime
# Define model paths
MODEL_PATHS = {
"Terjman-Nano-v2": "BounharAbdelaziz/Terjman-Nano-v2.0",
"Terjman-Large-v2": "BounharAbdelaziz/Terjman-Large-v2.0",
"Terjman-Ultra-v2": "BounharAbdelaziz/Terjman-Ultra-v2.0",
"Terjman-Supreme-v2": "BounharAbdelaziz/Terjman-Supreme-v2.0"
}
# Load the Hugging Face access token from the environment
TOKEN = os.environ["TOKEN"]
# Dataset configuration
DATASET_REPO = "BounharAbdelaziz/terjman-v2-live-translations"
# Number of translations to collect before pushing
BATCH_SIZE = 10
# Time in seconds between pushes (1 hour)
UPDATE_INTERVAL = 3600
# Initialize dataset tracking
translations_buffer = []
last_push_time = time.time()
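# Buffered translations are flushed to the Hub whenever the buffer reaches
# BATCH_SIZE entries or UPDATE_INTERVAL seconds have passed since the last
# push, whichever happens first (see translate_text)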
def preload_models():
""" Preload models and tokenizers """
device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(f"[INFO] Using device: {device}")
# Load Nano and Large models
nano_large_models = {}
for model_name in ["Terjman-Nano-v2", "Terjman-Large-v2"]:
print(f"[INFO] Loading {model_name}...")
translator = pipeline(
"translation",
model=MODEL_PATHS[model_name],
token=TOKEN,
device=device if device.startswith("cuda") else -1
)
nano_large_models[model_name] = translator
# Load Ultra and Supreme models
ultra_supreme_models = {}
for model_name in ["Terjman-Ultra-v2", "Terjman-Supreme-v2"]:
print(f"[INFO] Loading {model_name}...")
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATHS[model_name], token=TOKEN).to(device)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATHS[model_name], token=TOKEN)
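        # These appear to be NLLB-style seq2seq models, so the pipeline needs
        # explicit FLORES-200 language codes: eng_Latn (English) -> ary_Arab (Moroccan Darija)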
translator = pipeline(
"translation",
model=model,
tokenizer=tokenizer,
device=device if device.startswith("cuda") else -1,
src_lang="eng_Latn",
tgt_lang="ary_Arab"
)
ultra_supreme_models[model_name] = translator
return nano_large_models, ultra_supreme_models
def push_to_hf_dataset():
""" Save translations in HF dataset for monitoring, preserving previous data """
global translations_buffer, last_push_time
if not translations_buffer:
return
try:
print(f"[INFO] Pushing {len(translations_buffer)} translations to Hugging Face dataset...")
# Create dataset from buffer
new_data = Dataset.from_dict({
"source_text": [item["source_text"] for item in translations_buffer],
"translated_text": [item["translated_text"] for item in translations_buffer],
"model_used": [item["model_used"] for item in translations_buffer],
"timestamp": [item["timestamp"] for item in translations_buffer],
"user_id": [item["user_id"] for item in translations_buffer] # Include user ID
})
# Try to load existing dataset
try:
existing_dataset = load_dataset(DATASET_REPO, split="live_translations", token=TOKEN)
print(f"[INFO] Loaded existing dataset with {len(existing_dataset)} entries")
# Concatenate existing data with new data
combined_dataset = concatenate_datasets([existing_dataset, new_data])
print(f"[INFO] Combined dataset now has {len(combined_dataset)} entries")
except Exception as e:
print(f"[INFO] No existing dataset found or error loading: {str(e)}")
print(f"[INFO] Creating new dataset")
combined_dataset = new_data
# Push to hub
combined_dataset.push_to_hub(
DATASET_REPO,
token=TOKEN,
split="live_translations",
private=True,
)
# Clear buffer and reset timer
translations_buffer = []
last_push_time = time.time()
print("[INFO] Successfully pushed translations to Hugging Face dataset")
except Exception as e:
print(f"[ERROR] Failed to push dataset to Hugging Face: {str(e)}")
@spaces.GPU
def translate_nano_large(text, model_name):
""" Translation function for Nano and Large models """
translator = nano_large_models[model_name]
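    # Beam-search generation with 3-gram repetition blocking; the special
    # token ids are passed explicitly so decoding terminates cleanly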
translated = translator(
text,
max_length=512,
num_beams=4,
no_repeat_ngram_size=3,
early_stopping=True,
do_sample=False,
pad_token_id=translator.tokenizer.pad_token_id,
bos_token_id=translator.tokenizer.bos_token_id,
eos_token_id=translator.tokenizer.eos_token_id,
)
return translated[0]["translation_text"]
@spaces.GPU
def translate_ultra_supreme(text, model_name):
""" Translation function for Ultra and Supreme models """
translator = ultra_supreme_models[model_name]
translation = translator(text)[0]['translation_text']
return translation
def translate_text(text, model_choice, request: gr.Request):
""" Main translation function """
global translations_buffer, last_push_time
# Skip empty text
    if not text or not text.strip():
return "Please enter text to translate."
# Get the user ID (if logged in)
user_id = "anonymous"
if request and hasattr(request, "username") and request.username:
user_id = request.username
# Perform translation
if model_choice in ["Terjman-Nano-v2", "Terjman-Large-v2"]:
translation = translate_nano_large(text, model_choice)
elif model_choice in ["Terjman-Ultra-v2", "Terjman-Supreme-v2"]:
translation = translate_ultra_supreme(text, model_choice)
else:
return "Invalid model selection."
# Add to buffer
translations_buffer.append({
"source_text": text,
"translated_text": translation,
"model_used": model_choice,
"timestamp": datetime.datetime.now().isoformat(),
"user_id": user_id # Add the user ID to the dataset
})
# Check if it's time to push to HF
current_time = time.time()
if len(translations_buffer) >= BATCH_SIZE or (current_time - last_push_time) >= UPDATE_INTERVAL:
push_to_hf_dataset()
return translation
def gradio_app():
with gr.Blocks() as app:
gr.Markdown("# 🇲🇦 Terjman-v2")
gr.Markdown("Choose a model and enter the English text you want to translate to Moroccan Darija.")
model_choice = gr.Dropdown(
label="Select Model",
choices=["Terjman-Nano-v2", "Terjman-Large-v2", "Terjman-Ultra-v2", "Terjman-Supreme-v2"],
value="Terjman-Ultra-v2"
)
input_text = gr.Textbox(label="Input Text", placeholder="Enter text to translate...", lines=3)
output_text = gr.Textbox(label="Translated Text", interactive=False, lines=3)
translate_button = gr.Button("Translate")
# Link input and output
        def translate_and_update_status(text, model, request: gr.Request):
            """Wrapper that forwards the injected Gradio request to translate_text."""
            # Gradio injects the live request when it is declared as a typed
            # parameter; calling gr.Request() manually would create an empty
            # object with no user information
            return translate_text(text, model, request)
translate_button.click(
fn=translate_and_update_status,
inputs=[input_text, model_choice],
outputs=[output_text]
)
return app
# Run the app
if __name__ == "__main__":
# Register shutdown handler to save remaining translations
import atexit
atexit.register(push_to_hf_dataset)
    # Preload all models; the two pipeline dicts are module-level globals that
    # the translate functions look up at call time
nano_large_models, ultra_supreme_models = preload_models()
# Launch the app
app = gradio_app()
app.launch() |
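
# To run locally (assumes a TOKEN env var holding a Hugging Face token with
# read access to the model repos and write access to DATASET_REPO):
#   TOKEN=hf_... python app.py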