BounharAbdelaziz commited on
Commit
9d7e270
·
verified ·
1 Parent(s): 6a5799f

fix data loss

Browse files
Files changed (1) hide show
  1. app.py +20 -10
app.py CHANGED
@@ -3,7 +3,7 @@ from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
3
  import os
4
  import torch
5
  import spaces
6
- from datasets import Dataset
7
  import time
8
  import datetime
9
 
@@ -65,7 +65,7 @@ def preload_models():
65
  return nano_large_models, ultra_supreme_models
66
 
67
  def push_to_hf_dataset():
68
- """ Save translations in HF dataset for monitoring """
69
  global translations_buffer, last_push_time
70
 
71
  if not translations_buffer:
@@ -75,18 +75,31 @@ def push_to_hf_dataset():
75
  print(f"[INFO] Pushing {len(translations_buffer)} translations to Hugging Face dataset...")
76
 
77
  # Create dataset from buffer
78
- ds = Dataset.from_dict({
79
  "source_text": [item["source_text"] for item in translations_buffer],
80
  "translated_text": [item["translated_text"] for item in translations_buffer],
81
  "model_used": [item["model_used"] for item in translations_buffer],
82
  "timestamp": [item["timestamp"] for item in translations_buffer]
83
  })
84
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  # Push to hub
86
- ds.push_to_hub(
87
  DATASET_REPO,
88
  token=TOKEN,
89
- split=f"live_translations",
90
  private=True,
91
  )
92
 
@@ -169,18 +182,15 @@ def gradio_app():
169
 
170
  translate_button = gr.Button("Translate")
171
 
172
- # Status message -> hidden
173
- # status = gr.Markdown(f"Translations in buffer: 0")
174
-
175
  # Link input and output
176
  def translate_and_update_status(text, model):
177
  translation = translate_text(text, model)
178
- return translation #, f"Translations in buffer: {len(translations_buffer)} (Will push when reaching {BATCH_SIZE} or after {UPDATE_INTERVAL/3600} hours)"
179
 
180
  translate_button.click(
181
  fn=translate_and_update_status,
182
  inputs=[input_text, model_choice],
183
- outputs=[output_text] #, status] Status message -> hidden
184
  )
185
 
186
  return app
 
3
  import os
4
  import torch
5
  import spaces
6
+ from datasets import Dataset, load_dataset, concatenate_datasets
7
  import time
8
  import datetime
9
 
 
65
  return nano_large_models, ultra_supreme_models
66
 
67
def push_to_hf_dataset():
    """Push buffered translations to the HF Hub dataset, preserving previous data.

    Reads the module-level ``translations_buffer`` (list of dicts with keys
    ``source_text``, ``translated_text``, ``model_used``, ``timestamp``),
    appends its rows to the existing ``live_translations`` split of
    ``DATASET_REPO`` (or creates the split if none exists), and pushes the
    combined dataset back to the Hub as a private repo.
    """
    global translations_buffer, last_push_time

    # Nothing buffered since the last push — skip the network round-trip.
    if not translations_buffer:
        return

    print(f"[INFO] Pushing {len(translations_buffer)} translations to Hugging Face dataset...")

    # Create dataset from buffer
    new_data = Dataset.from_dict({
        "source_text": [item["source_text"] for item in translations_buffer],
        "translated_text": [item["translated_text"] for item in translations_buffer],
        "model_used": [item["model_used"] for item in translations_buffer],
        "timestamp": [item["timestamp"] for item in translations_buffer]
    })

    # Try to load the existing split so previously pushed rows are kept.
    try:
        existing_dataset = load_dataset(DATASET_REPO, split="live_translations", token=TOKEN)
        print(f"[INFO] Loaded existing dataset with {len(existing_dataset)} entries")

        # BUG FIX: `Dataset` objects have no `.concatenate()` method, so the
        # previous `existing_dataset.concatenate(new_data)` always raised
        # AttributeError, fell into the except branch below, and overwrote the
        # hub split with only the new rows — the very data loss this function
        # is meant to prevent. Use the module-level `concatenate_datasets`.
        combined_dataset = concatenate_datasets([existing_dataset, new_data])
        print(f"[INFO] Combined dataset now has {len(combined_dataset)} entries")
    except Exception as e:
        # First push (split doesn't exist yet) or a transient load failure:
        # fall back to pushing only the new rows rather than crashing.
        print(f"[INFO] No existing dataset found or error loading: {str(e)}")
        print(f"[INFO] Creating new dataset")
        combined_dataset = new_data

    # Push to hub
    combined_dataset.push_to_hub(
        DATASET_REPO,
        token=TOKEN,
        split="live_translations",
        private=True,
    )
105
 
 
182
 
183
  translate_button = gr.Button("Translate")
184
 
 
 
 
185
  # Link input and output
186
def translate_and_update_status(text, model):
    """Translate *text* with the selected *model* and return the result for the UI."""
    return translate_text(text, model)
189
 
190
  translate_button.click(
191
  fn=translate_and_update_status,
192
  inputs=[input_text, model_choice],
193
+ outputs=[output_text]
194
  )
195
 
196
  return app