Spaces:
Sleeping
Sleeping
File size: 5,474 Bytes
841bd40 62c79bd 841bd40 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 |
import logging
import sqlite3
import threading
import time
from queue import Queue
from typing import Optional

from deep_translator import GoogleTranslator, exceptions
from tqdm import tqdm
# Constants
DATABASE_FILE = 'gematria.db'  # Path handed to sqlite3.connect() by every function below; use your actual database file name
BATCH_SIZE = 1000  # NOTE(review): not referenced anywhere in the visible code - confirm whether it is still needed
NUM_THREADS = 10  # Number of translation_worker threads started by populate_translations()

# Set up logging: INFO and above, each record prefixed with timestamp and level
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Initialize the single translator instance used by translate_and_store().
# NOTE(review): the docstrings in this file talk about Hebrew phrases, but the
# source language code here is 'yi' (presumably Yiddish) - confirm which is intended.
translator = GoogleTranslator(source='yi', target='en')
logging.info("Translator initialized.")

# Work queue and progress bars are kept as separate objects.
# Both bars start with total=0; populate_translations() assigns the real totals.
translation_queue = Queue()  # Phrases awaiting translation; None is used as the worker stop sentinel
translation_queue_tqdm = tqdm(total=0, dynamic_ncols=True, desc="Translation Queue")  # Progress of filling the queue
total_translations_tqdm = tqdm(total=0, dynamic_ncols=True, desc="Total Translations")  # Progress of stored translations

# Lock held around every SQLite connection opened by the workers and the periodic saver
db_lock = threading.Lock()
translations_completed = 0  # Count of translations written; incremented by workers while holding db_lock
def translate_and_store(phrase: str) -> Optional[str]:
    """Translate ``phrase`` with the module-level translator, retrying on errors.

    Despite the name, nothing is stored here: the function only returns the
    translated text and leaves persistence to the caller.

    Args:
        phrase: Text to translate.

    Returns:
        The translation, or ``None`` once ``max_retries`` attempts have failed.
    """
    max_retries = 3
    # TooManyRequests (HTTP 429) is part of the set so that rate limiting is
    # retried and reported as a failed translation instead of escaping and
    # killing the calling thread.
    retryable_errors = (
        exceptions.TranslationNotFound,
        exceptions.NotValidPayload,
        exceptions.ServerException,
        exceptions.RequestError,
        exceptions.TooManyRequests,
    )
    for attempt in range(1, max_retries + 1):
        try:
            return translator.translate(phrase)
        except retryable_errors as e:
            logging.warning(f"Error translating phrase '{phrase}': {e}. Retrying... ({attempt}/{max_retries})")
            if attempt < max_retries:
                # Exponential backoff (1s, 2s, ...): an immediate retry against
                # a throttling or failing service is very likely to fail again.
                time.sleep(2 ** (attempt - 1))
    logging.error(f"Failed to translate phrase '{phrase}' after {max_retries} retries.")
    return None
def translation_worker():
    """Consume phrases from ``translation_queue`` until a ``None`` sentinel arrives.

    Each phrase is translated with ``translate_and_store``; a successful
    translation is written to every ``results`` row whose ``words`` column
    equals the phrase. Phrases that could not be translated are left untouched.

    ``task_done()`` is called for every item taken from the queue, including
    the sentinel and items whose processing raised, so a failure in one item
    can never leave ``translation_queue.join()`` waiting forever.

    The queue-filling progress bar is not updated here: populate_translations()
    closes it before any worker is started.
    """
    global translations_completed
    while True:
        phrase = translation_queue.get()
        try:
            if phrase is None:  # Sentinel value to stop the thread
                break
            translation = translate_and_store(phrase)
            if translation is None:
                # Nothing to write; skip opening a connection at all.
                continue
            # Serialize all database writes and counter updates across workers.
            with db_lock:
                # A thread-local connection: sqlite3's context manager only
                # commits/rolls back, so the connection is closed explicitly.
                connection = sqlite3.connect(DATABASE_FILE)
                try:
                    with connection:
                        connection.execute(
                            "UPDATE results SET translation = ? WHERE words = ?",
                            (translation, phrase),
                        )
                finally:
                    connection.close()
                translations_completed += 1
                total_translations_tqdm.update()
        except Exception:
            # Worker boundary: log and move on to the next phrase so that one
            # bad item cannot kill the thread and stall the whole pipeline.
            logging.exception(f"Unexpected error while processing phrase '{phrase}'")
        finally:
            translation_queue.task_done()
def populate_translations():
    """Translate every distinct untranslated phrase in the ``results`` table.

    Steps:
      1. Count and read the distinct ``words`` values whose ``translation``
         is NULL and put them on ``translation_queue``.
      2. Start ``NUM_THREADS`` ``translation_worker`` threads.
      3. Wait until the queue has been fully processed, then send one
         ``None`` sentinel per worker and join the threads.

    Both progress bars receive their totals from the count query; the queue
    bar is closed once the queue is filled, the overall bar once all workers
    have finished.
    """
    # sqlite3's connection context manager only ends the transaction, so the
    # read connection is closed explicitly as soon as the queue is filled and
    # is not held open while the workers write.
    connection = sqlite3.connect(DATABASE_FILE)
    try:
        cursor = connection.cursor()

        cursor.execute("SELECT COUNT(DISTINCT words) FROM results WHERE translation IS NULL")
        total_phrases = cursor.fetchone()[0]
        logging.info(f"Found {total_phrases} distinct phrases to translate.")

        translation_queue_tqdm.total = total_phrases
        total_translations_tqdm.total = total_phrases

        cursor.execute("SELECT DISTINCT words FROM results WHERE translation IS NULL")
        for (phrase,) in cursor:
            if phrase is None:
                # A NULL `words` value must not be enqueued: None is the
                # worker stop sentinel. COUNT(DISTINCT ...) ignores NULLs,
                # so the totals above stay accurate.
                continue
            translation_queue.put(phrase)
            translation_queue_tqdm.update()
    finally:
        connection.close()

    # The queue is fully populated; its progress bar is finished.
    translation_queue_tqdm.close()

    # Start worker threads only after the queue is built.
    threads = [threading.Thread(target=translation_worker) for _ in range(NUM_THREADS)]
    for thread in threads:
        thread.start()

    # Wait for every queued phrase to be marked done.
    translation_queue.join()

    # One sentinel per worker tells each of them to exit its loop.
    for _ in threads:
        translation_queue.put(None)
    for thread in threads:
        thread.join()

    total_translations_tqdm.close()
    logging.info("All translations completed.")
def save_translations_periodically(interval: float = 60, stop_event: Optional[threading.Event] = None):
    """Periodically open the database under ``db_lock`` and issue a commit.

    The translation workers commit their own updates, and a freshly opened
    connection has no pending changes, so this commit has nothing to flush.
    The connection is now closed after each cycle instead of being left to
    the garbage collector.

    Args:
        interval: Seconds to wait between cycles (default: 60).
        stop_event: Optional event; when it is set the function returns
            instead of starting another cycle. Without it the loop never ends.
    """
    while True:
        if stop_event is None:
            time.sleep(interval)
        elif stop_event.wait(interval):
            # wait() returns True as soon as the event is set.
            return
        logging.info("Saving translations to the database...")
        with db_lock:  # Same lock the workers hold while writing
            connection = sqlite3.connect(DATABASE_FILE)
            try:
                connection.commit()
            finally:
                connection.close()
        logging.info("Translations saved.")
if __name__ == "__main__":
    # Run the translation pipeline on its own thread.
    translation_thread = threading.Thread(target=populate_translations)
    translation_thread.start()
    # The periodic saver loops forever, so it runs as a daemon thread and
    # therefore cannot keep the process alive once the real work is done.
    save_thread = threading.Thread(target=save_translations_periodically, daemon=True)
    save_thread.start()
    # Block until all translations are finished, then let the script exit
    # (previously the main thread slept in an endless loop and the process
    # never terminated).
    translation_thread.join()