File size: 5,474 Bytes
841bd40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62c79bd
841bd40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import sqlite3
import logging
from deep_translator import GoogleTranslator, exceptions
from tqdm import tqdm
import threading
import time
from queue import Queue

# Constants
DATABASE_FILE = 'gematria.db'  # Path of the SQLite database; replace with your actual database file name
BATCH_SIZE = 1000  # NOTE(review): not referenced in the visible code — confirm it is still needed
NUM_THREADS = 10  # Number of parallel translation worker threads

# Set up logging: INFO level and above, each record prefixed with timestamp and level name
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Initialize the single translator instance that all worker threads share.
# NOTE(review): 'yi' is the Yiddish language code — confirm this is the intended
# source language for the phrases stored in the database.
translator = GoogleTranslator(source='yi', target='en')
logging.info("Translator initialized.")

# The work queue and the progress bars are separate objects: the Queue carries
# the phrases, the tqdm bars only display progress. Both bars start with
# total=0; populate_translations() sets the real totals once they are known.
translation_queue = Queue()  # Phrases waiting to be translated; None is the worker stop sentinel
translation_queue_tqdm = tqdm(total=0, dynamic_ncols=True, desc="Translation Queue")  # Progress of the queue
total_translations_tqdm = tqdm(total=0, dynamic_ncols=True, desc="Total Translations") # Progress of stored translations

# Lock that serializes database access between threads
db_lock = threading.Lock()
translations_completed = 0  # Number of translations stored so far; workers increment it while holding db_lock


def translate_and_store(phrase: str) -> "str | None":
    """Translate ``phrase`` with the module-level ``translator``, retrying on failure.

    Despite its name, this function does not store anything; persisting the
    result is left to the caller.

    Up to three attempts are made. Between attempts the function sleeps with an
    exponential backoff (1s, 2s) so that transient server or rate-limit errors
    are not answered with an immediate identical request.

    Args:
        phrase: Source text passed to ``translator.translate``.

    Returns:
        The translated text, or ``None`` if every attempt failed.
    """
    # NOTE(review): one translator instance is shared by every worker thread —
    # confirm that deep_translator's translate() is safe to call concurrently.
    max_retries = 3
    for attempt in range(1, max_retries + 1):
        try:
            return translator.translate(phrase)
        except (exceptions.TranslationNotFound, exceptions.NotValidPayload,
                exceptions.ServerException, exceptions.RequestError,
                exceptions.TooManyRequests) as e:
            # TooManyRequests (HTTP 429) is caught as well; left unhandled it
            # would propagate and terminate the calling worker thread.
            logging.warning(f"Error translating phrase '{phrase}': {e}. Retrying... ({attempt}/{max_retries})")
            if attempt < max_retries:
                time.sleep(2 ** (attempt - 1))
    logging.error(f"Failed to translate phrase '{phrase}' after {max_retries} retries.")
    return None


def translation_worker():
    """Worker thread body: translate queued phrases and store the results.

    Repeatedly takes a phrase from ``translation_queue``, translates it with
    ``translate_and_store`` and, when a translation was obtained, writes it to
    the ``results`` table. A ``None`` item is the sentinel that ends the loop.

    Every item taken from the queue (including the sentinel) is acknowledged
    with ``task_done()`` in a ``finally`` clause, so an unexpected error on one
    phrase can never leave ``translation_queue.join()`` waiting forever. Such
    errors are logged and the worker moves on to the next phrase.
    """
    global translations_completed

    while True:
        phrase = translation_queue.get()  # Blocks until an item is available
        try:
            translation_queue_tqdm.update()
            if phrase is None:  # Sentinel value to stop the thread
                break

            translation = translate_and_store(phrase)
            if translation is None:
                # Nothing to write; the failure has already been logged.
                continue

            # Serialize writers so that only one thread touches the database at a time.
            with db_lock:
                connection = sqlite3.connect(DATABASE_FILE)
                try:
                    # The connection context manager commits on success and
                    # rolls back on error, but does NOT close the connection.
                    with connection:
                        connection.execute(
                            "UPDATE results SET translation = ? WHERE words = ?",
                            (translation, phrase),
                        )
                finally:
                    connection.close()
                translations_completed += 1  # Protected by db_lock
                total_translations_tqdm.update()
        except Exception:
            # Thread boundary: log and keep the worker alive for the remaining phrases.
            logging.exception(f"Unexpected error while processing phrase '{phrase}'")
        finally:
            translation_queue.task_done()


def populate_translations():
    """Translate every distinct phrase in ``results`` that has no translation yet.

    Steps:
      1. Count and read the distinct ``words`` values whose ``translation`` is
         NULL and put them on ``translation_queue``.
      2. Start ``NUM_THREADS`` worker threads running ``translation_worker``.
      3. Wait until the queue has been fully processed, then send one ``None``
         sentinel per worker and join the threads.

    Rows whose ``words`` column is NULL are skipped: a ``None`` item on the
    queue is the workers' stop sentinel, so enqueueing such a row would shut a
    worker down prematurely.
    """
    # Read the pending phrases with a short-lived connection. sqlite3's context
    # manager does not close connections, so it is closed explicitly, and it is
    # closed before the long-running translation phase starts.
    connection = sqlite3.connect(DATABASE_FILE)
    try:
        cursor = connection.cursor()

        # COUNT(DISTINCT ...) ignores NULLs, so it matches the filtered SELECT below.
        cursor.execute("SELECT COUNT(DISTINCT words) FROM results WHERE translation IS NULL")
        total_phrases = cursor.fetchone()[0]

        logging.info(f"Found {total_phrases} distinct phrases to translate.")

        # Set the total for both tqdm progress bars
        translation_queue_tqdm.total = total_phrases
        total_translations_tqdm.total = total_phrases

        # Build the translation queue first, streaming rows straight from the cursor
        cursor.execute(
            "SELECT DISTINCT words FROM results "
            "WHERE translation IS NULL AND words IS NOT NULL"
        )
        for (phrase,) in cursor:
            translation_queue.put(phrase)
            translation_queue_tqdm.update()
    finally:
        connection.close()

    # Close the translation queue tqdm after it's fully populated
    translation_queue_tqdm.close()

    # Start worker threads AFTER the queue is built
    threads = []
    for _ in range(NUM_THREADS):
        thread = threading.Thread(target=translation_worker)
        thread.start()
        threads.append(thread)

    # Wait for all queued phrases to be processed
    translation_queue.join()

    # Stop worker threads: one sentinel per worker, then wait for them to exit
    for _ in range(NUM_THREADS):
        translation_queue.put(None)
    for thread in threads:
        thread.join()

    logging.info("All translations completed.")



def save_translations_periodically():
    """Once a minute, open the database under ``db_lock`` and issue a commit.

    Runs forever; intended to be the target of a background thread.

    The commit is issued on a freshly opened connection, which has no pending
    changes of its own. The connection is closed explicitly after each cycle,
    because sqlite3's connection context manager does not close it and one
    connection per minute would otherwise be leaked.
    """
    while True:
        time.sleep(60)  # Wait for 1 minute
        logging.info("Saving translations to the database...")
        with db_lock:  # Do not interleave with writers
            connection = sqlite3.connect(DATABASE_FILE)
            try:
                connection.commit()
            finally:
                connection.close()
        logging.info("Translations saved.")


if __name__ == "__main__":
    # Run the translation process in its own thread
    translation_thread = threading.Thread(target=populate_translations)
    translation_thread.start()

    # The periodic saver loops forever, so it runs as a daemon thread: it must
    # not keep the process alive once the translation work has finished.
    save_thread = threading.Thread(target=save_translations_periodically, daemon=True)
    save_thread.start()

    # Block until all translations are done, then let the program exit
    # (instead of sleeping in an endless loop).
    translation_thread.join()