neuralworm committed on
Commit
841bd40
1 Parent(s): 409d42b

translation cache

Browse files
Files changed (3) hide show
  1. app.py +24 -8
  2. gematria.db +2 -2
  3. populate_translations.py +144 -0
app.py CHANGED
@@ -151,14 +151,30 @@ def populate_database(start_book: int, end_book: int, max_phrase_length: int = 1
151
 
152
 
153
  def get_translation(phrase: str) -> str:
154
- """Retrieves or generates the English translation of a Hebrew phrase."""
155
- global translator, translation_cache
156
- if phrase in translation_cache:
157
- return translation_cache[phrase]
158
- else:
159
- translation = translate_and_store(phrase)
160
- translation_cache[phrase] = translation
161
- return translation
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
 
163
 
164
  def translate_and_store(phrase: str) -> str:
 
151
 
152
 
153
  def get_translation(phrase: str) -> str:
154
+ """Retrieves or generates the English translation of a Hebrew phrase
155
+ and caches it in the database.
156
+ """
157
+ global conn, translator, translation_cache
158
+
159
+ # Check if the translation exists in the database
160
+ with sqlite3.connect(DATABASE_FILE) as conn:
161
+ cursor = conn.cursor()
162
+ cursor.execute("SELECT translation FROM results WHERE words = ? LIMIT 1", (phrase,))
163
+ result = cursor.fetchone()
164
+ if result and result[0]: # If a translation exists, use it
165
+ return result[0]
166
+
167
+ # If no translation in the database, translate and store it
168
+ translation = translate_and_store(phrase)
169
+ translation_cache[phrase] = translation
170
+
171
+ # Update the database with the new translation
172
+ with sqlite3.connect(DATABASE_FILE) as conn:
173
+ cursor = conn.cursor()
174
+ cursor.execute("UPDATE results SET translation = ? WHERE words = ?", (translation, phrase))
175
+ conn.commit()
176
+
177
+ return translation
178
 
179
 
180
  def translate_and_store(phrase: str) -> str:
gematria.db CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7f2cbde78ee764e0b28fa7aee9862781c91521d62b99a9a344bdf4cff5237556
3
- size 417710080
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:351f6765697cb8b41f09edcfba403ebded30b225ca931d60941964d1b6ab5f1d
3
+ size 418160640
populate_translations.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sqlite3
2
+ import logging
3
+ from deep_translator import GoogleTranslator, exceptions
4
+ from tqdm import tqdm
5
+ import threading
6
+ import time
7
+ from queue import Queue
8
+
9
# --- Configuration -----------------------------------------------------------
DATABASE_FILE = "gematria.db"  # SQLite file containing the `results` table
BATCH_SIZE = 1000  # not referenced by the code visible in this file
NUM_THREADS = 10  # how many translation worker threads are started

# --- Logging -----------------------------------------------------------------
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)

# --- Translator (Hebrew 'iw' -> English 'en') --------------------------------
translator = GoogleTranslator(source="iw", target="en")
logging.info("Translator initialized.")

# --- Work queue and its progress bars ----------------------------------------
# The queue carries the phrases; the two bars only report progress. Their
# totals start at 0 and are set once the number of phrases is known.
translation_queue = Queue()
translation_queue_tqdm = tqdm(
    total=0,
    dynamic_ncols=True,
    desc="Translation Queue",
)
total_translations_tqdm = tqdm(
    total=0,
    dynamic_ncols=True,
    desc="Total Translations",
)

# --- State shared between threads --------------------------------------------
db_lock = threading.Lock()  # serialises database access across threads
translations_completed = 0  # running count of stored translations
29
+
30
+
31
def translate_and_store(phrase: str) -> "str | None":
    """Translate a Hebrew phrase to English, retrying on translator errors.

    Despite its name this function stores nothing; persisting the result is
    left to the caller.

    Args:
        phrase: Text handed to the module-level ``translator``.

    Returns:
        The translated text, or ``None`` once every attempt has failed.
    """
    max_retries = 3
    for attempt in range(1, max_retries + 1):
        try:
            return translator.translate(phrase)
        except (exceptions.TranslationNotFound, exceptions.NotValidPayload,
                exceptions.ServerException, exceptions.RequestError) as e:
            logging.warning(
                "Error translating phrase '%s': %s. Retrying... (%d/%d)",
                phrase, e, attempt, max_retries,
            )
            if attempt < max_retries:
                # Pause briefly (1s, then 2s) instead of re-sending the same
                # request immediately after a failure.
                time.sleep(2 ** (attempt - 1))

    logging.error(
        "Failed to translate phrase '%s' after %d retries.",
        phrase, max_retries,
    )
    return None
46
+
47
+
48
def translation_worker():
    """Consume phrases from ``translation_queue`` and store their translations.

    Runs until it receives the ``None`` sentinel. For every phrase it calls
    ``translate_and_store`` and, when a translation was produced, writes it to
    all ``results`` rows whose ``words`` equals the phrase.

    ``task_done()`` is called for every item taken from the queue, including
    the sentinel and items whose processing raised, so that
    ``translation_queue.join()`` can never block on a failed item.
    """
    global translations_completed

    from contextlib import closing

    while True:
        phrase = translation_queue.get()
        try:
            if phrase is None:  # sentinel value: stop this worker
                break

            translation = translate_and_store(phrase)
            if translation is None:
                # Translation failed; there is nothing to write.
                continue

            # Serialise writers so concurrent threads do not contend for the
            # database file.
            with db_lock:
                # closing(): sqlite3's connection context manager commits but
                # does not close, which would leak one connection per phrase.
                with closing(sqlite3.connect(DATABASE_FILE)) as db:
                    with db:  # commit on success, roll back on error
                        db.execute(
                            "UPDATE results SET translation = ? WHERE words = ?",
                            (translation, phrase),
                        )
                translations_completed += 1  # guarded by db_lock
                total_translations_tqdm.update()
        except Exception:
            # Thread top-level boundary: log and carry on with the next phrase
            # rather than letting one bad item kill the worker.
            logging.exception("Unexpected error while processing phrase %r", phrase)
        finally:
            translation_queue.task_done()
71
+
72
+
73
def populate_translations():
    """Translate every distinct phrase in ``results`` that has no translation.

    Steps:
      1. Count and load the distinct ``words`` values whose ``translation`` is
         NULL, pushing each onto ``translation_queue``.
      2. Start ``NUM_THREADS`` ``translation_worker`` threads.
      3. Wait for the queue to drain, then send one ``None`` sentinel per
         worker and join the threads.
    """
    from contextlib import closing

    # closing(): sqlite3's connection context manager does not close the
    # connection on exit. A local name avoids rebinding any global ``conn``.
    with closing(sqlite3.connect(DATABASE_FILE)) as db:
        total_phrases = db.execute(
            "SELECT COUNT(DISTINCT words) FROM results WHERE translation IS NULL"
        ).fetchone()[0]
        logging.info("Found %d distinct phrases to translate.", total_phrases)

        # Both progress bars share the same total.
        translation_queue_tqdm.total = total_phrases
        total_translations_tqdm.total = total_phrases

        # Build the whole queue before any worker starts.
        rows = db.execute(
            "SELECT DISTINCT words FROM results WHERE translation IS NULL"
        )
        for (phrase,) in rows:
            if phrase is None:
                # COUNT(DISTINCT ...) ignores NULL but SELECT DISTINCT can
                # return it; a None item would be mistaken for the stop
                # sentinel by the workers.
                continue
            translation_queue.put(phrase)
            translation_queue_tqdm.update()

    # The queue is fully populated; its progress bar is finished.
    translation_queue_tqdm.close()

    # Start worker threads AFTER the queue is built.
    threads = [
        threading.Thread(target=translation_worker) for _ in range(NUM_THREADS)
    ]
    for worker in threads:
        worker.start()

    # Wait until every queued phrase has been marked done.
    translation_queue.join()

    # One sentinel per worker, then wait for them to exit.
    for _ in threads:
        translation_queue.put(None)
    for worker in threads:
        worker.join()

    logging.info("All translations completed.")
119
+
120
+
121
+
122
def save_translations_periodically(stop_event=None, interval=60):
    """Periodically open the database under ``db_lock`` and commit.

    NOTE(review): each iteration commits on a freshly opened connection, which
    has no pending changes of its own; confirm this loop is still needed.

    Args:
        stop_event: Optional ``threading.Event``-like object. When given, the
            loop ends as soon as the event is set. When ``None`` (the
            default) the loop runs forever, as before.
        interval: Seconds to wait between commits (default 60).
    """
    from contextlib import closing

    while True:
        if stop_event is None:
            time.sleep(interval)
        elif stop_event.wait(interval):
            # Event was set while waiting (or beforehand): stop cleanly.
            return

        logging.info("Saving translations to the database...")
        with db_lock:  # do not interleave with worker writes
            # closing(): sqlite3's connection context manager does not close
            # the connection, which would leak one connection per interval.
            with closing(sqlite3.connect(DATABASE_FILE)) as db:
                db.commit()
        logging.info("Translations saved.")
131
+
132
+
133
if __name__ == "__main__":
    # Run the translation job in its own thread.
    translation_thread = threading.Thread(target=populate_translations)
    translation_thread.start()

    # The periodic saver loops indefinitely, so it is started as a daemon
    # thread: it must not keep the process alive once the job is finished.
    save_thread = threading.Thread(
        target=save_translations_periodically,
        daemon=True,
    )
    save_thread.start()

    # Wait for the translation job instead of sleeping forever, so the script
    # exits when all translations are done.
    translation_thread.join()