# Page residue from the hosting site ("Spaces: Running / Running"); not part of the program.
import html
import json
import logging
import re
import sqlite3
from collections import defaultdict
from typing import Dict, Iterator, List, Optional, Tuple
from urllib.parse import quote_plus

import gradio as gr
from deep_translator import GoogleTranslator, exceptions
from tqdm import tqdm  # Import tqdm for progress bars

from gematria import calculate_gematria
from util import process_json_files
# Constants
DATABASE_FILE = 'gematria.db'
# Longest phrase, in words, that gets indexed; run_app() also uses it as the
# upper bound of the "Max Word Count" input.
MAX_PHRASE_LENGTH_LIMIT = 20
BATCH_SIZE = 1000  # Number of phrases buffered before each bulk insert

# Set up logging
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(filename)s - %(lineno)d - %(message)s',
)

# Global variables
# Both handles stay None until initialize_database() / initialize_translator()
# have run.
conn: Optional[sqlite3.Connection] = None
translator: Optional[GoogleTranslator] = None
book_names: Dict[int, str] = {}  # book id -> title, filled by populate_database()
# (gematria value, max words) -> rows of (words, book, chapter, verse)
gematria_cache: Dict[Tuple[int, int], List[Tuple[str, str, int, int]]] = {}
translation_cache: Dict[str, str] = {}  # Hebrew phrase -> English translation
def initialize_database() -> None:
    """Open the shared SQLite connection and make sure the schema exists.

    Rebinds the module-level ``conn`` to a connection on ``DATABASE_FILE``
    and creates the ``results``, ``processed_books`` and ``translations``
    tables when they are not there yet.
    """
    global conn
    # isolation_level=None switches the sqlite3 module to autocommit mode.
    conn = sqlite3.connect(DATABASE_FILE, isolation_level=None)

    # One CREATE statement per table; each is a no-op when the table exists.
    schema_statements = (
        '''
        CREATE TABLE IF NOT EXISTS results (
            gematria_sum INTEGER,
            words TEXT,
            translation TEXT,
            book TEXT,
            chapter INTEGER,
            verse INTEGER,
            PRIMARY KEY (gematria_sum, words, book, chapter, verse)
        )
        ''',
        '''
        CREATE TABLE IF NOT EXISTS processed_books (
            book TEXT PRIMARY KEY,
            max_phrase_length INTEGER
        )
        ''',
        '''
        CREATE TABLE IF NOT EXISTS translations (
            hebrew_phrase TEXT PRIMARY KEY,
            english_translation TEXT
        )
        ''',
    )
    db_cursor = conn.cursor()
    for ddl in schema_statements:
        db_cursor.execute(ddl)
def initialize_translator() -> None:
    """Create the shared translator and bind it to the global ``translator``.

    The translator is configured with source language code 'iw' and target
    language code 'en'.
    """
    global translator
    language_pair = {'source': 'iw', 'target': 'en'}
    translator = GoogleTranslator(**language_pair)
    logging.info("Translator initialized.")
# Compiled once: the clean-up below runs for every verse of every book.
_BRACKETED_TEXT = re.compile(r'\[.*?\]')
_NON_HEBREW = re.compile(r"[^\u05D0-\u05EA ]+")


def _hebrew_words(verse_text: str) -> List[str]:
    """Return the words of *verse_text* after dropping bracketed text and non-Hebrew characters."""
    without_brackets = _BRACKETED_TEXT.sub('', verse_text)
    # Only the letters U+05D0..U+05EA and spaces survive the second pass, so
    # str.split() is enough to collapse any runs of spaces.
    return _NON_HEBREW.sub('', without_brackets).split()


def populate_database(start_book: int, end_book: int, max_phrase_length: int = 1) -> Iterator[Tuple[int, str, str, int, int]]:
    """Generate every phrase of the requested books together with its Gematria value.

    This is a generator: it does not write phrases itself. For each book id in
    ``start_book..end_book`` (inclusive) it loads the book through
    ``process_json_files``, records the title in ``book_names`` and yields one
    tuple per run of 1 to ``max_phrase_length`` consecutive words in a verse.

    A book is skipped when it is not in the loaded data, when its 'title' or
    'text' field is missing or has the wrong type, or when ``processed_books``
    already lists it with a ``max_phrase_length`` at least as large as the one
    requested.

    Once the last phrase of a book has been yielded and the generator is
    resumed, the book is written to ``processed_books``. The consumer is
    therefore responsible for persisting everything it has received.

    Args:
        start_book: First book id to process.
        end_book: Last book id to process (inclusive).
        max_phrase_length: Largest number of words per yielded phrase.

    Yields:
        ``(gematria_sum, phrase, book_title, chapter, verse)`` with chapter
        and verse numbered from 1.
    """
    global conn, book_names
    logging.info(f"Populating database with books from {start_book} to {end_book}...")
    cursor = conn.cursor()

    for book_id in tqdm(range(start_book, end_book + 1), desc="Processing Books"):
        book_data = process_json_files(book_id, book_id)  # Get data for the single book

        # process_json_files returns a dictionary with book_id as key; a book
        # that is absent from it cannot be processed.
        if book_id not in book_data:
            logging.warning(f"Skipping book {book_id}: not present in the loaded data.")
            continue
        book_data = book_data[book_id]

        if 'title' not in book_data or not isinstance(book_data['title'], str):
            logging.warning(f"Skipping book {book_id} due to missing or invalid 'title' field.")
            continue
        title = book_data['title']
        book_names[book_id] = title

        # Check if the book is already processed for this max_phrase_length
        cursor.execute('''SELECT max_phrase_length FROM processed_books WHERE book = ?''', (title,))
        result = cursor.fetchone()
        if result and result[0] >= max_phrase_length:
            logging.info(f"Skipping book {title}: Already processed with max_phrase_length {result[0]}")
            continue

        logging.info(f"Processing book {title} with max_phrase_length {max_phrase_length}")
        if 'text' not in book_data or not isinstance(book_data['text'], list):
            logging.warning(f"Skipping book {book_id} due to missing or invalid 'text' field.")
            continue

        for chapter_number, chapter in enumerate(book_data['text'], start=1):
            for verse_number, verse in enumerate(chapter, start=1):
                words = _hebrew_words(flatten_text(verse))
                for length in range(1, max_phrase_length + 1):
                    for start in range(len(words) - length + 1):
                        phrase_words = words[start:start + length]
                        # The value is computed on the letters only; the
                        # stored phrase keeps single spaces between words.
                        yield (
                            calculate_gematria("".join(phrase_words)),
                            " ".join(phrase_words),
                            title,
                            chapter_number,
                            verse_number,
                        )

        # Mark the book as processed with the current max_phrase_length
        cursor.execute('''
            INSERT OR REPLACE INTO processed_books (book, max_phrase_length)
            VALUES (?, ?)
        ''', (title, max_phrase_length))
def insert_phrases_to_db(phrases: List[Tuple[int, str, str, int, int]]) -> None:
    """Insert a batch of phrases into the ``results`` table in one transaction.

    Rows whose primary key already exists are ignored.

    Args:
        phrases: Rows of ``(gematria_sum, words, book, chapter, verse)``.
            An empty list is a no-op.
    """
    global conn
    if not phrases:
        return

    # initialize_database() opens the connection with isolation_level=None
    # (autocommit). In that mode every row of executemany() would be committed
    # on its own and commit() would have nothing to do, so open the
    # transaction explicitly unless one is already running.
    started_here = not conn.in_transaction
    if started_here:
        conn.execute('BEGIN')
    try:
        conn.executemany('''
            INSERT OR IGNORE INTO results (gematria_sum, words, book, chapter, verse)
            VALUES (?, ?, ?, ?, ?)
        ''', phrases)
    except Exception:
        # Only undo work this call started; an outer transaction belongs to
        # whoever opened it.
        if started_here:
            conn.rollback()
        raise
    conn.commit()
def get_translation(phrase: str) -> str:
    """Return the English translation of a Hebrew phrase, using caches where possible.

    Lookup order: the in-memory ``translation_cache``, then the
    ``translations`` table, then ``translate_and_store``. A fresh translation
    is written to both caches. The "[Translation Error]" placeholder returned
    by ``translate_and_store`` is handed back to the caller but never cached,
    so a later call can retry.

    Args:
        phrase: Hebrew phrase to translate.

    Returns:
        The translation, or "[Translation Error]" when translating failed.
    """
    global conn, translation_cache
    if phrase in translation_cache:
        return translation_cache[phrase]

    cursor = conn.cursor()
    cursor.execute('''
        SELECT english_translation FROM translations
        WHERE hebrew_phrase = ?
    ''', (phrase,))
    result = cursor.fetchone()
    if result and result[0]:
        translation_cache[phrase] = result[0]
        return result[0]

    translation = translate_and_store(phrase)
    if translation and translation != "[Translation Error]":
        # OR REPLACE rather than OR IGNORE: a row holding an empty translation
        # (which the check above treats as missing) must be overwritten.
        cursor.execute('''
            INSERT OR REPLACE INTO translations (hebrew_phrase, english_translation)
            VALUES (?, ?)
        ''', (phrase, translation))
        # Without a commit the row is discarded when a connection that is not
        # in autocommit mode gets closed.
        conn.commit()
        translation_cache[phrase] = translation
    return translation
def translate_and_store(phrase: str) -> str:
    """Translate a Hebrew phrase to English with the shared translator, retrying on errors.

    Despite its name this function stores nothing; it only performs the
    translation. Up to three attempts are made.

    Args:
        phrase: Hebrew phrase to translate.

    Returns:
        The translation, or "[Translation Error]" when every attempt failed.
    """
    global translator
    max_retries = 3
    # Network failures are caught as OSError: the exception classes of the
    # `requests` library derive from IOError (an alias of OSError), and
    # `requests` itself is not imported in this module.
    retryable_errors = (
        exceptions.TranslationNotFound,
        exceptions.NotValidPayload,
        exceptions.ServerException,
        exceptions.RequestError,
        OSError,
    )
    for attempt in range(1, max_retries + 1):
        try:
            translation = translator.translate(phrase)
        except retryable_errors as e:
            logging.warning(f"Error translating phrase '{phrase}': {e}. Retrying... ({attempt}/{max_retries})")
        else:
            logging.debug(f"Translated phrase: {translation}")
            return translation

    logging.error(f"Failed to translate phrase '{phrase}' after {max_retries} retries.")
    return "[Translation Error]"
def search_gematria_in_db(gematria_sum: int, max_words: int) -> List[Tuple[str, str, int, int]]:
    """Look up stored phrases by Gematria value, keeping those short enough.

    All rows of ``results`` with the given ``gematria_sum`` are fetched first;
    the word-count limit is then applied in Python.

    Args:
        gematria_sum: Gematria value to match exactly.
        max_words: Largest number of words a returned phrase may have.

    Returns:
        ``(words, book, chapter, verse)`` tuples whose phrase has at most
        ``max_words`` words.
    """
    global conn
    logging.debug(f"Searching for phrases with Gematria: {gematria_sum} and max words: {max_words}")

    db_cursor = conn.cursor()
    db_cursor.execute('''
        SELECT words, book, chapter, verse FROM results WHERE gematria_sum = ?
    ''', (gematria_sum,))
    candidates = db_cursor.fetchall()
    logging.debug(f"Found {len(candidates)} matching phrases before filtering.")

    kept = []
    for words, book, chapter, verse in candidates:
        word_count = len(words.split())
        logging.debug(f"Word count for '{words}': {word_count}")
        if word_count > max_words:
            continue  # too long for this query
        kept.append((words, book, chapter, verse))

    logging.debug(f"Found {len(kept)} matching phrases after filtering.")
    return kept
# Static stylesheet prepended to every rendered result page.
_RESULTS_STYLE = """
<style>
    .results-container {
        display: grid;
        grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
        gap: 20px;
    }
    .result-item {
        border: 1px solid #ccc;
        padding: 15px;
        border-radius: 5px;
        box-shadow: 2px 2px 5px rgba(0, 0, 0, 0.1);
    }
    .hebrew-phrase {
        font-family: 'SBL Hebrew', 'Ezra SIL', serif;
        direction: rtl;
    }
    .bible-link {
        display: block;
        margin-top: 10px;
        color: #007bff;
        text-decoration: none;
    }
</style>
"""


def gematria_search_interface(phrase: str, max_words: int, show_translation: bool) -> str:
    """Handle one search request from the Gradio interface and render it as HTML.

    Digit runs in ``phrase`` are summed as plain numbers; the remaining text
    (spaces removed) goes through ``calculate_gematria``. Phrases stored under
    the combined value are looked up (via ``gematria_cache`` when possible),
    ordered by book, chapter and verse, grouped by book and rendered as HTML.

    Args:
        phrase: Words and/or numbers entered by the user.
        max_words: Largest word count a matching phrase may have.
        show_translation: Whether to include an English translation per match.

    Returns:
        An HTML fragment with the results, or a plain message when the input
        is blank or nothing matches.
    """
    global conn, book_names, gematria_cache
    if not phrase.strip():
        return "Please enter a phrase."

    # A fresh connection is opened per request and rebound to the global so
    # that search_gematria_in_db() and get_translation() use it. The finally
    # block closes it on every exit path, including the "no match" return.
    conn = sqlite3.connect(DATABASE_FILE)
    try:
        # Extract numbers from the input text
        numbers = re.findall(r'\d+', phrase)
        # Calculate Gematria for the remaining text (non-numbers)
        text_without_numbers = re.sub(r'\d+', '', phrase)
        phrase_gematria = calculate_gematria(text_without_numbers.replace(" ", ""))
        # Add sum of numbers to Gematria
        phrase_gematria += sum(int(number) for number in numbers)

        logging.info(f"Searching for phrases with Gematria: {phrase_gematria}")
        logging.debug(f"Phrase Gematria: {phrase_gematria}")
        logging.debug(f"Max Words: {max_words}")

        # Results are cached per (value, max_words) pair.
        cache_key = (phrase_gematria, max_words)
        if cache_key in gematria_cache:
            matching_phrases = gematria_cache[cache_key]
            logging.debug(f"Retrieved matching phrases from cache for max_words: {max_words}.")
        else:
            matching_phrases = search_gematria_in_db(phrase_gematria, max_words)
            gematria_cache[cache_key] = matching_phrases
            logging.debug(f"Retrieved matching phrases from database for max_words: {max_words}.")

        if not matching_phrases:
            return "No matching phrases found."

        # Title -> book id, built once for the sort. setdefault keeps the
        # first id when two ids share a title.
        book_order = {}
        for book_id, title in book_names.items():
            book_order.setdefault(title, book_id)
        unknown_book = float('inf')  # titles missing from book_names sort last

        # Sort results by book, chapter, and verse
        sorted_phrases = sorted(
            matching_phrases,
            key=lambda match: (book_order.get(match[1], unknown_book), match[2], match[3]),
        )
        logging.debug(f"Sorted matching phrases: {sorted_phrases}")

        # Group results by book
        results_by_book = defaultdict(list)
        for words, book, chapter, verse in sorted_phrases:
            results_by_book[book].append((words, chapter, verse))
        logging.debug(f"Grouped results by book: {results_by_book}")

        # Format results for display. Text from the database and from the
        # translator is escaped before it is placed into the markup.
        results = ["<div class='results-container'>"]
        for book, phrases in results_by_book.items():
            results.append(f"<h4>Book: {html.escape(str(book))}</h4>")
            for words, chapter, verse in phrases:
                translation = html.escape(get_translation(words) or "") if show_translation else ""
                link = f"https://www.biblegateway.com/passage/?search={quote_plus(book)}+{chapter}%3A{verse}&version=CJB"
                results.append(f"""
                <div class='result-item'>
                    <p>Chapter: {chapter}, Verse: {verse}</p>
                    <p class='hebrew-phrase'>Hebrew Phrase: {html.escape(words)}</p>
                    <p>Translation: {translation}</p>
                    <a href='{link}' target='_blank' class='bible-link'>[See on Bible Gateway]</a>
                </div>
                """)
        results.append("</div>")  # Close results-container div

        return _RESULTS_STYLE + "\n".join(results)
    finally:
        conn.close()
def flatten_text(text: List) -> str:
    """Join an arbitrarily nested list of strings into one space-separated string.

    A value that is not a list is returned unchanged.
    """
    if not isinstance(text, list):
        return text
    # Each element is flattened recursively; non-list elements come back as-is.
    return " ".join(map(flatten_text, text))
def run_app() -> None:
    """Initialize the database and translator, populate the database, and launch the Gradio app.

    Phrases for books 1 to 39 are generated in a single pass with
    ``max_phrase_length=MAX_PHRASE_LENGTH_LIMIT``. ``populate_database``
    already yields every length from 1 up to the requested maximum, so calling
    it once per length would only regenerate the shorter phrases again.
    """
    initialize_database()
    initialize_translator()

    # Pre-populate the database
    logging.info("Starting database population...")
    phrase_stream = tqdm(
        populate_database(1, 39, max_phrase_length=MAX_PHRASE_LENGTH_LIMIT),  # Books 1 to 39
        desc=f"Populating Database (Max Length: {MAX_PHRASE_LENGTH_LIMIT})",
    )
    pending = []  # Rows collected for the next bulk insert
    for row in phrase_stream:
        pending.append(row)
        if len(pending) >= BATCH_SIZE:
            insert_phrases_to_db(pending)
            pending = []
    if pending:  # Insert remaining phrases
        insert_phrases_to_db(pending)
    logging.info("Database population complete.")

    iface = gr.Interface(
        fn=gematria_search_interface,
        inputs=[
            gr.Textbox(label="Enter word(s) or numbers (e.g., 'abc', '888' or 'abc 111 777')"),
            gr.Number(label="Max Word Count in Result Phrases", value=1, minimum=1, maximum=MAX_PHRASE_LENGTH_LIMIT),
            gr.Checkbox(label="Show Translation", value=True)
        ],
        outputs=gr.HTML(label="Results"),
        title="Gematria Search in Tanach",
        description="Search for phrases and/or numbers in the Tanach that have the same Gematria value.",
        live=False,
        allow_flagging="never"
    )
    iface.launch()


if __name__ == "__main__":
    run_app()