Asawal-Amqran-Dictionary / app_final v1.py
abdelhaqueidali's picture
Rename app.py to app_final v1.py
ab33278 verified
import gradio as gr
import sqlite3
import html
import re
import unicodedata
from typing import List, Dict
def normalize_text(text: str, language: str) -> str:
    """Fold *text* into a canonical form so lookups ignore spelling variants.

    The text is lower-cased and NFKD-normalized for every language, then a
    language-specific rule set is applied:

    * ``"Arabic"``: alif variants collapse to bare alif, alif maqsura to ya,
      ta marbuta to ha, and the diacritics in U+064B-U+065F plus U+0670 are
      removed.
    * ``"French"``: combining marks (accents, cedilla, ...) are removed.
    * ``"Amazigh"``: the labialization mark ⵯ is dropped and ⵕ is treated
      as ⵔ.

    Any other *language* value only gets the generic lower-case + NFKD step.

    Args:
        text: The string to normalize. Falsy values are returned unchanged.
        language: One of ``"Amazigh"``, ``"Arabic"`` or ``"French"``.

    Returns:
        The normalized string.
    """
    if not text:
        return text

    text = text.lower()

    if language == "Amazigh":
        # ⵯ (U+2D6F) must be removed *before* NFKD: it has a compatibility
        # decomposition to ⵡ (U+2D61), so after NFKD the mark would already
        # have been rewritten into a full letter and could no longer be
        # recognised and dropped.
        text = text.replace('ⵯ', '')

    text = unicodedata.normalize('NFKD', text)

    if language == "Arabic":
        # NFKD has already split the hamza/madda alifs into a bare alif plus a
        # combining mark (U+0653-U+0655); those marks fall inside the
        # diacritic range stripped below.
        text = re.sub(r'[إأآا]', 'ا', text)  # Normalize alifs
        text = re.sub(r'[ىي]', 'ي', text)  # Normalize ya
        text = re.sub(r'[ةه]', 'ه', text)  # Normalize ta marbuta and ha
        # Remove Arabic diacritics (fatha, kasra, damma, etc.)
        text = re.sub(r'[\u064B-\u065F\u0670]', '', text)
    elif language == "French":
        # The text is already fully decomposed by NFKD, so dropping the
        # combining marks leaves only the base letters.
        text = ''.join(c for c in text if not unicodedata.combining(c))
    elif language == "Amazigh":
        text = text.replace('ⵕ', 'ⵔ')  # Treat ⵕ the same as ⵔ

    return text
# Columns of the ``lexie`` table that are searched for each language.
_SEARCH_COLUMNS = {
    "Amazigh": ("word", "latin", "construct", "plural", "acc", "accneg", "inacc",
                "variante", "feminine", "fem_construct", "fem_plural",
                "fem_plural_construct", "exp_zgh"),
    "Arabic": ("arabic", "exp_ara", "mean_ar"),
    "French": ("french", "exp_fra"),
}

# (column, label) pairs rendered in each section of a result card.
_WORD_FIELDS = (
    ('word', 'Word'), ('latin', 'Latin'), ('construct', 'Construct'),
    ('plural', 'Plural'), ('acc', 'Accusative'), ('accneg', 'Negative Accusative'),
    ('inacc', 'Inaccusative'), ('variante', 'Variant'), ('feminine', 'Feminine'),
    ('fem_construct', 'Feminine Construct'), ('fem_plural', 'Feminine Plural'),
    ('fem_plural_construct', 'Feminine Plural Construct'),
)
_TRANSLATION_FIELDS = (
    ('french', 'French'), ('arabic', 'Arabic'), ('mean_ar', 'Arabic Meaning'),
)
_EXPRESSION_FIELDS = (
    ('exp_zgh', 'Amazigh Expression'), ('exp_fra', 'French Expression'),
    ('exp_ara', 'Arabic Expression'),
)


def _match_rank(cell, needle, word_pattern,
                exact_match, word_match, contains, starts_with, ends_with):
    """Return the rank (1 = strongest) of the best enabled match, or None."""
    if exact_match and cell == needle:
        return 1
    if word_match and (cell == needle or word_pattern.search(cell)):
        return 2
    if contains and needle in cell:
        return 3
    if starts_with and cell.startswith(needle):
        return 4
    if ends_with and cell.endswith(needle):
        return 5
    return None


def _render_field_list(title, fields, entry):
    """Render one titled <ul> listing the non-empty *fields* of *entry*."""
    items = "".join(
        f"<li><strong>{label}:</strong> {html.escape(str(entry[field]))}</li>"
        for field, label in fields
        if entry.get(field)
    )
    return f"<h3>{title}</h3><ul>{items}</ul>"


def _render_entry(entry):
    """Render a single dictionary row (column name -> value) as an HTML card."""
    parts = ["<div style='border: 1px solid #ccc; margin: 10px; padding: 15px; position: relative;'>"]
    if entry.get('source'):
        parts.append(
            f"<div style='text-align: center; font-style: italic;'>{html.escape(str(entry['source']))}</div>"
        )
    if entry.get('category'):
        parts.append(
            f"<div style='position: absolute; top: 10px; right: 10px; font-weight: bold;'>{html.escape(str(entry['category']))}</div>"
        )
    parts.append(_render_field_list("Word", _WORD_FIELDS, entry))
    parts.append(_render_field_list("Translations", _TRANSLATION_FIELDS, entry))
    parts.append(_render_field_list("Expressions", _EXPRESSION_FIELDS, entry))
    parts.append("</div>")
    return "".join(parts)


def search_dictionary(search_term: str,
                      language: str,
                      exact_match: bool,
                      word_match: bool,
                      contains: bool,
                      starts_with: bool,
                      ends_with: bool) -> str:
    """Search the ``lexie`` table and return the matches as an HTML fragment.

    Every row of ``lexie`` in ``asawal_amqran.db`` is loaded and filtered in
    Python, because both the search term and each cell are passed through
    ``normalize_text`` before being compared. Only the columns associated
    with *language* are searched. A row is reported at most once per
    ``word_id`` and is ranked by the strongest match found in any of its
    searched columns: exact (1), whole word (2), contains (3), starts with
    (4), ends with (5). Results are ordered by rank, keeping table order
    within a rank.

    Args:
        search_term: Text typed by the user.
        language: ``"Amazigh"``, ``"Arabic"`` or ``"French"``.
        exact_match: Enable matching the whole cell.
        word_match: Enable matching a whole word inside the cell.
        contains: Enable substring matching.
        starts_with: Enable prefix matching.
        ends_with: Enable suffix matching.

    Returns:
        An HTML string: either the rendered result cards or a short ``<p>``
        message when the input is invalid or nothing matches.

    Raises:
        sqlite3.Error: If the database or the ``lexie`` table cannot be read.
    """
    # Validate everything before touching the database so that no connection
    # is opened (and left open) for a request that cannot be served.
    if not search_term or search_term.isspace():
        return "<p>Please enter a search term</p>"
    search_columns = _SEARCH_COLUMNS.get(language)
    if not search_columns:
        return "<p>Please select a language</p>"
    if not any((exact_match, word_match, contains, starts_with, ends_with)):
        return "<p>Please select at least one search option</p>"

    # Normalize search term according to language-specific rules
    normalized_search = normalize_text(search_term, language)
    if not normalized_search:
        # Normalization can strip every character; an empty needle would
        # otherwise "match" every row.
        return "<p>Please enter a search term</p>"
    word_pattern = re.compile(r'\b' + re.escape(normalized_search) + r'\b')

    conn = sqlite3.connect('asawal_amqran.db')
    try:
        cursor = conn.cursor()
        cursor.execute("SELECT * FROM lexie")
        column_names = [desc[0] for desc in cursor.description]
        all_rows = cursor.fetchall()
    finally:
        # Always release the connection, including when the query fails.
        conn.close()

    # Rows are de-duplicated on word_id; without that column no row can be
    # reported.
    if 'word_id' not in column_names:
        return "<p>No results found</p>"
    word_id_idx = column_names.index('word_id')
    searchable = [idx for idx, name in enumerate(column_names)
                  if name in search_columns]

    ranked_rows = []
    seen_word_ids = set()
    for row in all_rows:
        word_id = row[word_id_idx]
        if word_id in seen_word_ids:
            continue
        # Keep the best rank over all searched columns, so an exact hit in a
        # later column is not hidden by a weaker hit in an earlier one.
        best = None
        for idx in searchable:
            cell_value = row[idx]
            if not cell_value:
                continue
            rank = _match_rank(normalize_text(str(cell_value), language),
                               normalized_search, word_pattern,
                               exact_match, word_match, contains,
                               starts_with, ends_with)
            if rank is not None and (best is None or rank < best):
                best = rank
                if best == 1:
                    break  # cannot do better than an exact match
        if best is not None:
            seen_word_ids.add(word_id)
            ranked_rows.append((best, row))

    if not ranked_rows:
        return "<p>No results found</p>"

    # Stable sort: rows of equal rank stay in table order.
    ranked_rows.sort(key=lambda pair: pair[0])
    cards = "".join(_render_entry(dict(zip(column_names, row)))
                    for _, row in ranked_rows)
    return f"<div style='font-family: Arial, sans-serif;'>{cards}</div>"
# Gradio interface: a narrow control column on the left and a wide results
# pane on the right.
with gr.Blocks(title="Dictionary Search") as demo:
    gr.Markdown("# Dictionary Search")

    with gr.Row():
        # Controls: query box, language selector and match-mode toggles.
        with gr.Column(scale=1):
            search_input = gr.Textbox(
                label="Search Term",
                placeholder="Enter search term...",
            )
            search_button = gr.Button("Search")

            gr.Markdown("### Language Options")
            language = gr.Radio(
                label="Select Language",
                choices=["Amazigh", "Arabic", "French"],
                value="Arabic",
            )

            gr.Markdown("### Search Options")
            exact_match = gr.Checkbox(value=True, label="Exact Match (whole cell)")
            word_match = gr.Checkbox(value=True, label="Exact Word Match (within cell)")
            contains = gr.Checkbox(value=True, label="Contains")
            starts_with = gr.Checkbox(value=False, label="Starts With")
            ends_with = gr.Checkbox(value=False, label="Ends With")

        # Results pane, filled with the HTML returned by search_dictionary.
        with gr.Column(scale=3):
            output = gr.HTML(label="Results")

    # Component order mirrors the parameter order of search_dictionary.
    search_params = [
        search_input,
        language,
        exact_match,
        word_match,
        contains,
        starts_with,
        ends_with,
    ]

    # Submitting the textbox and clicking the button run the same search.
    for register in (search_input.submit, search_button.click):
        register(search_dictionary, inputs=search_params, outputs=output)

demo.launch()