"""Gradio front-end for searching a multilingual (Amazigh / Arabic / French)
SQLite dictionary (`asawal_amqran.db`, table `lexie`).

NOTE(review): the HTML fragments in this file were garbled in the source
(the markup was stripped out of the string literals, breaking them across
lines). The tags below are a plausible reconstruction of the intended
output — confirm against the original application before shipping.
"""

import html
import re
import sqlite3
import unicodedata
from typing import Dict, List, Tuple

import gradio as gr

# Columns searched for each UI language choice.
SEARCH_COLUMNS: Dict[str, List[str]] = {
    "Amazigh": [
        "word", "latin", "construct", "plural", "acc", "accneg", "inacc",
        "variante", "feminine", "fem_construct", "fem_plural",
        "fem_plural_construct", "exp_zgh",
    ],
    "Arabic": ["arabic", "exp_ara", "mean_ar"],
    "French": ["french", "exp_fra"],
}

# Dictionary source priority mapping (lower number = higher priority).
SOURCE_PRIORITY: Dict[str, int] = {
    "IRCAM (DGLAi)": 1,
    "Msmun": 2,
    "Tawalt (Arabic)": 3,
    "Tawalt (French)": 4,
}
# Sources not listed above sort after every known source.
_UNKNOWN_SOURCE_PRIORITY = 999


def normalize_text(text: str, language: str) -> str:
    """Normalize *text* for case- and diacritic-insensitive matching.

    Args:
        text: Raw string; falsy values are returned unchanged.
        language: "Arabic", "French" or "Amazigh"; any other value gets
            only the generic lowercase + NFKD normalization.

    Returns:
        The normalized string.
    """
    if not text:
        return text
    # Generic normalization: case-fold and decompose compatibility forms.
    text = text.lower()
    text = unicodedata.normalize('NFKD', text)
    if language == "Arabic":
        text = re.sub(r'[إأآا]', 'ا', text)   # normalize alif variants
        text = re.sub(r'[ىي]', 'ي', text)     # normalize ya
        text = re.sub(r'[ةه]', 'ه', text)     # ta marbuta -> ha
        # Strip Arabic diacritics (fatha, kasra, damma, etc.).
        text = re.sub(r'[\u064B-\u065F\u0670]', '', text)
    elif language == "French":
        # Drop combining marks left after NFD decomposition (é -> e, ...).
        text = ''.join(
            c for c in unicodedata.normalize('NFD', text)
            if not unicodedata.combining(c)
        )
    elif language == "Amazigh":
        text = text.replace('ⵕ', 'ⵔ')  # treat ⵕ the same as ⵔ
        text = text.replace('ⵯ', '')   # make the ⵯ mark optional in matches
    return text


def search_dictionary(search_term: str, language: str, exact_match: bool,
                      word_match: bool, contains: bool, starts_with: bool,
                      ends_with: bool) -> str:
    """Search the `lexie` table and return the results as an HTML string.

    Rows are ranked by match quality (1 = exact cell, 2 = whole word,
    3 = substring, 4 = prefix, 5 = suffix) with the dictionary-source
    priority as a tie-breaker, and deduplicated by `word_id`.

    Args:
        search_term: User-entered query.
        language: "Amazigh", "Arabic" or "French".
        exact_match / word_match / contains / starts_with / ends_with:
            Which match types are enabled; at least one must be True.

    Returns:
        An HTML fragment with the results, or an HTML message on bad
        input / empty result.
    """
    # Validate BEFORE opening the database so early returns cannot leak
    # the connection (the original opened it first).
    if not search_term or search_term.isspace():
        return "<p>Please enter a search term</p>"

    search_columns = SEARCH_COLUMNS.get(language, [])
    if not search_columns:
        return "<p>Please select a language</p>"

    if not any([exact_match, word_match, contains, starts_with, ends_with]):
        return "<p>Please select at least one search option</p>"

    normalized_search = normalize_text(search_term, language)
    # Hoist the word-boundary pattern out of the per-row loop.
    word_re = re.compile(r'\b' + re.escape(normalized_search) + r'\b')

    # Fetch everything and filter in Python so the language-specific
    # normalization applies to both sides of the comparison.
    conn = sqlite3.connect('asawal_amqran.db')
    try:
        cursor = conn.cursor()
        cursor.execute("SELECT * FROM lexie")
        column_names = [desc[0] for desc in cursor.description]
        all_rows = cursor.fetchall()
    finally:
        conn.close()

    word_id_idx = column_names.index('word_id') if 'word_id' in column_names else -1
    source_idx = column_names.index('source') if 'source' in column_names else -1

    priority_results: List[Tuple[float, tuple]] = []
    seen_word_ids = set()

    for row in all_rows:
        # Without a word_id column we cannot deduplicate; skip (as original).
        if word_id_idx == -1 or row[word_id_idx] in seen_word_ids:
            continue

        source_value = row[source_idx] if source_idx >= 0 else None
        # Source rank becomes a fractional sub-priority (1.001, 1.002, ...).
        sub_priority = SOURCE_PRIORITY.get(source_value, _UNKNOWN_SOURCE_PRIORITY) / 1000

        for column_idx, column_name in enumerate(column_names):
            if column_name not in search_columns:
                continue
            cell_value = row[column_idx]
            if not cell_value:
                continue
            normalized_cell = normalize_text(str(cell_value), language)

            # Match types ranked 1 (best) .. 5 (worst).
            if exact_match and normalized_cell == normalized_search:
                priority = 1
            elif word_match and (normalized_cell == normalized_search
                                 or word_re.search(normalized_cell)):
                priority = 2
            elif contains and normalized_search in normalized_cell:
                priority = 3
            elif starts_with and normalized_cell.startswith(normalized_search):
                priority = 4
            elif ends_with and normalized_cell.endswith(normalized_search):
                priority = 5
            else:
                continue

            seen_word_ids.add(row[word_id_idx])
            priority_results.append((priority + sub_priority, row))
            break  # first matching column decides this row's rank

    if not priority_results:
        return "<p>No results found</p>"

    priority_results.sort(key=lambda pr: pr[0])
    results = [row for _, row in priority_results]
    return _format_results_html(results, column_names)


def _format_results_html(results: List[tuple], column_names: List[str]) -> str:
    """Render matched rows as an HTML fragment.

    NOTE(review): the original markup was stripped from the source file,
    including whatever content was rendered under the "Word",
    "Translations" and "Expressions" headings — only the heading texts and
    the source/category badges survived. This reconstruction keeps those
    visible pieces; restore the per-field content from the original app.
    """
    if not column_names:
        return "<p>No data found</p>"

    parts = ["<div class='results'>"]
    for result in results:
        result_dict = dict(zip(column_names, result))
        parts.append("<div class='result-card'>")
        if result_dict.get('source'):
            parts.append(
                f"<span class='source'>{html.escape(str(result_dict['source']))}</span>"
            )
        if result_dict.get('category'):
            parts.append(
                f"<span class='category'>{html.escape(str(result_dict['category']))}</span>"
            )
        # TODO(review): fill each section with the corresponding escaped
        # field values; the original content markup was lost.
        parts.append("<h3>Word</h3>")
        parts.append("<h3>Translations</h3>")
        parts.append("<h3>Expressions</h3>")
        parts.append("</div>")
    parts.append("</div>")
    return "".join(parts)


# ---------------------------------------------------------------------------
# Gradio interface
# ---------------------------------------------------------------------------
with gr.Blocks(title="Dictionary Search") as demo:
    gr.Markdown("# Dictionary Search")
    with gr.Row():
        with gr.Column(scale=1):
            search_input = gr.Textbox(label="Search Term",
                                      placeholder="Enter search term...")
            search_button = gr.Button("Search")

            gr.Markdown("### Language Options")
            language = gr.Radio(
                choices=["Amazigh", "Arabic", "French"],
                label="Select Language",
                value="Amazigh",
            )

            gr.Markdown("### Search Options")
            exact_match = gr.Checkbox(label="Exact Match (whole cell)", value=True)
            word_match = gr.Checkbox(label="Exact Word Match (within cell)", value=True)
            contains = gr.Checkbox(label="Contains", value=True)
            starts_with = gr.Checkbox(label="Starts With", value=False)
            ends_with = gr.Checkbox(label="Ends With", value=False)
        with gr.Column(scale=3):
            output = gr.HTML(label="Results")

    search_params = [search_input, language, exact_match, word_match,
                     contains, starts_with, ends_with]
    # Both pressing Enter in the textbox and clicking the button search.
    search_input.submit(search_dictionary, inputs=search_params, outputs=output)
    search_button.click(search_dictionary, inputs=search_params, outputs=output)


if __name__ == "__main__":
    # Guarded so importing this module (e.g. for tests) does not start a server.
    demo.launch()