# Hugging Face Space: abdelhaqueidali / Asawal-Amqran-Dictionary (status: Running)
# File size: 9,870 Bytes

import gradio as gr
import sqlite3
import html
import re
import unicodedata
from typing import List, Dict

def normalize_text(text: str, language: str) -> str:
    """Fold *text* into a canonical form for variant-insensitive matching.

    The text is lowercased and NFKD-normalized for every language; then
    language-specific folding is applied:

    * ``"Arabic"``: alif variants are unified, alif maqsura is folded into
      ya, ta marbuta is folded into ha, and diacritics in
      U+064B-U+065F plus U+0670 are removed.
    * ``"French"``: combining marks (accents) are removed.
    * ``"Amazigh"``: the labialization mark ⵯ is dropped and ⵕ is treated
      as ⵔ.

    Any other ``language`` value gets only the lowercase + NFKD step.

    Args:
        text: The string to normalize. Falsy values (``""``, ``None``) are
            returned unchanged.
        language: One of ``"Arabic"``, ``"French"``, ``"Amazigh"``.

    Returns:
        The normalized string.
    """
    if not text:
        return text

    if language == "Amazigh":
        # ⵯ (U+2D6F) carries the compatibility decomposition <super> U+2D61,
        # so NFKD would rewrite it as the full letter ⵡ. It has to be
        # stripped *before* normalization or it can never be ignored.
        text = text.replace('ⵯ', '')

    # Convert to lowercase and normalize Unicode
    text = text.lower()
    text = unicodedata.normalize('NFKD', text)

    if language == "Arabic":
        # NFKD has already split hamza/madda carriers into a base letter plus
        # a combining mark; the substitutions below fold what remains and the
        # final pass strips the marks together with the vowel diacritics.
        text = re.sub(r'[إأآا]', 'ا', text)  # Normalize alifs
        text = re.sub(r'[ىي]', 'ي', text)    # Normalize ya
        text = re.sub(r'[ةه]', 'ه', text)    # Normalize ta marbuta and ha
        # Remove Arabic diacritics (fatha, kasra, damma, etc.)
        text = re.sub(r'[\u064B-\u065F\u0670]', '', text)

    elif language == "French":
        # Drop the combining marks left by decomposition (é -> e, ç -> c, ...)
        text = ''.join(c for c in unicodedata.normalize('NFD', text)
                       if not unicodedata.combining(c))

    elif language == "Amazigh":
        text = text.replace('ⵕ', 'ⵔ')  # Treat ⵕ the same as ⵔ

    return text

# Columns of the `lexie` table that are searched for each UI language.
_SEARCH_COLUMNS = {
    "Amazigh": ["word", "latin", "construct", "plural", "acc", "accneg", "inacc",
                "variante", "feminine", "fem_construct", "fem_plural",
                "fem_plural_construct", "exp_zgh"],
    "Arabic": ["arabic", "exp_ara", "mean_ar"],
    "French": ["french", "exp_fra"],
}

# Dictionary source priority mapping (lower number = higher priority).
_SOURCE_PRIORITY = {
    "IRCAM (DGLAi)": 1,
    "Msmun": 2,
    "Tawalt (Arabic)": 3,
    "Tawalt (French)": 4,
}
_UNKNOWN_SOURCE_PRIORITY = 999

# (column, label) pairs rendered in each section of a result card.
_WORD_FIELDS = [
    ('word', 'Word'), ('latin', 'Latin'), ('construct', 'Construct'),
    ('plural', 'Plural'), ('acc', 'Accusative'), ('accneg', 'Negative Accusative'),
    ('inacc', 'Inaccusative'), ('variante', 'Variant'), ('feminine', 'Feminine'),
    ('fem_construct', 'Feminine Construct'), ('fem_plural', 'Feminine Plural'),
    ('fem_plural_construct', 'Feminine Plural Construct'),
]
_TRANSLATION_FIELDS = [
    ('french', 'French'), ('arabic', 'Arabic'), ('mean_ar', 'Arabic Meaning'),
]
_EXPRESSION_FIELDS = [
    ('exp_zgh', 'Amazigh Expression'), ('exp_fra', 'French Expression'),
    ('exp_ara', 'Arabic Expression'),
]


def _match_rank(cell: str, term: str, word_pattern,
                exact_match: bool, word_match: bool, contains: bool,
                starts_with: bool, ends_with: bool):
    """Return the rank (1 = best ... 5) of the strongest enabled match, or None."""
    if exact_match and cell == term:
        return 1
    if word_match and (cell == term or word_pattern.search(cell)):
        return 2
    if contains and term in cell:
        return 3
    if starts_with and cell.startswith(term):
        return 4
    if ends_with and cell.endswith(term):
        return 5
    return None


def _render_field_list(heading: str, fields, entry: dict) -> str:
    """Render one ``<h3>`` section listing every non-empty field of *entry*."""
    items = "".join(
        f"<li><strong>{label}:</strong> {html.escape(str(entry[field]))}</li>"
        for field, label in fields
        if entry.get(field)
    )
    return f"<h3>{heading}</h3><ul>{items}</ul>"


def _render_entry(entry: dict) -> str:
    """Render a single dictionary row (column name -> value) as an HTML card."""
    parts = ["<div style='border: 1px solid #ccc; margin: 10px; padding: 15px; position: relative;'>"]
    if entry.get('source'):
        parts.append(
            "<div style='text-align: center; font-style: italic;'>"
            f"{html.escape(str(entry['source']))}</div>"
        )
    if entry.get('category'):
        parts.append(
            "<div style='position: absolute; top: 10px; right: 10px; font-weight: bold;'>"
            f"{html.escape(str(entry['category']))}</div>"
        )
    parts.append(_render_field_list("Word", _WORD_FIELDS, entry))
    parts.append(_render_field_list("Translations", _TRANSLATION_FIELDS, entry))
    parts.append(_render_field_list("Expressions", _EXPRESSION_FIELDS, entry))
    parts.append("</div>")
    return "".join(parts)


def search_dictionary(search_term: str,
                     language: str,
                     exact_match: bool,
                     word_match: bool,
                     contains: bool,
                     starts_with: bool,
                     ends_with: bool) -> str:
    """Search the ``lexie`` table of ``asawal_amqran.db`` and return HTML.

    Every row is read and compared in Python so that both the search term
    and the stored values go through ``normalize_text``. Each row is ranked
    by its *best* match across the columns searched for ``language``
    (exact cell = 1, whole word = 2, contains = 3, starts with = 4,
    ends with = 5); ties are ordered by dictionary source priority, then by
    table order. Only the first matching row per ``word_id`` is kept.

    Args:
        search_term: Text typed by the user; surrounding whitespace is ignored.
        language: ``"Amazigh"``, ``"Arabic"`` or ``"French"``; selects the
            columns to search and the normalization rules.
        exact_match: Enable whole-cell equality matching.
        word_match: Enable whole-word matching inside a cell.
        contains: Enable substring matching.
        starts_with: Enable prefix matching.
        ends_with: Enable suffix matching.

    Returns:
        An HTML fragment: either a ``<p>`` message (missing input, no
        results) or a ``<div>`` holding one card per matching row, with all
        database values HTML-escaped.

    Raises:
        sqlite3.Error: If the database cannot be opened or queried.
    """
    if not search_term or search_term.isspace():
        return "<p>Please enter a search term</p>"

    # Validate everything before touching the database so that an early
    # return can never leave a connection open.
    search_columns = _SEARCH_COLUMNS.get(language, [])
    if not search_columns:
        return "<p>Please select a language</p>"

    if not any((exact_match, word_match, contains, starts_with, ends_with)):
        return "<p>Please select at least one search option</p>"

    # Normalize search term according to language-specific rules
    normalized_search = normalize_text(search_term.strip(), language)
    if not normalized_search:
        # e.g. the term consisted only of characters that normalization
        # removes; an empty needle would otherwise match every row.
        return "<p>Please enter a search term</p>"

    # Built once: the needle does not change while scanning the table.
    word_pattern = re.compile(r'\b' + re.escape(normalized_search) + r'\b')

    # Get all data and filter in Python to handle normalization properly
    conn = sqlite3.connect('asawal_amqran.db')
    try:
        cursor = conn.cursor()
        cursor.execute("SELECT * FROM lexie")
        column_names = [desc[0] for desc in cursor.description]
        all_rows = cursor.fetchall()
    finally:
        conn.close()

    # Without a word_id column rows cannot be de-duplicated, so nothing is reported.
    if 'word_id' not in column_names:
        return "<p>No results found</p>"

    word_id_idx = column_names.index('word_id')
    source_idx = column_names.index('source') if 'source' in column_names else -1
    wanted_columns = set(search_columns)
    search_indices = [idx for idx, name in enumerate(column_names)
                      if name in wanted_columns]

    ranked_rows = []
    seen_word_ids = set()

    for row in all_rows:
        word_id = row[word_id_idx]
        if word_id in seen_word_ids:
            continue

        # Keep the strongest match over all searched columns rather than
        # whichever column happens to come first in the table.
        best_rank = None
        for idx in search_indices:
            cell_value = row[idx]
            if not cell_value:
                continue
            rank = _match_rank(normalize_text(str(cell_value), language),
                               normalized_search, word_pattern,
                               exact_match, word_match, contains,
                               starts_with, ends_with)
            if rank is not None and (best_rank is None or rank < best_rank):
                best_rank = rank
                if best_rank == 1:
                    break  # cannot be improved upon

        if best_rank is None:
            continue

        seen_word_ids.add(word_id)
        source_value = row[source_idx] if source_idx >= 0 else None
        source_rank = _SOURCE_PRIORITY.get(source_value, _UNKNOWN_SOURCE_PRIORITY)
        ranked_rows.append(((best_rank, source_rank), row))

    if not ranked_rows:
        return "<p>No results found</p>"

    # Sort by match rank, then source priority; the sort is stable, so rows
    # with equal keys keep their table order.
    ranked_rows.sort(key=lambda item: item[0])

    # Format results as HTML
    cards = "".join(_render_entry(dict(zip(column_names, row)))
                    for _, row in ranked_rows)
    return f"<div style='font-family: Arial, sans-serif;'>{cards}</div>"

# Gradio interface: search controls in a narrow left column, rendered HTML
# results in a wide right column.
with gr.Blocks(title="Dictionary Search") as demo:
    gr.Markdown("# Dictionary Search")

    with gr.Row():
        with gr.Column(scale=1):
            search_input = gr.Textbox(label="Search Term", placeholder="Enter search term...")
            search_button = gr.Button("Search")

            gr.Markdown("### Language Options")
            language = gr.Radio(
                choices=["Amazigh", "Arabic", "French"],
                label="Select Language",
                value="Amazigh"
            )

            gr.Markdown("### Search Options")
            exact_match = gr.Checkbox(label="Exact Match (whole cell)", value=True)
            word_match = gr.Checkbox(label="Exact Word Match (within cell)", value=True)
            contains = gr.Checkbox(label="Contains", value=True)
            starts_with = gr.Checkbox(label="Starts With", value=False)
            ends_with = gr.Checkbox(label="Ends With", value=False)

        with gr.Column(scale=3):
            output = gr.HTML(label="Results")

    # Order must match the positional parameters of search_dictionary.
    search_params = [search_input, language, exact_match, word_match, contains, starts_with, ends_with]

    # Pressing Enter in the textbox and clicking the button run the same search.
    for register_event in (search_input.submit, search_button.click):
        register_event(
            search_dictionary,
            inputs=search_params,
            outputs=output
        )

# Only start the server when run as a script, so importing this module
# (e.g. to reuse search_dictionary) does not launch the app.
if __name__ == "__main__":
    demo.launch()