Spaces:

abdelhaqueidali
/

Asawal-Amqran-Dictionary

Running

App Files Files Community

Asawal-Amqran-Dictionary / app_final v1.py

abdelhaqueidali

Rename app.py to app_final v1.py

ab33278 verified 4 months ago

raw

history blame contribute delete

9.02 kB

	import gradio as gr
	import sqlite3
	import html
	import re
	import unicodedata
	from typing import List, Dict

	def normalize_text(text: str, language: str) -> str:
	    """Normalize *text* so dictionary lookups ignore case, accents and variants.

	    The same function is applied to both the search term and the stored cell
	    values, so only the *relative* result matters: two strings that should be
	    considered equal must normalize to the same output.

	    Steps:
	      1. Lowercase.
	      2. For Amazigh, drop the labialization mark (U+2D6F).
	      3. Unicode NFKD decomposition (splits accented letters into base
	         letter + combining marks).
	      4. Language-specific folding:
	         * "Arabic"  - unify alif forms, alef maksura/yeh, teh marbuta/heh,
	           and remove diacritics in U+064B..U+065F plus U+0670.
	         * "French"  - remove every combining mark (accents).
	         * "Amazigh" - treat YARR (U+2D55) as YAR (U+2D54).
	         Any other language value only gets steps 1 and 3.

	    Args:
	        text: The string to normalize. Falsy values (``None``, ``""``) are
	            returned unchanged.
	        language: One of "Arabic", "French", "Amazigh"; other values select
	            no language-specific folding.

	    Returns:
	        The normalized string (or the original falsy value).
	    """
	    if not text:
	        return text

	    text = text.lower()

	    if language == "Amazigh":
	        # U+2D6F TIFINAGH MODIFIER LETTER LABIALIZATION MARK must be removed
	        # BEFORE the NFKD step: it has a compatibility decomposition to
	        # U+2D61 (TIFINAGH LETTER YAW), so after NFKD it would no longer be
	        # present to strip and would survive as an extra letter.
	        text = text.replace('\u2d6f', '')

	    text = unicodedata.normalize('NFKD', text)

	    if language == "Arabic":
	        # Alef with hamza below / hamza above / madda above / bare alef -> alef.
	        text = re.sub('[\u0625\u0623\u0622\u0627]', '\u0627', text)
	        # Alef maksura / yeh -> yeh.
	        text = re.sub('[\u0649\u064a]', '\u064a', text)
	        # Teh marbuta / heh -> heh.
	        text = re.sub('[\u0629\u0647]', '\u0647', text)
	        # Remove Arabic diacritics (fatha, kasra, damma, etc.), including the
	        # combining hamza/madda marks produced by the NFKD step above.
	        text = re.sub(r'[\u064B-\u065F\u0670]', '', text)

	    elif language == "French":
	        # The text is already fully decomposed by NFKD, so dropping the
	        # combining marks leaves only the base letters.
	        text = ''.join(ch for ch in text if not unicodedata.combining(ch))

	    elif language == "Amazigh":
	        # Treat TIFINAGH LETTER YARR (U+2D55) the same as YAR (U+2D54).
	        text = text.replace('\u2d55', '\u2d54')

	    return text

	# Columns of the ``lexie`` table that are searched for each language.
	_SEARCH_COLUMNS = {
	    "Amazigh": frozenset([
	        "word", "latin", "construct", "plural", "acc", "accneg", "inacc",
	        "variante", "feminine", "fem_construct", "fem_plural",
	        "fem_plural_construct", "exp_zgh",
	    ]),
	    "Arabic": frozenset(["arabic", "exp_ara", "mean_ar"]),
	    "French": frozenset(["french", "exp_fra"]),
	}

	# (column, label) pairs rendered in each section of a result card.
	_WORD_FIELDS = (
	    ('word', 'Word'), ('latin', 'Latin'), ('construct', 'Construct'),
	    ('plural', 'Plural'), ('acc', 'Accusative'), ('accneg', 'Negative Accusative'),
	    ('inacc', 'Inaccusative'), ('variante', 'Variant'), ('feminine', 'Feminine'),
	    ('fem_construct', 'Feminine Construct'), ('fem_plural', 'Feminine Plural'),
	    ('fem_plural_construct', 'Feminine Plural Construct'),
	)
	_TRANSLATION_FIELDS = (
	    ('french', 'French'), ('arabic', 'Arabic'), ('mean_ar', 'Arabic Meaning'),
	)
	_EXPRESSION_FIELDS = (
	    ('exp_zgh', 'Amazigh Expression'), ('exp_fra', 'French Expression'),
	    ('exp_ara', 'Arabic Expression'),
	)


	def _match_priority(cell, term, word_pattern,
	                    exact_match, word_match, contains, starts_with, ends_with):
	    """Return the best priority (1 = strongest) at which *cell* matches, or None."""
	    if exact_match and cell == term:
	        return 1
	    if word_match and (cell == term or word_pattern.search(cell)):
	        return 2
	    if contains and term in cell:
	        return 3
	    if starts_with and cell.startswith(term):
	        return 4
	    if ends_with and cell.endswith(term):
	        return 5
	    return None


	def _render_section(title, fields, entry):
	    """Render one ``<h3>`` + ``<ul>`` section listing the non-empty *fields*."""
	    items = [
	        f"<li><strong>{label}:</strong> {html.escape(str(entry[field]))}</li>"
	        for field, label in fields
	        if entry.get(field)
	    ]
	    return f"<h3>{title}</h3><ul>" + "".join(items) + "</ul>"


	def _render_entry(entry):
	    """Render a single dictionary row (as a column->value dict) as an HTML card."""
	    parts = ["<div style='border: 1px solid #ccc; margin: 10px; padding: 15px; position: relative;'>"]
	    if entry.get('source'):
	        parts.append(
	            "<div style='text-align: center; font-style: italic;'>"
	            f"{html.escape(str(entry['source']))}</div>"
	        )
	    if entry.get('category'):
	        parts.append(
	            "<div style='position: absolute; top: 10px; right: 10px; font-weight: bold;'>"
	            f"{html.escape(str(entry['category']))}</div>"
	        )
	    parts.append(_render_section("Word", _WORD_FIELDS, entry))
	    parts.append(_render_section("Translations", _TRANSLATION_FIELDS, entry))
	    parts.append(_render_section("Expressions", _EXPRESSION_FIELDS, entry))
	    parts.append("</div>")
	    return "".join(parts)


	def search_dictionary(search_term: str,
	                      language: str,
	                      exact_match: bool,
	                      word_match: bool,
	                      contains: bool,
	                      starts_with: bool,
	                      ends_with: bool) -> str:
	    """Search the ``lexie`` table of ``asawal_amqran.db`` and return HTML.

	    All rows are loaded and filtered in Python (rather than in SQL) because
	    both the search term and every candidate cell are passed through
	    ``normalize_text`` before comparison.

	    Each row is ranked by the strongest enabled match found in ANY of the
	    columns searched for *language*:
	    1 exact cell match, 2 whole-word match, 3 substring, 4 prefix, 5 suffix.
	    Results are ordered by that rank (ties keep table order). Rows sharing a
	    ``word_id`` with an already-matched row are skipped; if the table has no
	    ``word_id`` column every row is treated as a distinct entry.

	    Args:
	        search_term: Text typed by the user; surrounding whitespace is ignored.
	        language: "Amazigh", "Arabic" or "French" - selects the searched
	            columns and the normalization rules.
	        exact_match: Enable priority-1 matching (whole cell equals the term).
	        word_match: Enable priority-2 matching (term appears as a whole word).
	        contains: Enable priority-3 matching (term is a substring).
	        starts_with: Enable priority-4 matching (cell starts with the term).
	        ends_with: Enable priority-5 matching (cell ends with the term).

	    Returns:
	        An HTML fragment: either a ``<p>`` message (missing input, no
	        results) or a ``<div>`` containing one card per matching row. All
	        database values are HTML-escaped.

	    Raises:
	        sqlite3.Error: If the database cannot be queried.
	    """
	    # Validate everything BEFORE opening the database so that an early
	    # return can never leave a connection open.
	    if not search_term or search_term.isspace():
	        return "<p>Please enter a search term</p>"

	    search_columns = _SEARCH_COLUMNS.get(language)
	    if not search_columns:
	        return "<p>Please select a language</p>"

	    if not any((exact_match, word_match, contains, starts_with, ends_with)):
	        return "<p>Please select at least one search option</p>"

	    normalized_search = normalize_text(search_term.strip(), language)
	    if not normalized_search:
	        # The term consisted only of characters that normalization removes;
	        # an empty term would otherwise "match" every row.
	        return "<p>Please enter a search term</p>"

	    # Loop-invariant: compile the whole-word pattern once.
	    word_pattern = re.compile(r'\b' + re.escape(normalized_search) + r'\b')

	    conn = sqlite3.connect('asawal_amqran.db')
	    try:
	        cursor = conn.cursor()
	        cursor.execute("SELECT * FROM lexie")
	        column_names = [desc[0] for desc in cursor.description]
	        all_rows = cursor.fetchall()
	    finally:
	        conn.close()

	    searched_indexes = [
	        idx for idx, name in enumerate(column_names) if name in search_columns
	    ]
	    word_id_idx = column_names.index('word_id') if 'word_id' in column_names else None

	    ranked_rows = []
	    seen_keys = set()
	    for row_number, row in enumerate(all_rows):
	        # De-duplicate on word_id when available, otherwise on row position.
	        row_key = row[word_id_idx] if word_id_idx is not None else ('row', row_number)
	        if row_key in seen_keys:
	            continue

	        best = None
	        for idx in searched_indexes:
	            cell_value = row[idx]
	            if not cell_value:
	                continue
	            priority = _match_priority(
	                normalize_text(str(cell_value), language),
	                normalized_search, word_pattern,
	                exact_match, word_match, contains, starts_with, ends_with,
	            )
	            if priority is not None and (best is None or priority < best):
	                best = priority
	                if best == 1:
	                    break  # cannot do better than an exact match

	        if best is not None:
	            seen_keys.add(row_key)
	            ranked_rows.append((best, row))

	    if not ranked_rows:
	        return "<p>No results found</p>"

	    # list.sort is stable, so rows of equal priority keep their table order.
	    ranked_rows.sort(key=lambda item: item[0])

	    cards = [_render_entry(dict(zip(column_names, row))) for _, row in ranked_rows]
	    return "<div style='font-family: Arial, sans-serif;'>" + "".join(cards) + "</div>"

	# Gradio interface: a narrow left column with the search controls and a wide
	# right column that displays the HTML produced by search_dictionary.
	with gr.Blocks(title="Dictionary Search") as demo:
	    gr.Markdown("# Dictionary Search")

	    with gr.Row():
	        # Left column: query box, language selector and match-mode toggles.
	        with gr.Column(scale=1):
	            search_input = gr.Textbox(
	                label="Search Term",
	                placeholder="Enter search term...",
	            )
	            search_button = gr.Button("Search")

	            gr.Markdown("### Language Options")
	            language = gr.Radio(
	                choices=["Amazigh", "Arabic", "French"],
	                label="Select Language",
	                value="Arabic",
	            )

	            gr.Markdown("### Search Options")
	            exact_match = gr.Checkbox(label="Exact Match (whole cell)", value=True)
	            word_match = gr.Checkbox(label="Exact Word Match (within cell)", value=True)
	            contains = gr.Checkbox(label="Contains", value=True)
	            starts_with = gr.Checkbox(label="Starts With", value=False)
	            ends_with = gr.Checkbox(label="Ends With", value=False)

	        # Right column: rendered search results.
	        with gr.Column(scale=3):
	            output = gr.HTML(label="Results")

	    # The component order here must match search_dictionary's parameter order.
	    search_params = [
	        search_input, language,
	        exact_match, word_match, contains, starts_with, ends_with,
	    ]

	    # Pressing Enter in the textbox and clicking the button run the same search.
	    for trigger in (search_input.submit, search_button.click):
	        trigger(search_dictionary, inputs=search_params, outputs=output)

	demo.launch()