Create app.py
app.py
ADDED
@@ -0,0 +1,218 @@
import gradio as gr
import sqlite3
import html
import re
import unicodedata
from typing import List, Dict

def normalize_text(text: str, language: str) -> str:
    """Normalize text based on language rules."""
    if not text:
        return text

    # Convert to lowercase and normalize Unicode
    text = text.lower()
    text = unicodedata.normalize('NFKD', text)

    if language == "Arabic":
        # Normalize Arabic alifs and remove diacritics
        text = re.sub(r'[إأآا]', 'ا', text)  # Normalize alifs
        text = re.sub(r'[ىي]', 'ي', text)  # Normalize ya
        text = re.sub(r'[ةه]', 'ه', text)  # Normalize ta marbuta and ha
        # Remove Arabic diacritics (fatha, kasra, damma, etc.)
        text = re.sub(r'[\u064B-\u065F\u0670]', '', text)

    elif language == "French":
        # Remove French diacritics by replacing accented characters with base characters
        text = ''.join(c for c in unicodedata.normalize('NFD', text)
                       if not unicodedata.combining(c))

    elif language == "Amazigh":
        # Normalize Amazigh characters
        text = text.replace('ⵕ', 'ⵔ')  # Treat ⵕ the same as ⵔ
        text = text.replace('ⵯ', '')   # Drop the labialization mark ⵯ so it is not required for matching

    return text

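# Illustrative examples of the folding above (sample values, not taken from the database):
#   normalize_text("Café", "French")  -> "cafe"   (accents stripped)
#   normalize_text("أُسْتَاذ", "Arabic") -> "استاذ"  (alif unified, diacritics removed)
#   For Amazigh, "ⵕ" folds to "ⵔ" and the labialization mark "ⵯ" is dropped.
# The same function is applied to the query and to every searched cell, so matching
# is accent- and diacritic-insensitive in all three languages.
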
def search_dictionary(search_term: str,
                      language: str,
                      exact_match: bool,
                      word_match: bool,
                      contains: bool,
                      starts_with: bool,
                      ends_with: bool) -> str:
    if not search_term or search_term.isspace():
        return "<p>Please enter a search term</p>"

    conn = sqlite3.connect('asawal_amqran.db')
    cursor = conn.cursor()

    # Normalize search term according to language-specific rules
    normalized_search = normalize_text(search_term, language)

    search_columns = {
        "Amazigh": ["word", "latin", "construct", "plural", "acc", "accneg", "inacc",
                    "variante", "feminine", "fem_construct", "fem_plural",
                    "fem_plural_construct", "exp_zgh"],
        "Arabic": ["arabic", "exp_ara", "mean_ar"],
        "French": ["french", "exp_fra"]
    }.get(language, [])

    if not search_columns:
        return "<p>Please select a language</p>"

    if not any([exact_match, word_match, contains, starts_with, ends_with]):
        return "<p>Please select at least one search option</p>"

    priority_results = []
    seen_word_ids = set()

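    # Assumed schema note: the scan below expects a `lexie` table whose columns include
    # word_id, source, category, the Amazigh fields (word, latin, construct, plural, acc,
    # accneg, inacc, variante, feminine, fem_construct, fem_plural, fem_plural_construct)
    # and the translation/expression fields (arabic, mean_ar, french, exp_zgh, exp_ara,
    # exp_fra). Column types are not assumed; every value is rendered as text.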
    # Get all data and filter in Python to handle normalization properly
    cursor.execute("SELECT * FROM lexie")
    column_names = [desc[0] for desc in cursor.description]
    word_id_idx = column_names.index('word_id') if 'word_id' in column_names else -1
    all_rows = cursor.fetchall()

    for row in all_rows:
        if word_id_idx == -1 or row[word_id_idx] in seen_word_ids:
            continue

        # Check each relevant column with normalization
        for column_idx, column_name in enumerate(column_names):
            if column_name not in search_columns:
                continue

            cell_value = row[column_idx]
            if not cell_value:
                continue

            # Normalize the cell value according to language rules
            normalized_cell = normalize_text(str(cell_value), language)

            # Priority 1: Exact Match
            if exact_match and normalized_cell == normalized_search:
                seen_word_ids.add(row[word_id_idx])
                priority_results.append((1, row))
                break

            # Priority 2: Word Match
            elif word_match and (normalized_cell == normalized_search or
                                 re.search(r'\b' + re.escape(normalized_search) + r'\b', normalized_cell)):
                seen_word_ids.add(row[word_id_idx])
                priority_results.append((2, row))
                break

            # Priority 3: Contains
            elif contains and normalized_search in normalized_cell:
                seen_word_ids.add(row[word_id_idx])
                priority_results.append((3, row))
                break

            # Priority 4: Starts With
            elif starts_with and normalized_cell.startswith(normalized_search):
                seen_word_ids.add(row[word_id_idx])
                priority_results.append((4, row))
                break

            # Priority 5: Ends With
            elif ends_with and normalized_cell.endswith(normalized_search):
                seen_word_ids.add(row[word_id_idx])
                priority_results.append((5, row))
                break

    conn.close()

    if not priority_results:
        return "<p>No results found</p>"

    # Sort by priority
    priority_results.sort(key=lambda x: x[0])
    results = [row for priority, row in priority_results]
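    # Note: list.sort() is stable, so rows that share a priority keep the order
    # in which they appear in the lexie table.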

    # Format results as HTML
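    # Every database value is passed through html.escape below, so any markup stored
    # in the data is shown literally rather than being rendered by the gr.HTML output.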
    html_output = "<div style='font-family: Arial, sans-serif;'>"
    if column_names:
        for result in results:
            result_dict = dict(zip(column_names, result))

            html_output += "<div style='border: 1px solid #ccc; margin: 10px; padding: 15px; position: relative;'>"

            if 'source' in result_dict and result_dict['source']:
                html_output += f"<div style='text-align: center; font-style: italic;'>{html.escape(str(result_dict['source']))}</div>"
            if 'category' in result_dict and result_dict['category']:
                html_output += f"<div style='position: absolute; top: 10px; right: 10px; font-weight: bold;'>{html.escape(str(result_dict['category']))}</div>"

            html_output += "<h3>Word</h3><ul>"
            for field, label in [
                ('word', 'Word'), ('latin', 'Latin'), ('construct', 'Construct'),
                ('plural', 'Plural'), ('acc', 'Accusative'), ('accneg', 'Negative Accusative'),
                ('inacc', 'Inaccusative'), ('variante', 'Variant'), ('feminine', 'Feminine'),
                ('fem_construct', 'Feminine Construct'), ('fem_plural', 'Feminine Plural'),
                ('fem_plural_construct', 'Feminine Plural Construct')
            ]:
                if field in result_dict and result_dict[field]:
                    html_output += f"<li><strong>{label}:</strong> {html.escape(str(result_dict[field]))}</li>"
            html_output += "</ul>"

            html_output += "<h3>Translations</h3><ul>"
            if 'french' in result_dict and result_dict['french']:
                html_output += f"<li><strong>French:</strong> {html.escape(str(result_dict['french']))}</li>"
            if 'arabic' in result_dict and result_dict['arabic']:
                html_output += f"<li><strong>Arabic:</strong> {html.escape(str(result_dict['arabic']))}</li>"
            if 'mean_ar' in result_dict and result_dict['mean_ar']:
                html_output += f"<li><strong>Arabic Meaning:</strong> {html.escape(str(result_dict['mean_ar']))}</li>"
            html_output += "</ul>"

            html_output += "<h3>Expressions</h3><ul>"
            for field, label in [
                ('exp_zgh', 'Amazigh Expression'), ('exp_fra', 'French Expression'),
                ('exp_ara', 'Arabic Expression')
            ]:
                if field in result_dict and result_dict[field]:
                    html_output += f"<li><strong>{label}:</strong> {html.escape(str(result_dict[field]))}</li>"
            html_output += "</ul>"

            html_output += "</div>"
    else:
        # Append so the opening wrapper <div> above stays balanced with the closing tag below
        html_output += "<p>No data found</p>"
    html_output += "</div>"
    return html_output

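# Quick manual check from a Python shell (hypothetical search term; requires
# asawal_amqran.db next to this file):
#   print(search_dictionary("azul", "Amazigh", True, True, True, False, False))
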
# Gradio interface
with gr.Blocks(title="Dictionary Search") as demo:
    gr.Markdown("# Dictionary Search")

    with gr.Row():
        with gr.Column(scale=1):
            search_input = gr.Textbox(label="Search Term", placeholder="Enter search term...")
            search_button = gr.Button("Search")

            gr.Markdown("### Language Options")
            language = gr.Radio(
                choices=["Amazigh", "Arabic", "French"],
                label="Select Language",
                value="Arabic"
            )

            gr.Markdown("### Search Options")
            exact_match = gr.Checkbox(label="Exact Match (whole cell)", value=True)
            word_match = gr.Checkbox(label="Exact Word Match (within cell)", value=True)
            contains = gr.Checkbox(label="Contains", value=True)
            starts_with = gr.Checkbox(label="Starts With", value=False)
            ends_with = gr.Checkbox(label="Ends With", value=False)

        with gr.Column(scale=3):
            output = gr.HTML(label="Results")

    search_params = [search_input, language, exact_match, word_match, contains, starts_with, ends_with]
    search_input.submit(
        search_dictionary,
        inputs=search_params,
        outputs=output
    )
    search_button.click(
        search_dictionary,
        inputs=search_params,
        outputs=output
    )

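# demo.launch() starts a local Gradio server with default settings; options such as
# server_name, server_port, or share=True are available but not used here.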
demo.launch()