Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -25,12 +25,24 @@ df2_links = df2["link"].values
|
|
25 |
df3_questions = df3["question"].values
|
26 |
df3_links = df3["url"].values
|
27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
def arabic_word_tokenize(text):
|
29 |
if not isinstance(text, str):
|
30 |
return []
|
31 |
-
# Remove diacritics
|
32 |
text = re.sub(r'[\u064B-\u065F\u0670]', '', text)
|
33 |
-
|
|
|
|
|
|
|
34 |
|
35 |
def compute_word_overlap(query, questions):
|
36 |
query_words = set(arabic_word_tokenize(query))
|
|
|
25 |
df3_questions = df3["question"].values
|
26 |
df3_links = df3["url"].values
|
27 |
|
28 |
+
|
29 |
+
import re
|
30 |
+
|
31 |
+
ARABIC_STOPWORDS = {
|
32 |
+
'ูู', 'ู
ู', 'ุฅูู', 'ุนู', 'ู
ุน', 'ูุฐุง', 'ูุฐู', 'ุฐูู', 'ุชูู',
|
33 |
+
'ุงูุชู', 'ุงูุฐู', 'ู
ุง', 'ูุง', 'ุฃู', 'ุฃู', 'ููู', 'ูุฏ', 'ุญูู
', 'ูุงู',
|
34 |
+
'ูุงู', 'ูุงูุช', 'ูููู', 'ุชููู', 'ูู', 'ููุง', 'ููู
', 'ู', 'ุฃู
', 'ุฅู'
|
35 |
+
}
|
36 |
+
|
37 |
def arabic_word_tokenize(text):
    """Split *text* into Arabic word tokens, dropping stopwords.

    Args:
        text: The string to tokenize. Any non-``str`` value is accepted
            and produces an empty result.

    Returns:
        A list of tokens in order of appearance. Each token is a run of
        two or more Arabic letters that is not a member of the
        module-level ``ARABIC_STOPWORDS`` collection. Returns ``[]`` for
        non-string input or when no such run exists.
    """
    if not isinstance(text, str):
        return []
    # Remove diacritics (U+064B-U+065F, U+0670) and the tatweel/kashida
    # elongation mark (U+0640) so that decorated and plain spellings of the
    # same word produce the same token.
    text = re.sub(r'[\u0640\u064B-\u065F\u0670]', '', text)
    # Extract only Arabic words (length >= 2). The class is limited to
    # letters: the full U+0600-U+06FF block also contains punctuation
    # (U+060C, U+061B, U+061F, U+06D4) and Arabic-Indic digits, which would
    # otherwise stick to neighbouring words (e.g. a trailing question mark)
    # and prevent them from matching the same word elsewhere.
    tokens = re.findall(
        r'[\u0621-\u063A\u0641-\u064A\u066E-\u06D3\u06D5]{2,}', text
    )
    return [t for t in tokens if t not in ARABIC_STOPWORDS]
|
45 |
+
|
46 |
|
47 |
def compute_word_overlap(query, questions):
|
48 |
query_words = set(arabic_word_tokenize(query))
|