mohbay committed on
Commit
6dce45a
·
verified ·
1 Parent(s): 50ad04d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -2
app.py CHANGED
@@ -25,12 +25,24 @@ df2_links = df2["link"].values
25
  df3_questions = df3["question"].values
26
  df3_links = df3["url"].values
27
 
 
 
 
 
 
 
 
 
 
28
  def arabic_word_tokenize(text):
29
  if not isinstance(text, str):
30
  return []
31
- # Remove diacritics for better matching
32
  text = re.sub(r'[\u064B-\u065F\u0670]', '', text)
33
- return re.findall(r'[\u0600-\u06FF\w]+', text.lower())
 
 
 
34
 
35
  def compute_word_overlap(query, questions):
36
  query_words = set(arabic_word_tokenize(query))
 
25
  df3_questions = df3["question"].values
26
  df3_links = df3["url"].values
27
 
28
+
29
+ import re
30
+
31
+ ARABIC_STOPWORDS = {
32
+ 'في', 'من', 'إلى', 'عن', 'مع', 'هذا', 'هذه', 'ذلك', 'تلك',
33
+ 'التي', 'الذي', 'ما', 'لا', 'أن', 'أو', 'لكن', 'قد', 'حكم', 'قال',
34
+ 'كان', 'كانت', 'يكون', 'تكون', 'له', 'لها', 'لهم', 'و', 'أم', 'إن'
35
+ }
36
+
37
  def arabic_word_tokenize(text):
38
  if not isinstance(text, str):
39
  return []
40
+ # Remove diacritics
41
  text = re.sub(r'[\u064B-\u065F\u0670]', '', text)
42
+ # Extract only Arabic words (length ≥ 2)
43
+ tokens = re.findall(r'[\u0600-\u06FF]{2,}', text)
44
+ return [t for t in tokens if t not in ARABIC_STOPWORDS]
45
+
46
 
47
  def compute_word_overlap(query, questions):
48
  query_words = set(arabic_word_tokenize(query))