Batnini committed on
Commit
a66426f
·
verified ·
1 Parent(s): 4d10dd4

Update tools/quran_search.py

Browse files
Files changed (1) hide show
  1. tools/quran_search.py +75 -59
tools/quran_search.py CHANGED
@@ -5,43 +5,55 @@ import numpy as np
5
  from sentence_transformers import SentenceTransformer
6
  from sklearn.metrics.pairwise import cosine_similarity
7
  from config import QURAN_DATA_SOURCES, MODEL_NAME, CHUNK_SIZE
 
8
 
9
  class QuranSearchEngine:
10
  def __init__(self):
11
- self.api_url = "https://quranapi.pages.dev/api/"
12
  self.logger = logging.getLogger(__name__)
 
13
  self.surahs = None
14
  self.all_verses = [] # List of {'surah_id': int, 'verse_num': int, 'text': str}
15
  self.verse_embeddings = None
16
  self.model = None
17
- self._load_surahs()
 
18
  self._load_all_verses_and_embeddings()
 
19
 
20
- def _load_surahs(self):
21
- try:
22
- response = requests.get(f"{self.api_url}surah.json", timeout=5)
23
- response.raise_for_status()
24
- self.surahs = response.json() # List of surah dicts
25
- except Exception as e:
26
- self.logger.error(f"Failed to fetch surahs: {e}")
27
- self.surahs = self._load_fallback_surahs()
 
 
 
 
 
 
28
 
29
  def _load_all_verses_and_embeddings(self):
30
- """Load all verses and precompute embeddings"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  try:
32
- for surah_id in range(1, 115): # 1 to 114
33
- response = requests.get(f"{self.api_url}{surah_id}.json", timeout=10)
34
- response.raise_for_status()
35
- data = response.json()
36
- verses = data.get('arabic1', []) # Arabic with tashkeel
37
- for verse_num, text in enumerate(verses, start=1):
38
- self.all_verses.append({
39
- 'surah_id': surah_id,
40
- 'verse_num': verse_num,
41
- 'text': text
42
- })
43
-
44
- # Precompute embeddings in chunks
45
  self.model = SentenceTransformer(MODEL_NAME)
46
  verse_texts = [v['text'] for v in self.all_verses]
47
  self.verse_embeddings = []
@@ -50,39 +62,44 @@ class QuranSearchEngine:
50
  embeddings = self.model.encode(chunk, convert_to_tensor=False)
51
  self.verse_embeddings.append(embeddings)
52
  self.verse_embeddings = np.vstack(self.verse_embeddings)
53
-
54
  except Exception as e:
55
- self.logger.error(f"Failed to load verses/embeddings: {e}")
56
- # Fallback: Use limited hardcoded data (no embeddings)
57
- self.all_verses = [
58
- {'surah_id': 1, 'verse_num': 1, 'text': "بِسْمِ ٱللَّهِ ٱلرَّحْمَـٰنِ ٱلرَّحِيمِ"},
59
- # Add more if needed, but limited
60
- ]
61
- self.verse_embeddings = None # Will use keyword fallback in search
62
 
63
  def get_surahs(self):
64
  if self.surahs:
65
  return [
66
- (f"{s['surahNameArabicLong']} ({s['surahNameTranslation']})", i + 1)
67
- for i, s in enumerate(self.surahs)
68
  ]
69
  return self._load_fallback_surahs()
70
 
71
  def get_surah_text(self, surah_id):
72
- try:
73
- response = requests.get(f"{self.api_url}{surah_id}.json", timeout=10)
74
- response.raise_for_status()
75
- data = response.json()
76
- verses = data['arabic1']
77
- return "\n\n".join(f"آية {i + 1}: {v}" for i, v in enumerate(verses))
78
- except Exception as e:
79
- self.logger.error(f"Failed to fetch surah {surah_id}: {e}")
80
- return self._load_fallback_verse()
81
-
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  def search_verses(self, query, top_k=5):
83
- """Semantic search for verses based on meaning"""
84
- if self.verse_embeddings is None:
85
- # Fallback to simple keyword search if embeddings failed
86
  return self._keyword_fallback_search(query, top_k)
87
 
88
  try:
@@ -93,9 +110,9 @@ class QuranSearchEngine:
93
  results = []
94
  for idx in top_indices:
95
  verse = self.all_verses[idx]
96
- surah_name = self.surahs[verse['surah_id'] - 1]['surahNameArabicLong']
97
  results.append(
98
- f"سورة {surah_name} - آية {verse['verse_num']}:\n{verse['text']}\n(تشابه: {similarities[idx]:.2f})"
99
  )
100
  return "\n\n".join(results)
101
  except Exception as e:
@@ -103,38 +120,37 @@ class QuranSearchEngine:
103
  return "حدث خطأ أثناء البحث. جرب مرة أخرى."
104
 
105
  def _keyword_fallback_search(self, query, top_k=5):
106
- """Simple keyword fallback if semantic fails"""
107
  query_lower = query.lower()
108
  matches = []
109
  for verse in self.all_verses:
110
  if query_lower in verse['text'].lower():
111
- surah_name = self.surahs[verse['surah_id'] - 1]['surahNameArabicLong'] if self.surahs else f"سورة {verse['surah_id']}"
112
  matches.append(f"سورة {surah_name} - آية {verse['verse_num']}:\n{verse['text']}")
113
  return "\n\n".join(matches[:top_k]) or "لا توجد نتائج مطابقة."
114
 
115
- # Existing fallback methods (unchanged)
116
  def _load_fallback_surahs(self):
117
  try:
118
  for source in QURAN_DATA_SOURCES:
119
  try:
120
  df = pd.read_csv(source)
121
  return [
122
- (f"{row['name_arabic']} ({row['name_english']})", row['surah_id'])
123
  for _, row in df.drop_duplicates(subset=['surah_id']).iterrows()
124
  ]
125
  except:
126
  continue
127
  return [
128
- ("سورة الفاتحة (The Opening)", 1),
129
- ("سورة البقرة (The Cow)", 2),
130
- ("سورة آل عمران (The Family of Imran)", 3)
131
  ]
132
  except Exception as e:
133
  self.logger.error(f"Failed to load fallback surahs: {e}")
134
  return [
135
- ("سورة الفاتحة (The Opening)", 1),
136
- ("سورة البقرة (The Cow)", 2),
137
- ("سورة آل عمران (The Family of Imran)", 3)
138
  ]
139
 
140
  def _load_fallback_verse(self):
 
5
  from sentence_transformers import SentenceTransformer
6
  from sklearn.metrics.pairwise import cosine_similarity
7
  from config import QURAN_DATA_SOURCES, MODEL_NAME, CHUNK_SIZE
8
+ import time
9
 
10
  class QuranSearchEngine:
11
  def __init__(self):
12
+ self.full_quran_url = "https://cdn.jsdelivr.net/npm/[email protected]/dist/quran.json"
13
  self.logger = logging.getLogger(__name__)
14
+ self.full_quran = None
15
  self.surahs = None
16
  self.all_verses = [] # List of {'surah_id': int, 'verse_num': int, 'text': str}
17
  self.verse_embeddings = None
18
  self.model = None
19
+ self._load_full_quran()
20
+ print(f"Surahs loaded: {len(self.surahs) if self.surahs else 0}") # Debug
21
  self._load_all_verses_and_embeddings()
22
+ print(f"Verses loaded: {len(self.all_verses)}") # Debug
23
 
24
+ def _load_full_quran(self):
25
+ max_retries = 3
26
+ for attempt in range(max_retries):
27
+ try:
28
+ response = requests.get(self.full_quran_url, timeout=10)
29
+ response.raise_for_status()
30
+ self.full_quran = response.json() # Array of surah dicts
31
+ self.surahs = self.full_quran
32
+ break
33
+ except Exception as e:
34
+ self.logger.error(f"Attempt {attempt + 1}/{max_retries} failed to fetch full Quran: {e}")
35
+ if attempt == max_retries - 1:
36
+ self.surahs = self._load_fallback_surahs()
37
+ time.sleep(2 ** attempt)
38
 
39
  def _load_all_verses_and_embeddings(self):
40
+ if not self.full_quran:
41
+ self.logger.error("No full Quran loaded, skipping verse loading")
42
+ self.all_verses = [
43
+ {'surah_id': 1, 'verse_num': 1, 'text': "بِسْمِ ٱللَّهِ ٱلرَّحْمَـٰنِ ٱلرَّحِيمِ"},
44
+ ]
45
+ return
46
+
47
+ for surah in self.full_quran:
48
+ surah_id = surah['id']
49
+ for verse in surah['verses']:
50
+ self.all_verses.append({
51
+ 'surah_id': surah_id,
52
+ 'verse_num': verse['id'],
53
+ 'text': verse['text']
54
+ })
55
+
56
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  self.model = SentenceTransformer(MODEL_NAME)
58
  verse_texts = [v['text'] for v in self.all_verses]
59
  self.verse_embeddings = []
 
62
  embeddings = self.model.encode(chunk, convert_to_tensor=False)
63
  self.verse_embeddings.append(embeddings)
64
  self.verse_embeddings = np.vstack(self.verse_embeddings)
 
65
  except Exception as e:
66
+ self.logger.error(f"Failed to compute embeddings: {e}")
67
+ self.verse_embeddings = None
 
 
 
 
 
68
 
69
  def get_surahs(self):
70
  if self.surahs:
71
  return [
72
+ (s['name'], s['id'])
73
+ for s in self.surahs
74
  ]
75
  return self._load_fallback_surahs()
76
 
77
  def get_surah_text(self, surah_id):
78
+ if self.full_quran:
79
+ try:
80
+ surah = self.full_quran[surah_id - 1]
81
+ verses = surah['verses']
82
+ return "\n\n".join(f"آية {v['id']}: {v['text']}" for v in verses)
83
+ except IndexError:
84
+ self.logger.error(f"Surah {surah_id} not found in cached data")
85
+
86
+ # Fallback if cache failed
87
+ max_retries = 3
88
+ for attempt in range(max_retries):
89
+ try:
90
+ response = requests.get(f"https://quranapi.pages.dev/api/{surah_id}.json", timeout=10)
91
+ response.raise_for_status()
92
+ data = response.json()
93
+ verses = data['arabic1']
94
+ return "\n\n".join(f"آية {i + 1}: {v}" for i, v in enumerate(verses))
95
+ except Exception as e:
96
+ self.logger.error(f"Attempt {attempt + 1}/{max_retries} failed to fetch surah {surah_id}: {e}")
97
+ if attempt == max_retries - 1:
98
+ return self._load_fallback_verse()
99
+ time.sleep(2 ** attempt)
100
+
101
  def search_verses(self, query, top_k=5):
102
+ if self.verse_embeddings is None or not self.all_verses:
 
 
103
  return self._keyword_fallback_search(query, top_k)
104
 
105
  try:
 
110
  results = []
111
  for idx in top_indices:
112
  verse = self.all_verses[idx]
113
+ surah_name = self.surahs[verse['surah_id'] - 1]['name']
114
  results.append(
115
+ f"سورة {surah_name} - آية {verse['verse_num']}:\n{verse['text']}"
116
  )
117
  return "\n\n".join(results)
118
  except Exception as e:
 
120
  return "حدث خطأ أثناء البحث. جرب مرة أخرى."
121
 
122
  def _keyword_fallback_search(self, query, top_k=5):
 
123
  query_lower = query.lower()
124
  matches = []
125
  for verse in self.all_verses:
126
  if query_lower in verse['text'].lower():
127
+ surah_name = self.surahs[verse['surah_id'] - 1]['name'] if self.surahs else f"سورة {verse['surah_id']}"
128
  matches.append(f"سورة {surah_name} - آية {verse['verse_num']}:\n{verse['text']}")
129
  return "\n\n".join(matches[:top_k]) or "لا توجد نتائج مطابقة."
130
 
131
+ # Fallback methods (used when the remote Quran data cannot be fetched)
132
  def _load_fallback_surahs(self):
133
  try:
134
  for source in QURAN_DATA_SOURCES:
135
  try:
136
  df = pd.read_csv(source)
137
  return [
138
+ (row['name_arabic'], row['surah_id'])
139
  for _, row in df.drop_duplicates(subset=['surah_id']).iterrows()
140
  ]
141
  except:
142
  continue
143
  return [
144
+ ("الفاتحة", 1),
145
+ ("البقرة", 2),
146
+ ("آل عمران", 3)
147
  ]
148
  except Exception as e:
149
  self.logger.error(f"Failed to load fallback surahs: {e}")
150
  return [
151
+ ("الفاتحة", 1),
152
+ ("البقرة", 2),
153
+ ("آل عمران", 3)
154
  ]
155
 
156
  def _load_fallback_verse(self):