Batnini committed on
Commit
c3539cc
·
verified ·
1 Parent(s): 906a6e5

Update tools/quran_search.py

Browse files
Files changed (1) hide show
  1. tools/quran_search.py +56 -37
tools/quran_search.py CHANGED
@@ -5,6 +5,7 @@ from sentence_transformers import SentenceTransformer
5
  from sklearn.metrics.pairwise import cosine_similarity
6
  from config import MODEL_NAME, CHUNK_SIZE
7
  import time
 
8
 
9
  class QuranSearchEngine:
10
  def __init__(self):
@@ -13,18 +14,23 @@ class QuranSearchEngine:
13
  self.surahs = None
14
  self.all_verses = [] # List of {'surah_id': int, 'verse_num': int, 'text': str}
15
  self.verse_embeddings = None
16
- self.model = None
17
- print("Starting QuranSearchEngine initialization...") # Debug
18
- self._load_full_quran()
19
- print(f"Surahs loaded: {len(self.surahs) if self.surahs else 0}") # Debug
20
- self._load_all_verses_and_embeddings()
21
- print(f"Verses loaded: {len(self.all_verses)}") # Debug
22
-
 
 
 
 
 
23
  def _load_full_quran(self):
24
- max_retries = 3
25
  for attempt in range(max_retries):
26
  try:
27
- response = requests.get(f"{self.api_url}surah.json", timeout=10)
28
  response.raise_for_status()
29
  self.surahs = response.json()
30
  for i, s in enumerate(self.surahs):
@@ -33,15 +39,14 @@ class QuranSearchEngine:
33
  except Exception as e:
34
  self.logger.error(f"Attempt {attempt + 1}/{max_retries} failed to fetch surahs: {e}")
35
  if attempt == max_retries - 1:
36
- self.surahs = self._load_fallback_surahs()
37
  time.sleep(2 ** attempt)
38
-
39
- # Load verses
40
  if self.surahs:
41
  for surah in self.surahs:
42
  surah_id = surah['id']
43
  try:
44
- response = requests.get(f"{self.api_url}{surah_id}.json", timeout=10)
45
  response.raise_for_status()
46
  data = response.json()
47
  verses = data['arabic1']
@@ -54,18 +59,22 @@ class QuranSearchEngine:
54
  except Exception as e:
55
  self.logger.error(f"Failed to fetch verses for surah {surah_id}: {e}")
56
 
57
- if not self.all_verses:
58
- self.all_verses = [
59
- {'surah_id': 1, 'verse_num': 1, 'text': "بِسْمِ ٱللَّهِ ٱلرَّحْمَـٰنِ ٱلرَّحِيمِ"},
60
- # Add more if needed
61
- ]
62
-
 
 
63
  def _load_all_verses_and_embeddings(self):
64
  if not self.all_verses:
65
  return
66
 
67
  try:
 
68
  self.model = SentenceTransformer(MODEL_NAME)
 
69
  verse_texts = [v['text'] for v in self.all_verses]
70
  self.verse_embeddings = []
71
  for i in range(0, len(verse_texts), CHUNK_SIZE):
@@ -73,23 +82,22 @@ class QuranSearchEngine:
73
  embeddings = self.model.encode(chunk, convert_to_tensor=False)
74
  self.verse_embeddings.append(embeddings)
75
  self.verse_embeddings = np.vstack(self.verse_embeddings)
 
76
  except Exception as e:
77
- self.logger.error(f"Failed to compute embeddings: {e}")
78
  self.verse_embeddings = None
79
-
 
80
  def get_surahs(self):
81
  if self.surahs:
82
- return [
83
- (s['surahNameArabicLong'], s['id'])
84
- for s in self.surahs
85
- ]
86
  return self._load_fallback_surahs()
87
-
88
  def get_surah_text(self, surah_id):
89
  max_retries = 3
90
  for attempt in range(max_retries):
91
  try:
92
- response = requests.get(f"{self.api_url}{surah_id}.json", timeout=10)
93
  response.raise_for_status()
94
  data = response.json()
95
  verses = data['arabic1']
@@ -99,12 +107,29 @@ class QuranSearchEngine:
99
  if attempt == max_retries - 1:
100
  return self._load_fallback_verse()
101
  time.sleep(2 ** attempt)
102
-
103
  def search_verses(self, query, top_k=5):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  if self.verse_embeddings is None or not self.all_verses:
105
  return self._keyword_fallback_search(query, top_k)
106
 
107
  try:
 
108
  query_embedding = self.model.encode([query], convert_to_tensor=False)
109
  similarities = cosine_similarity(query_embedding, self.verse_embeddings)[0]
110
  top_indices = np.argsort(similarities)[-top_k:][::-1]
@@ -113,12 +138,10 @@ class QuranSearchEngine:
113
  for idx in top_indices:
114
  verse = self.all_verses[idx]
115
  surah_name = self.surahs[verse['surah_id'] - 1]['surahNameArabicLong']
116
- results.append(
117
- f"سورة {surah_name} - آية {verse['verse_num']}:\n{verse['text']}"
118
- )
119
  return "\n\n".join(results)
120
  except Exception as e:
121
- self.logger.error(f"Search failed: {e}")
122
  return "حدث خطأ أثناء البحث. جرب مرة أخرى."
123
 
124
  def _keyword_fallback_search(self, query, top_k=5):
@@ -131,11 +154,7 @@ class QuranSearchEngine:
131
  return "\n\n".join(matches[:top_k]) or "لا توجد نتائج مطابقة."
132
 
133
  def _load_fallback_surahs(self):
134
- return [
135
- ("الفاتحة", 1),
136
- ("البقرة", 2),
137
- ("آل عمران", 3)
138
- ]
139
 
140
  def _load_fallback_verse(self):
141
  return "بسم الله الرحمن الرحيم\nالله لا إله إلا هو الحي القيوم"
 
5
  from sklearn.metrics.pairwise import cosine_similarity
6
  from config import MODEL_NAME, CHUNK_SIZE
7
  import time
8
+ import sys
9
 
10
  class QuranSearchEngine:
11
  def __init__(self):
 
14
  self.surahs = None
15
  self.all_verses = [] # List of {'surah_id': int, 'verse_num': int, 'text': str}
16
  self.verse_embeddings = None
17
+ self.model = None # Deferred loading
18
+ print("Starting QuranSearchEngine initialization at", time.ctime(), file=sys.stderr) # Debug
19
+ try:
20
+ self._load_full_quran()
21
+ print(f"Surahs loaded: {len(self.surahs) if self.surahs else 0}", file=sys.stderr) # Debug
22
+ self._load_all_verses_and_embeddings()
23
+ print(f"Verses loaded: {len(self.all_verses)}", file=sys.stderr) # Debug
24
+ except Exception as e:
25
+ self.logger.error(f"Initialization failed: {e}", exc_info=True)
26
+ print(f"Initialization error: {e}", file=sys.stderr)
27
+ self._load_fallback_data() # Ensure minimal startup
28
+
29
  def _load_full_quran(self):
30
+ max_retries = 5 # Increased retries
31
  for attempt in range(max_retries):
32
  try:
33
+ response = requests.get(f"{self.api_url}surah.json", timeout=15) # Increased timeout
34
  response.raise_for_status()
35
  self.surahs = response.json()
36
  for i, s in enumerate(self.surahs):
 
39
  except Exception as e:
40
  self.logger.error(f"Attempt {attempt + 1}/{max_retries} failed to fetch surahs: {e}")
41
  if attempt == max_retries - 1:
42
+ self._load_fallback_data()
43
  time.sleep(2 ** attempt)
44
+
 
45
  if self.surahs:
46
  for surah in self.surahs:
47
  surah_id = surah['id']
48
  try:
49
+ response = requests.get(f"{self.api_url}{surah_id}.json", timeout=15)
50
  response.raise_for_status()
51
  data = response.json()
52
  verses = data['arabic1']
 
59
  except Exception as e:
60
  self.logger.error(f"Failed to fetch verses for surah {surah_id}: {e}")
61
 
62
+ def _load_fallback_data(self):
63
+ self.logger.warning("Falling back to minimal data due to API failure")
64
+ self.surahs = self._load_fallback_surahs()
65
+ self.all_verses = [
66
+ {'surah_id': 1, 'verse_num': 1, 'text': "بِسْمِ ٱللَّهِ ٱلرَّحْمَـٰنِ ٱلرَّحِيمِ"},
67
+ {'surah_id': 1, 'verse_num': 2, 'text': "ٱلْحَمْدُ لِلَّهِ رَبِّ ٱلْعَٰلَمِينَ"}
68
+ ]
69
+
70
  def _load_all_verses_and_embeddings(self):
71
  if not self.all_verses:
72
  return
73
 
74
  try:
75
+ print("Attempting to load model...", file=sys.stderr) # Debug
76
  self.model = SentenceTransformer(MODEL_NAME)
77
+ print("Model loaded successfully", file=sys.stderr) # Debug
78
  verse_texts = [v['text'] for v in self.all_verses]
79
  self.verse_embeddings = []
80
  for i in range(0, len(verse_texts), CHUNK_SIZE):
 
82
  embeddings = self.model.encode(chunk, convert_to_tensor=False)
83
  self.verse_embeddings.append(embeddings)
84
  self.verse_embeddings = np.vstack(self.verse_embeddings)
85
+ print("Embeddings computed successfully", file=sys.stderr) # Debug
86
  except Exception as e:
87
+ self.logger.error(f"Failed to compute embeddings: {e}", exc_info=True)
88
  self.verse_embeddings = None
89
+ self.logger.warning("Falling back to keyword-based search due to embedding failure")
90
+
91
  def get_surahs(self):
92
  if self.surahs:
93
+ return [(s['surahNameArabicLong'], s['id']) for s in self.surahs]
 
 
 
94
  return self._load_fallback_surahs()
95
+
96
  def get_surah_text(self, surah_id):
97
  max_retries = 3
98
  for attempt in range(max_retries):
99
  try:
100
+ response = requests.get(f"{self.api_url}{surah_id}.json", timeout=15)
101
  response.raise_for_status()
102
  data = response.json()
103
  verses = data['arabic1']
 
107
  if attempt == max_retries - 1:
108
  return self._load_fallback_verse()
109
  time.sleep(2 ** attempt)
110
+
111
  def search_verses(self, query, top_k=5):
112
+ if self.model is None:
113
+ try:
114
+ print("Loading model on demand...", file=sys.stderr)
115
+ self.model = SentenceTransformer(MODEL_NAME)
116
+ print("Model loaded successfully", file=sys.stderr)
117
+ verse_texts = [v['text'] for v in self.all_verses]
118
+ self.verse_embeddings = []
119
+ for i in range(0, len(verse_texts), CHUNK_SIZE):
120
+ chunk = verse_texts[i:i + CHUNK_SIZE]
121
+ embeddings = self.model.encode(chunk, convert_to_tensor=False)
122
+ self.verse_embeddings.append(embeddings)
123
+ self.verse_embeddings = np.vstack(self.verse_embeddings)
124
+ except Exception as e:
125
+ self.logger.error(f"Failed to load model on demand: {e}", exc_info=True)
126
+ self.verse_embeddings = None
127
+
128
  if self.verse_embeddings is None or not self.all_verses:
129
  return self._keyword_fallback_search(query, top_k)
130
 
131
  try:
132
+ print(f"Encoding query: {query}", file=sys.stderr) # Debug
133
  query_embedding = self.model.encode([query], convert_to_tensor=False)
134
  similarities = cosine_similarity(query_embedding, self.verse_embeddings)[0]
135
  top_indices = np.argsort(similarities)[-top_k:][::-1]
 
138
  for idx in top_indices:
139
  verse = self.all_verses[idx]
140
  surah_name = self.surahs[verse['surah_id'] - 1]['surahNameArabicLong']
141
+ results.append(f"سورة {surah_name} - آية {verse['verse_num']}:\n{verse['text']}")
 
 
142
  return "\n\n".join(results)
143
  except Exception as e:
144
+ self.logger.error(f"Search failed: {e}", exc_info=True)
145
  return "حدث خطأ أثناء البحث. جرب مرة أخرى."
146
 
147
  def _keyword_fallback_search(self, query, top_k=5):
 
154
  return "\n\n".join(matches[:top_k]) or "لا توجد نتائج مطابقة."
155
 
156
  def _load_fallback_surahs(self):
157
+ return [("الفاتحة", 1), ("البقرة", 2), ("آل عمران", 3)]
 
 
 
 
158
 
159
  def _load_fallback_verse(self):
160
  return "بسم الله الرحمن الرحيم\nالله لا إله إلا هو الحي القيوم"