Batnini committed on
Commit
035762a
·
verified ·
1 Parent(s): 01939f6

Update tools/quran_search.py

Browse files
Files changed (1) hide show
  1. tools/quran_search.py +50 -48
tools/quran_search.py CHANGED
@@ -3,9 +3,8 @@ import logging
3
  import numpy as np
4
  from sentence_transformers import SentenceTransformer
5
  from sklearn.metrics.pairwise import cosine_similarity
6
- from config import QURAN_DATA_SOURCES, MODEL_NAME, CHUNK_SIZE
7
  import time
8
- import sys
9
 
10
  class QuranSearchEngine:
11
  def __init__(self):
@@ -15,31 +14,36 @@ class QuranSearchEngine:
15
  self.all_verses = [] # List of {'surah_id': int, 'verse_num': int, 'text': str}
16
  self.verse_embeddings = None
17
  self.model = None
18
- print("Starting QuranSearchEngine initialization at", time.ctime(), file=sys.stderr) # Debug to stderr
19
- try:
20
- self._load_full_quran()
21
- print(f"Surahs loaded: {len(self.surahs) if self.surahs else 0}", file=sys.stderr) # Debug
22
- self._load_all_verses_and_embeddings()
23
- print(f"Verses loaded: {len(self.all_verses)}", file=sys.stderr) # Debug
24
- except Exception as e:
25
- self.logger.error(f"Initialization failed: {e}", exc_info=True)
26
- print(f"Initialization error: {e}", file=sys.stderr)
27
- self._load_fallback_data() # Ensure minimal startup
28
-
29
  def _load_full_quran(self):
30
- max_retries = 5 # Increased retries
31
  for attempt in range(max_retries):
32
  try:
33
- response = requests.get(f"{self.api_url}surah.json", timeout=15) # Increased timeout
34
  response.raise_for_status()
35
  self.surahs = response.json()
36
  for i, s in enumerate(self.surahs):
37
  s['id'] = i + 1
38
- self.all_verses = [] # Reset verses
39
- for surah_id in range(1, 115):
40
- surah_response = requests.get(f"{self.api_url}{surah_id}.json", timeout=15)
41
- surah_response.raise_for_status()
42
- data = surah_response.json()
 
 
 
 
 
 
 
 
 
 
43
  verses = data['arabic1']
44
  for verse_num, text in enumerate(verses, start=1):
45
  self.all_verses.append({
@@ -47,29 +51,21 @@ class QuranSearchEngine:
47
  'verse_num': verse_num,
48
  'text': text
49
  })
50
- break
51
- except Exception as e:
52
- self.logger.error(f"Attempt {attempt + 1}/{max_retries} failed to fetch Quran data: {e}")
53
- if attempt == max_retries - 1:
54
- self._load_fallback_data()
55
- time.sleep(2 ** attempt) # Exponential backoff
56
-
57
- def _load_fallback_data(self):
58
- self.logger.warning("Falling back to minimal data due to API failure")
59
- self.surahs = self._load_fallback_surahs()
60
- self.all_verses = [
61
- {'surah_id': 1, 'verse_num': 1, 'text': "بِسْمِ ٱللَّهِ ٱلرَّحْمَـٰنِ ٱلرَّحِيمِ"},
62
- {'surah_id': 1, 'verse_num': 2, 'text': "ٱلْحَمْدُ لِلَّهِ رَبِّ ٱلْعَٰلَمِينَ"}
63
- ]
64
 
 
 
 
 
 
 
65
  def _load_all_verses_and_embeddings(self):
66
  if not self.all_verses:
67
  return
68
 
69
  try:
70
- print("Attempting to load model...", file=sys.stderr) # Debug
71
  self.model = SentenceTransformer(MODEL_NAME)
72
- print("Model loaded successfully", file=sys.stderr) # Debug
73
  verse_texts = [v['text'] for v in self.all_verses]
74
  self.verse_embeddings = []
75
  for i in range(0, len(verse_texts), CHUNK_SIZE):
@@ -77,22 +73,23 @@ class QuranSearchEngine:
77
  embeddings = self.model.encode(chunk, convert_to_tensor=False)
78
  self.verse_embeddings.append(embeddings)
79
  self.verse_embeddings = np.vstack(self.verse_embeddings)
80
- print("Embeddings computed successfully", file=sys.stderr) # Debug
81
  except Exception as e:
82
- self.logger.error(f"Failed to compute embeddings: {e}", exc_info=True)
83
  self.verse_embeddings = None
84
- self.logger.warning("Falling back to keyword-based search due to embedding failure")
85
-
86
  def get_surahs(self):
87
  if self.surahs:
88
- return [(s['surahNameArabicLong'], s['id']) for s in self.surahs]
 
 
 
89
  return self._load_fallback_surahs()
90
-
91
  def get_surah_text(self, surah_id):
92
  max_retries = 3
93
  for attempt in range(max_retries):
94
  try:
95
- response = requests.get(f"{self.api_url}{surah_id}.json", timeout=15)
96
  response.raise_for_status()
97
  data = response.json()
98
  verses = data['arabic1']
@@ -102,13 +99,12 @@ class QuranSearchEngine:
102
  if attempt == max_retries - 1:
103
  return self._load_fallback_verse()
104
  time.sleep(2 ** attempt)
105
-
106
  def search_verses(self, query, top_k=5):
107
  if self.verse_embeddings is None or not self.all_verses:
108
  return self._keyword_fallback_search(query, top_k)
109
 
110
  try:
111
- print(f"Encoding query: {query}", file=sys.stderr) # Debug
112
  query_embedding = self.model.encode([query], convert_to_tensor=False)
113
  similarities = cosine_similarity(query_embedding, self.verse_embeddings)[0]
114
  top_indices = np.argsort(similarities)[-top_k:][::-1]
@@ -117,10 +113,12 @@ class QuranSearchEngine:
117
  for idx in top_indices:
118
  verse = self.all_verses[idx]
119
  surah_name = self.surahs[verse['surah_id'] - 1]['surahNameArabicLong']
120
- results.append(f"سورة {surah_name} - آية {verse['verse_num']}:\n{verse['text']}")
 
 
121
  return "\n\n".join(results)
122
  except Exception as e:
123
- self.logger.error(f"Search failed: {e}", exc_info=True)
124
  return "حدث خطأ أثناء البحث. جرب مرة أخرى."
125
 
126
  def _keyword_fallback_search(self, query, top_k=5):
@@ -133,7 +131,11 @@ class QuranSearchEngine:
133
  return "\n\n".join(matches[:top_k]) or "لا توجد نتائج مطابقة."
134
 
135
  def _load_fallback_surahs(self):
136
- return [("الفاتحة", 1), ("البقرة", 2), ("آل عمران", 3)]
 
 
 
 
137
 
138
  def _load_fallback_verse(self):
139
  return "بسم الله الرحمن الرحيم\nالله لا إله إلا هو الحي القيوم"
 
3
  import numpy as np
4
  from sentence_transformers import SentenceTransformer
5
  from sklearn.metrics.pairwise import cosine_similarity
6
+ from config import MODEL_NAME, CHUNK_SIZE
7
  import time
 
8
 
9
  class QuranSearchEngine:
10
  def __init__(self):
 
14
  self.all_verses = [] # List of {'surah_id': int, 'verse_num': int, 'text': str}
15
  self.verse_embeddings = None
16
  self.model = None
17
+ print("Starting QuranSearchEngine initialization...") # Debug
18
+ self._load_full_quran()
19
+ print(f"Surahs loaded: {len(self.surahs) if self.surahs else 0}") # Debug
20
+ self._load_all_verses_and_embeddings()
21
+ print(f"Verses loaded: {len(self.all_verses)}") # Debug
22
+
 
 
 
 
 
23
  def _load_full_quran(self):
24
+ max_retries = 3
25
  for attempt in range(max_retries):
26
  try:
27
+ response = requests.get(f"{self.api_url}surah.json", timeout=10)
28
  response.raise_for_status()
29
  self.surahs = response.json()
30
  for i, s in enumerate(self.surahs):
31
  s['id'] = i + 1
32
+ break
33
+ except Exception as e:
34
+ self.logger.error(f"Attempt {attempt + 1}/{max_retries} failed to fetch surahs: {e}")
35
+ if attempt == max_retries - 1:
36
+ self.surahs = self._load_fallback_surahs()
37
+ time.sleep(2 ** attempt)
38
+
39
+ # Load verses
40
+ if self.surahs:
41
+ for surah in self.surahs:
42
+ surah_id = surah['id']
43
+ try:
44
+ response = requests.get(f"{self.api_url}{surah_id}.json", timeout=10)
45
+ response.raise_for_status()
46
+ data = response.json()
47
  verses = data['arabic1']
48
  for verse_num, text in enumerate(verses, start=1):
49
  self.all_verses.append({
 
51
  'verse_num': verse_num,
52
  'text': text
53
  })
54
+ except Exception as e:
55
+ self.logger.error(f"Failed to fetch verses for surah {surah_id}: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
56
 
57
+ if not self.all_verses:
58
+ self.all_verses = [
59
+ {'surah_id': 1, 'verse_num': 1, 'text': "بِسْمِ ٱللَّهِ ٱلرَّحْمَـٰنِ ٱلرَّحِيمِ"},
60
+ # Add more if needed
61
+ ]
62
+
63
  def _load_all_verses_and_embeddings(self):
64
  if not self.all_verses:
65
  return
66
 
67
  try:
 
68
  self.model = SentenceTransformer(MODEL_NAME)
 
69
  verse_texts = [v['text'] for v in self.all_verses]
70
  self.verse_embeddings = []
71
  for i in range(0, len(verse_texts), CHUNK_SIZE):
 
73
  embeddings = self.model.encode(chunk, convert_to_tensor=False)
74
  self.verse_embeddings.append(embeddings)
75
  self.verse_embeddings = np.vstack(self.verse_embeddings)
 
76
  except Exception as e:
77
+ self.logger.error(f"Failed to compute embeddings: {e}")
78
  self.verse_embeddings = None
79
+
 
80
  def get_surahs(self):
81
  if self.surahs:
82
+ return [
83
+ (s['surahNameArabicLong'], s['id'])
84
+ for s in self.surahs
85
+ ]
86
  return self._load_fallback_surahs()
87
+
88
  def get_surah_text(self, surah_id):
89
  max_retries = 3
90
  for attempt in range(max_retries):
91
  try:
92
+ response = requests.get(f"{self.api_url}{surah_id}.json", timeout=10)
93
  response.raise_for_status()
94
  data = response.json()
95
  verses = data['arabic1']
 
99
  if attempt == max_retries - 1:
100
  return self._load_fallback_verse()
101
  time.sleep(2 ** attempt)
102
+
103
  def search_verses(self, query, top_k=5):
104
  if self.verse_embeddings is None or not self.all_verses:
105
  return self._keyword_fallback_search(query, top_k)
106
 
107
  try:
 
108
  query_embedding = self.model.encode([query], convert_to_tensor=False)
109
  similarities = cosine_similarity(query_embedding, self.verse_embeddings)[0]
110
  top_indices = np.argsort(similarities)[-top_k:][::-1]
 
113
  for idx in top_indices:
114
  verse = self.all_verses[idx]
115
  surah_name = self.surahs[verse['surah_id'] - 1]['surahNameArabicLong']
116
+ results.append(
117
+ f"سورة {surah_name} - آية {verse['verse_num']}:\n{verse['text']}"
118
+ )
119
  return "\n\n".join(results)
120
  except Exception as e:
121
+ self.logger.error(f"Search failed: {e}")
122
  return "حدث خطأ أثناء البحث. جرب مرة أخرى."
123
 
124
  def _keyword_fallback_search(self, query, top_k=5):
 
131
  return "\n\n".join(matches[:top_k]) or "لا توجد نتائج مطابقة."
132
 
133
  def _load_fallback_surahs(self):
134
+ return [
135
+ ("الفاتحة", 1),
136
+ ("البقرة", 2),
137
+ ("آل عمران", 3)
138
+ ]
139
 
140
  def _load_fallback_verse(self):
141
  return "بسم الله الرحمن الرحيم\nالله لا إله إلا هو الحي القيوم"