Batnini committed on
Commit
ed37d44
·
verified ·
1 Parent(s): de24a99

Update tools/quran_search.py

Browse files
Files changed (1) hide show
  1. tools/quran_search.py +92 -24
tools/quran_search.py CHANGED
@@ -1,60 +1,129 @@
1
  import requests
2
  import logging
3
  import pandas as pd
4
- from config import QURAN_DATA_SOURCES
 
 
 
5
 
6
  class QuranSearchEngine:
7
  def __init__(self):
8
- self.api_url = "https://quranapi.pages.dev/api/" # Correct base URL (no trailing slash needed, but consistent)
9
  self.logger = logging.getLogger(__name__)
 
 
 
 
 
 
10
 
11
- def get_surahs(self):
12
- """Get ALL 114 surahs - guaranteed working"""
13
  try:
14
  response = requests.get(f"{self.api_url}surah.json", timeout=5)
15
  response.raise_for_status()
16
- surahs = response.json() # Array of surah objects
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  return [
18
  (f"{s['surahNameArabicLong']} ({s['surahNameTranslation']})", i + 1)
19
- for i, s in enumerate(surahs)
20
  ]
21
- except Exception as e:
22
- self.logger.error(f"Failed to fetch surahs: {e}")
23
- # Fallback to local Quran data
24
- return self._load_fallback_surahs()
25
 
26
  def get_surah_text(self, surah_id):
27
- """Get FULL surah text - tested working"""
28
  try:
29
- response = requests.get(
30
- f"{self.api_url}{surah_id}.json",
31
- timeout=10
32
- )
33
  response.raise_for_status()
34
  data = response.json()
35
- verses = data['arabic1'] # Arabic with tashkeel (Uthmani-like)
36
- return "\n\n".join(
37
- f"آية {i + 1}: {v}"
38
- for i, v in enumerate(verses)
39
- )
40
  except Exception as e:
41
  self.logger.error(f"Failed to fetch surah {surah_id}: {e}")
42
  return self._load_fallback_verse()
43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  def _load_fallback_surahs(self):
45
- """Load surah list from local CSV fallback"""
46
  try:
47
  for source in QURAN_DATA_SOURCES:
48
  try:
49
  df = pd.read_csv(source)
50
- # Assuming CSV has columns: surah_id, name_arabic, name_english (adjust if needed)
51
  return [
52
  (f"{row['name_arabic']} ({row['name_english']})", row['surah_id'])
53
  for _, row in df.drop_duplicates(subset=['surah_id']).iterrows()
54
  ]
55
  except:
56
  continue
57
- # Hardcoded fallback if all sources fail
58
  return [
59
  ("سورة الفاتحة (The Opening)", 1),
60
  ("سورة البقرة (The Cow)", 2),
@@ -69,5 +138,4 @@ class QuranSearchEngine:
69
  ]
70
 
71
  def _load_fallback_verse(self):
72
- """Load a fallback verse text"""
73
  return "بسم الله الرحمن الرحيم\nالله لا إله إلا هو الحي القيوم"
 
1
  import requests
2
  import logging
3
  import pandas as pd
4
+ import numpy as np
5
+ from sentence_transformers import SentenceTransformer
6
+ from sklearn.metrics.pairwise import cosine_similarity
7
+ from config import QURAN_DATA_SOURCES, MODEL_NAME, CHUNK_SIZE
8
 
9
  class QuranSearchEngine:
10
  def __init__(self):
11
+ self.api_url = "https://quranapi.pages.dev/api/"
12
  self.logger = logging.getLogger(__name__)
13
+ self.surahs = None
14
+ self.all_verses = [] # List of {'surah_id': int, 'verse_num': int, 'text': str}
15
+ self.verse_embeddings = None
16
+ self.model = None
17
+ self._load_surahs()
18
+ self._load_all_verses_and_embeddings()
19
 
20
+ def _load_surahs(self):
 
21
  try:
22
  response = requests.get(f"{self.api_url}surah.json", timeout=5)
23
  response.raise_for_status()
24
+ self.surahs = response.json() # List of surah dicts
25
+ except Exception as e:
26
+ self.logger.error(f"Failed to fetch surahs: {e}")
27
+ self.surahs = self._load_fallback_surahs()
28
+
29
+ def _load_all_verses_and_embeddings(self):
30
+ """Load all verses and precompute embeddings"""
31
+ try:
32
+ for surah_id in range(1, 115): # 1 to 114
33
+ response = requests.get(f"{self.api_url}{surah_id}.json", timeout=10)
34
+ response.raise_for_status()
35
+ data = response.json()
36
+ verses = data.get('arabic1', []) # Arabic with tashkeel
37
+ for verse_num, text in enumerate(verses, start=1):
38
+ self.all_verses.append({
39
+ 'surah_id': surah_id,
40
+ 'verse_num': verse_num,
41
+ 'text': text
42
+ })
43
+
44
+ # Precompute embeddings in chunks
45
+ self.model = SentenceTransformer(MODEL_NAME)
46
+ verse_texts = [v['text'] for v in self.all_verses]
47
+ self.verse_embeddings = []
48
+ for i in range(0, len(verse_texts), CHUNK_SIZE):
49
+ chunk = verse_texts[i:i + CHUNK_SIZE]
50
+ embeddings = self.model.encode(chunk, convert_to_tensor=False)
51
+ self.verse_embeddings.append(embeddings)
52
+ self.verse_embeddings = np.vstack(self.verse_embeddings)
53
+
54
+ except Exception as e:
55
+ self.logger.error(f"Failed to load verses/embeddings: {e}")
56
+ # Fallback: Use limited hardcoded data (no embeddings)
57
+ self.all_verses = [
58
+ {'surah_id': 1, 'verse_num': 1, 'text': "بِسْمِ ٱللَّهِ ٱلرَّحْمَـٰنِ ٱلرَّحِيمِ"},
59
+ # Add more if needed, but limited
60
+ ]
61
+ self.verse_embeddings = None # Will use keyword fallback in search
62
+
63
+ def get_surahs(self):
64
+ if self.surahs:
65
  return [
66
  (f"{s['surahNameArabicLong']} ({s['surahNameTranslation']})", i + 1)
67
+ for i, s in enumerate(self.surahs)
68
  ]
69
+ return self._load_fallback_surahs()
 
 
 
70
 
71
  def get_surah_text(self, surah_id):
 
72
  try:
73
+ response = requests.get(f"{self.api_url}{surah_id}.json", timeout=10)
 
 
 
74
  response.raise_for_status()
75
  data = response.json()
76
+ verses = data['arabic1']
77
+ return "\n\n".join(f"آية {i + 1}: {v}" for i, v in enumerate(verses))
 
 
 
78
  except Exception as e:
79
  self.logger.error(f"Failed to fetch surah {surah_id}: {e}")
80
  return self._load_fallback_verse()
81
 
82
+ def search_verses(self, query, top_k=5):
83
+ """Semantic search for verses based on meaning"""
84
+ if self.verse_embeddings is None:
85
+ # Fallback to simple keyword search if embeddings failed
86
+ return self._keyword_fallback_search(query, top_k)
87
+
88
+ try:
89
+ query_embedding = self.model.encode([query], convert_to_tensor=False)
90
+ similarities = cosine_similarity(query_embedding, self.verse_embeddings)[0]
91
+ top_indices = np.argsort(similarities)[-top_k:][::-1]
92
+
93
+ results = []
94
+ for idx in top_indices:
95
+ verse = self.all_verses[idx]
96
+ surah_name = self.surahs[verse['surah_id'] - 1]['surahNameArabicLong']
97
+ results.append(
98
+ f"سورة {surah_name} - آية {verse['verse_num']}:\n{verse['text']}\n(تشابه: {similarities[idx]:.2f})"
99
+ )
100
+ return "\n\n".join(results)
101
+ except Exception as e:
102
+ self.logger.error(f"Search failed: {e}")
103
+ return "حدث خطأ أثناء البحث. جرب مرة أخرى."
104
+
105
+ def _keyword_fallback_search(self, query, top_k=5):
106
+ """Simple keyword fallback if semantic fails"""
107
+ query_lower = query.lower()
108
+ matches = []
109
+ for verse in self.all_verses:
110
+ if query_lower in verse['text'].lower():
111
+ surah_name = self.surahs[verse['surah_id'] - 1]['surahNameArabicLong'] if self.surahs else f"سورة {verse['surah_id']}"
112
+ matches.append(f"سورة {surah_name} - آية {verse['verse_num']}:\n{verse['text']}")
113
+ return "\n\n".join(matches[:top_k]) or "لا توجد نتائج مطابقة."
114
+
115
+ # Existing fallback methods (unchanged)
116
  def _load_fallback_surahs(self):
 
117
  try:
118
  for source in QURAN_DATA_SOURCES:
119
  try:
120
  df = pd.read_csv(source)
 
121
  return [
122
  (f"{row['name_arabic']} ({row['name_english']})", row['surah_id'])
123
  for _, row in df.drop_duplicates(subset=['surah_id']).iterrows()
124
  ]
125
  except:
126
  continue
 
127
  return [
128
  ("سورة الفاتحة (The Opening)", 1),
129
  ("سورة البقرة (The Cow)", 2),
 
138
  ]
139
 
140
  def _load_fallback_verse(self):
 
141
  return "بسم الله الرحمن الرحيم\nالله لا إله إلا هو الحي القيوم"