Batnini committed on
Commit
75db848
·
verified ·
1 Parent(s): 38c6863

Update tools/quran_search.py

Browse files
Files changed (1) hide show
  1. tools/quran_search.py +54 -44
tools/quran_search.py CHANGED
@@ -4,60 +4,70 @@ from sklearn.metrics.pairwise import cosine_similarity
4
  import numpy as np
5
  import requests
6
  from io import StringIO
 
7
 
8
  class QuranSearchEngine:
9
  def __init__(self):
10
  self.data_loaded = False
11
- self.quran_df = None
12
  self.model = None
13
- self.verse_embeddings = None
14
-
 
 
 
 
 
15
  def load_data(self):
16
  if not self.data_loaded:
17
  try:
18
- # Load Quran data with error handling
19
- url = "https://raw.githubusercontent.com/mafahim/quran-json/main/quran_clean.csv"
20
- response = requests.get(url)
21
- response.raise_for_status() # Raise error for bad status
22
-
23
- # Use StringIO to read the CSV content
24
- self.quran_df = pd.read_csv(StringIO(response.text))
25
-
26
- # Verify required columns exist
27
- if not all(col in self.quran_df.columns for col in ['surah', 'ayah', 'text']):
28
- raise ValueError("CSV file doesn't contain required columns")
29
-
30
- # Load model with error handling
31
- self.model = SentenceTransformer(
32
- 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2',
33
- device='cpu'
34
- )
35
-
36
- # Encode verses
37
- self.verse_embeddings = self.model.encode(
38
- self.quran_df['text'].tolist(),
39
- show_progress_bar=False
40
- )
 
 
 
 
 
 
 
 
 
41
 
42
- self.data_loaded = True
43
 
44
  except Exception as e:
45
- print(f"Error loading Quran data: {str(e)}")
46
- # Create empty dataframe if loading fails
47
- self.quran_df = pd.DataFrame(columns=['surah', 'ayah', 'text'])
48
- self.verse_embeddings = np.array([])
 
 
 
49
 
50
  def search(self, query, top_k=5):
51
  self.load_data()
52
 
53
- if self.quran_df.empty:
54
- return [{
55
- "surah": "Error",
56
- "ayah": "1",
57
- "text": "Failed to load Quran data. Please try again later.",
58
- "similarity": "0.00"
59
- }]
60
-
61
  try:
62
  query_embedding = self.model.encode([query])
63
  similarities = cosine_similarity(query_embedding, self.verse_embeddings)[0]
@@ -67,18 +77,18 @@ class QuranSearchEngine:
67
  for idx in top_indices:
68
  verse = self.quran_df.iloc[idx]
69
  results.append({
70
- "surah": verse['surah'],
71
- "ayah": verse['ayah'],
72
- "text": verse['text'],
73
  "similarity": f"{similarities[idx]:.2f}"
74
  })
75
  return results
76
 
77
  except Exception as e:
78
- print(f"Search error: {str(e)}")
79
  return [{
80
  "surah": "Error",
81
  "ayah": "1",
82
- "text": "An error occurred during search. Please try a different query.",
83
  "similarity": "0.00"
84
  }]
 
4
  import numpy as np
5
  import requests
6
  from io import StringIO
7
+ import logging
8
 
9
  class QuranSearchEngine:
10
  def __init__(self):
11
  self.data_loaded = False
12
+ self.quran_df = pd.DataFrame(columns=['surah', 'ayah', 'text'])
13
  self.model = None
14
+ self.verse_embeddings = np.array([])
15
+ self.alternative_urls = [
16
+ "https://cdn.jsdelivr.net/gh/mafahim/quran-json/quran_clean.csv",
17
+ "https://raw.githubusercontent.com/mafahim/quran-json/main/quran_clean.csv",
18
+ "https://gitlab.com/mafahim/quran-json/-/raw/main/quran_clean.csv"
19
+ ]
20
+
21
  def load_data(self):
22
  if not self.data_loaded:
23
  try:
24
+ # Try multiple data sources
25
+ for url in self.alternative_urls:
26
+ try:
27
+ response = requests.get(url, timeout=10)
28
+ response.raise_for_status()
29
+ self.quran_df = pd.read_csv(StringIO(response.text))
30
+
31
+ if not all(col in self.quran_df.columns for col in ['surah', 'ayah', 'text']):
32
+ raise ValueError("Missing required columns")
33
+
34
+ # Load model with smaller chunk size for low-memory environments
35
+ self.model = SentenceTransformer(
36
+ 'paraphrase-multilingual-MiniLM-L12-v2',
37
+ device='cpu'
38
+ )
39
+
40
+ # Process in chunks to avoid memory issues
41
+ texts = self.quran_df['text'].tolist()
42
+ chunk_size = 50
43
+ embeddings = []
44
+ for i in range(0, len(texts), chunk_size):
45
+ chunk = texts[i:i + chunk_size]
46
+ embeddings.append(self.model.encode(chunk))
47
+ self.verse_embeddings = np.concatenate(embeddings)
48
+
49
+ self.data_loaded = True
50
+ logging.info("Quran data loaded successfully")
51
+ return
52
+
53
+ except Exception as e:
54
+ logging.warning(f"Failed to load from {url}: {str(e)}")
55
+ continue
56
 
57
+ raise Exception("All data sources failed")
58
 
59
  except Exception as e:
60
+ logging.error(f"Critical error loading Quran data: {str(e)}")
61
+ self.quran_df = pd.DataFrame({
62
+ 'surah': ['Error'],
63
+ 'ayah': ['1'],
64
+ 'text': ['Failed to load Quran data. Please try again later.']
65
+ })
66
+ self.verse_embeddings = np.array([[0]]) # Dummy embedding
67
 
68
  def search(self, query, top_k=5):
69
  self.load_data()
70
 
 
 
 
 
 
 
 
 
71
  try:
72
  query_embedding = self.model.encode([query])
73
  similarities = cosine_similarity(query_embedding, self.verse_embeddings)[0]
 
77
  for idx in top_indices:
78
  verse = self.quran_df.iloc[idx]
79
  results.append({
80
+ "surah": str(verse['surah']),
81
+ "ayah": str(verse['ayah']),
82
+ "text": str(verse['text']),
83
  "similarity": f"{similarities[idx]:.2f}"
84
  })
85
  return results
86
 
87
  except Exception as e:
88
+ logging.error(f"Search error: {str(e)}")
89
  return [{
90
  "surah": "Error",
91
  "ayah": "1",
92
+ "text": "An error occurred during search. Please try again.",
93
  "similarity": "0.00"
94
  }]