Batnini committed on
Commit
38c6863
·
verified ·
1 Parent(s): a64035a

Update tools/quran_search.py

Browse files
Files changed (1) hide show
  1. tools/quran_search.py +71 -21
tools/quran_search.py CHANGED
@@ -2,33 +2,83 @@ import pandas as pd
2
  from sentence_transformers import SentenceTransformer
3
  from sklearn.metrics.pairwise import cosine_similarity
4
  import numpy as np
 
 
5
 
6
  class QuranSearchEngine:
7
  def __init__(self):
8
  self.data_loaded = False
9
-
 
 
 
10
  def load_data(self):
11
  if not self.data_loaded:
12
- self.quran_df = pd.read_csv("https://raw.githubusercontent.com/mafahim/quran-json/main/quran_clean.csv")
13
- self.model = SentenceTransformer(
14
- 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
15
- )
16
- self.verse_embeddings = self.model.encode(self.quran_df['text'].tolist())
17
- self.data_loaded = True
18
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  def search(self, query, top_k=5):
20
  self.load_data()
21
- query_embedding = self.model.encode([query])
22
- similarities = cosine_similarity(query_embedding, self.verse_embeddings)[0]
23
- top_indices = np.argsort(similarities)[-top_k:][::-1]
24
 
25
- results = []
26
- for idx in top_indices:
27
- verse = self.quran_df.iloc[idx]
28
- results.append({
29
- "surah": verse['surah'],
30
- "ayah": verse['ayah'],
31
- "text": verse['text'],
32
- "similarity": f"{similarities[idx]:.2f}"
33
- })
34
- return results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  from sentence_transformers import SentenceTransformer
3
  from sklearn.metrics.pairwise import cosine_similarity
4
  import numpy as np
5
+ import requests
6
+ from io import StringIO
7
 
8
  class QuranSearchEngine:
9
  def __init__(self):
10
  self.data_loaded = False
11
+ self.quran_df = None
12
+ self.model = None
13
+ self.verse_embeddings = None
14
+
15
  def load_data(self):
16
  if not self.data_loaded:
17
+ try:
18
+ # Load Quran data with error handling
19
+ url = "https://raw.githubusercontent.com/mafahim/quran-json/main/quran_clean.csv"
20
+ response = requests.get(url)
21
+ response.raise_for_status() # Raise error for bad status
22
+
23
+ # Use StringIO to read the CSV content
24
+ self.quran_df = pd.read_csv(StringIO(response.text))
25
+
26
+ # Verify required columns exist
27
+ if not all(col in self.quran_df.columns for col in ['surah', 'ayah', 'text']):
28
+ raise ValueError("CSV file doesn't contain required columns")
29
+
30
+ # Load model with error handling
31
+ self.model = SentenceTransformer(
32
+ 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2',
33
+ device='cpu'
34
+ )
35
+
36
+ # Encode verses
37
+ self.verse_embeddings = self.model.encode(
38
+ self.quran_df['text'].tolist(),
39
+ show_progress_bar=False
40
+ )
41
+
42
+ self.data_loaded = True
43
+
44
+ except Exception as e:
45
+ print(f"Error loading Quran data: {str(e)}")
46
+ # Create empty dataframe if loading fails
47
+ self.quran_df = pd.DataFrame(columns=['surah', 'ayah', 'text'])
48
+ self.verse_embeddings = np.array([])
49
+
50
  def search(self, query, top_k=5):
51
  self.load_data()
 
 
 
52
 
53
+ if self.quran_df.empty:
54
+ return [{
55
+ "surah": "Error",
56
+ "ayah": "1",
57
+ "text": "Failed to load Quran data. Please try again later.",
58
+ "similarity": "0.00"
59
+ }]
60
+
61
+ try:
62
+ query_embedding = self.model.encode([query])
63
+ similarities = cosine_similarity(query_embedding, self.verse_embeddings)[0]
64
+ top_indices = np.argsort(similarities)[-top_k:][::-1]
65
+
66
+ results = []
67
+ for idx in top_indices:
68
+ verse = self.quran_df.iloc[idx]
69
+ results.append({
70
+ "surah": verse['surah'],
71
+ "ayah": verse['ayah'],
72
+ "text": verse['text'],
73
+ "similarity": f"{similarities[idx]:.2f}"
74
+ })
75
+ return results
76
+
77
+ except Exception as e:
78
+ print(f"Search error: {str(e)}")
79
+ return [{
80
+ "surah": "Error",
81
+ "ayah": "1",
82
+ "text": "An error occurred during search. Please try a different query.",
83
+ "similarity": "0.00"
84
+ }]