Spaces:

Batnini
/

radius

Paused

App Files Community

Batnini committed 16 days ago

Commit

75db848

verified ·

1 Parent(s): 38c6863

Update tools/quran_search.py

Browse files

Files changed (1) hide show

tools/quran_search.py +54 -44

tools/quran_search.py CHANGED Viewed

@@ -4,60 +4,70 @@ from sklearn.metrics.pairwise import cosine_similarity
 import numpy as np
 import requests
 from io import StringIO
 class QuranSearchEngine:
     def __init__(self):
         self.data_loaded = False
-        self.quran_df = None
         self.model = None
-        self.verse_embeddings = None
     def load_data(self):
         if not self.data_loaded:
             try:
-                # Load Quran data with error handling
-                url = "https://raw.githubusercontent.com/mafahim/quran-json/main/quran_clean.csv"
-                response = requests.get(url)
-                response.raise_for_status()  # Raise error for bad status
-                # Use StringIO to read the CSV content
-                self.quran_df = pd.read_csv(StringIO(response.text))
-                # Verify required columns exist
-                if not all(col in self.quran_df.columns for col in ['surah', 'ayah', 'text']):
-                    raise ValueError("CSV file doesn't contain required columns")
-                # Load model with error handling
-                self.model = SentenceTransformer(
-                    'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2',
-                    device='cpu'
-                )
-                # Encode verses
-                self.verse_embeddings = self.model.encode(
-                    self.quran_df['text'].tolist(),
-                    show_progress_bar=False
-                )
-                self.data_loaded = True
             except Exception as e:
-                print(f"Error loading Quran data: {str(e)}")
-                # Create empty dataframe if loading fails
-                self.quran_df = pd.DataFrame(columns=['surah', 'ayah', 'text'])
-                self.verse_embeddings = np.array([])
     def search(self, query, top_k=5):
         self.load_data()
-        if self.quran_df.empty:
-            return [{
-                "surah": "Error",
-                "ayah": "1",
-                "text": "Failed to load Quran data. Please try again later.",
-                "similarity": "0.00"
-            }]
         try:
             query_embedding = self.model.encode([query])
             similarities = cosine_similarity(query_embedding, self.verse_embeddings)[0]
@@ -67,18 +77,18 @@ class QuranSearchEngine:
             for idx in top_indices:
                 verse = self.quran_df.iloc[idx]
                 results.append({
-                    "surah": verse['surah'],
-                    "ayah": verse['ayah'],
-                    "text": verse['text'],
                     "similarity": f"{similarities[idx]:.2f}"
                 })
             return results
         except Exception as e:
-            print(f"Search error: {str(e)}")
             return [{
                 "surah": "Error",
                 "ayah": "1",
-                "text": "An error occurred during search. Please try a different query.",
                 "similarity": "0.00"
             }]

 import numpy as np
 import requests
 from io import StringIO
+import logging
 class QuranSearchEngine:
     def __init__(self):
         self.data_loaded = False
+        self.quran_df = pd.DataFrame(columns=['surah', 'ayah', 'text'])
         self.model = None
+        self.verse_embeddings = np.array([])
+        self.alternative_urls = [
+            "https://cdn.jsdelivr.net/gh/mafahim/quran-json/quran_clean.csv",
+            "https://raw.githubusercontent.com/mafahim/quran-json/main/quran_clean.csv",
+            "https://gitlab.com/mafahim/quran-json/-/raw/main/quran_clean.csv"
+        ]
     def load_data(self):
         if not self.data_loaded:
             try:
+                # Try multiple data sources
+                for url in self.alternative_urls:
+                    try:
+                        response = requests.get(url, timeout=10)
+                        response.raise_for_status()
+                        self.quran_df = pd.read_csv(StringIO(response.text))
+                        if not all(col in self.quran_df.columns for col in ['surah', 'ayah', 'text']):
+                            raise ValueError("Missing required columns")
+                        # Load the embedding model on CPU (memory use is limited by the chunked encoding below)
+                        self.model = SentenceTransformer(
+                            'paraphrase-multilingual-MiniLM-L12-v2',
+                            device='cpu'
+                        )
+                        # Process in chunks to avoid memory issues
+                        texts = self.quran_df['text'].tolist()
+                        chunk_size = 50
+                        embeddings = []
+                        for i in range(0, len(texts), chunk_size):
+                            chunk = texts[i:i + chunk_size]
+                            embeddings.append(self.model.encode(chunk))
+                        self.verse_embeddings = np.concatenate(embeddings)
+                        self.data_loaded = True
+                        logging.info("Quran data loaded successfully")
+                        return
+                    except Exception as e:
+                        logging.warning(f"Failed to load from {url}: {str(e)}")
+                        continue
+                raise Exception("All data sources failed")
             except Exception as e:
+                logging.error(f"Critical error loading Quran data: {str(e)}")
+                self.quran_df = pd.DataFrame({
+                    'surah': ['Error'],
+                    'ayah': ['1'],
+                    'text': ['Failed to load Quran data. Please try again later.']
+                })
+                self.verse_embeddings = np.array([[0]])  # Dummy embedding
     def search(self, query, top_k=5):
         self.load_data()
         try:
             query_embedding = self.model.encode([query])
             similarities = cosine_similarity(query_embedding, self.verse_embeddings)[0]
             for idx in top_indices:
                 verse = self.quran_df.iloc[idx]
                 results.append({
+                    "surah": str(verse['surah']),
+                    "ayah": str(verse['ayah']),
+                    "text": str(verse['text']),
                     "similarity": f"{similarities[idx]:.2f}"
                 })
             return results
         except Exception as e:
+            logging.error(f"Search error: {str(e)}")
             return [{
                 "surah": "Error",
                 "ayah": "1",
+                "text": "An error occurred during search. Please try again.",
                 "similarity": "0.00"
             }]