Rohil Bansal committed
Commit ee7ea09 · 1 Parent(s): dd0bddd

dataframe improved
course_search/app/gradio_app.py CHANGED
@@ -21,7 +21,7 @@ class CourseSearchApp:
         """Initialize RAG system and load data"""
         try:
             # Construct path to data file
-            data_path = Path(__file__).parent.parent.parent / 'data' / 'courses_with_embeddings.pkl'
+            data_path = Path(__file__).parent.parent.parent / 'data' / 'courses.pkl'
 
             if not data_path.exists():
                 raise FileNotFoundError(f"Data file not found at: {data_path}")
@@ -30,9 +30,23 @@ class CourseSearchApp:
             df = pd.read_pickle(str(data_path))
             logger.info(f"Loaded {len(df)} courses from {data_path}")
 
+            # Validate DataFrame
+            if len(df) == 0:
+                raise ValueError("Empty DataFrame loaded")
+
+            required_columns = ['title', 'description', 'curriculum', 'url']
+            missing_columns = [col for col in required_columns if col not in df.columns]
+            if missing_columns:
+                raise ValueError(f"Missing required columns: {missing_columns}")
+
             # Initialize RAG system
             self.rag_system = RAGSystem()
-            self.rag_system.load_and_process_data(df)
+
+            # Create cache directory
+            cache_dir = data_path.parent / 'cache'
+            cache_dir.mkdir(exist_ok=True)
+
+            self.rag_system.load_and_process_data(df, cache_dir=cache_dir)
             logger.info("Components loaded successfully")
 
         except Exception as e:
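The app now loads the raw `data/courses.pkl` (embeddings are computed and cached by `RAGSystem` rather than shipped inside the pickle) and validates the schema before wiring up the RAG system. As a reference for what the loader now expects, here is a minimal sketch of a fixture that satisfies the new checks; the helper name and sample rows are illustrative, not part of this commit:

```python
# Hypothetical test helper (not in this commit): builds a minimal
# data/courses.pkl containing the four columns the loader now requires.
from pathlib import Path
import pandas as pd

def make_courses_pickle(data_dir: Path) -> Path:
    df = pd.DataFrame({
        "title": ["Intro to Python", "Linear Algebra"],
        "description": ["Python basics for beginners.", "Vectors, matrices, and more."],
        "curriculum": ["Week 1: syntax; Week 2: functions", "Week 1: vectors; Week 2: matrices"],
        "url": ["https://example.com/python", "https://example.com/linalg"],
    })
    data_dir.mkdir(parents=True, exist_ok=True)
    path = data_dir / "courses.pkl"
    df.to_pickle(path)  # read back by pd.read_pickle in load_components
    return path
```

A pickle missing any of `title`, `description`, `curriculum`, or `url` now fails fast with a `ValueError` instead of surfacing later inside the RAG system.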
course_search/search_system/rag_system.py CHANGED
@@ -9,71 +9,90 @@ import logging
 from typing import List, Dict
 import os
 from dotenv import load_dotenv
+from pathlib import Path
+import numpy as np
+import faiss
+from sentence_transformers import SentenceTransformer
 
 logger = logging.getLogger(__name__)
 
 class RAGSystem:
     def __init__(self):
-        """Initialize the RAG system with LangChain components"""
-        load_dotenv()
-
-        # Initialize embedding model
-        self.embeddings = HuggingFaceEmbeddings(
-            model_name="sentence-transformers/all-MiniLM-L6-v2"
-        )
-
-        # Initialize text splitter for chunking
-        self.text_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=500,
-            chunk_overlap=50
-        )
-
-        self.vector_store = None
-        self.qa_chain = None
+        """Initialize the RAG system"""
+        try:
+            self.model = SentenceTransformer('all-MiniLM-L6-v2')
+            self.embeddings = None
+            self.index = None
+            self.df = None
+            logger.info("RAG system initialized successfully")
+        except Exception as e:
+            logger.error(f"Error initializing RAG system: {str(e)}")
+            raise
 
-    def load_and_process_data(self, df: pd.DataFrame) -> None:
-        """
-        Load course data and create vector store
-        """
+    def load_and_process_data(self, df: pd.DataFrame, cache_dir: Path = None):
+        """Load and process the course data with caching support"""
         try:
-            # Prepare documents from DataFrame
-            loader = DataFrameLoader(
-                data_frame=df,
-                page_content_column="description"
-            )
-            documents = loader.load()
-            for doc, row in zip(documents, df.itertuples()):
-                doc.metadata = {
-                    "title": row.title,
-                    "url": row.url,
-                    # Add other metadata fields as needed
-                }
-
-            # Split documents into chunks
-            splits = self.text_splitter.split_documents(documents)
-
-            # Create vector store
-            self.vector_store = FAISS.from_documents(
-                splits,
-                self.embeddings
-            )
+            # Validate input
+            if df is None or len(df) == 0:
+                raise ValueError("Empty or None DataFrame provided")
 
-            # Initialize QA chain
-            llm = HuggingFaceHub(
-                repo_id="google/flan-t5-small",
-                huggingfacehub_api_token=os.getenv('HUGGINGFACE_API_TOKEN')
-            )
+            required_columns = ['title', 'description', 'curriculum', 'url']
+            missing_columns = [col for col in required_columns if col not in df.columns]
+            if missing_columns:
+                raise ValueError(f"Missing required columns: {missing_columns}")
 
-            self.qa_chain = RetrievalQA.from_chain_type(
-                llm=llm,
-                chain_type="stuff",
-                retriever=self.vector_store.as_retriever()
+            self.df = df
+            vector_dimension = 384  # dimension for all-MiniLM-L6-v2
+
+            # Try loading from cache first
+            if cache_dir is not None:
+                cache_dir.mkdir(exist_ok=True)
+                embeddings_path = cache_dir / 'course_embeddings.npy'
+                index_path = cache_dir / 'faiss_index.bin'
+
+                if embeddings_path.exists() and index_path.exists():
+                    logger.info("Loading cached embeddings and index...")
+                    try:
+                        self.embeddings = np.load(str(embeddings_path))
+                        self.index = faiss.read_index(str(index_path))
+                        logger.info("Successfully loaded cached data")
+                        return
+                    except Exception as e:
+                        logger.warning(f"Failed to load cache: {e}. Computing new embeddings...")
+
+            # Compute new embeddings
+            logger.info("Computing course embeddings...")
+            texts = [
+                f"{row['title']}. {row['description']}"
+                for _, row in df.iterrows()
+            ]
+
+            if not texts:
+                raise ValueError("No texts to encode")
+
+            self.embeddings = self.model.encode(
+                texts,
+                show_progress_bar=True,
+                convert_to_numpy=True
             )
 
-            logger.info("RAG system initialized successfully")
+            if self.embeddings.size == 0:
+                raise ValueError("Failed to generate embeddings")
+
+            # Create and populate FAISS index
+            self.index = faiss.IndexFlatL2(vector_dimension)
+            self.index.add(self.embeddings.astype('float32'))
+
+            # Save to cache if directory provided
+            if cache_dir is not None:
+                logger.info("Saving embeddings and index to cache...")
+                np.save(str(embeddings_path), self.embeddings)
+                faiss.write_index(self.index, str(index_path))
+
+            logger.info(f"Successfully processed {len(df)} courses")
 
         except Exception as e:
-            logger.error(f"Error initializing RAG system: {str(e)}")
+            logger.error(f"Error processing data: {str(e)}")
             raise
 
     def search_courses(self, query: str, top_k: int = 5) -> List[Dict]:
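The hunk ends at the `search_courses` signature, so the committed body is not visible in this diff. Against the structures the commit sets up (`self.model`, `self.index`, `self.df`), a query method would typically look like the following sketch; the distance handling and result fields are assumptions, not the committed code:

```python
# Illustrative sketch only -- not the committed search_courses body.
# Assumes the attributes populated by load_and_process_data above
# and the module's existing List/Dict imports from typing.
def search_courses(self, query: str, top_k: int = 5) -> List[Dict]:
    if self.index is None or self.df is None:
        raise RuntimeError("Call load_and_process_data() before searching")

    # Embed the query with the same model used for the course texts.
    query_vector = self.model.encode([query], convert_to_numpy=True)

    # IndexFlatL2 returns squared L2 distances and row positions in self.df.
    distances, indices = self.index.search(query_vector.astype('float32'), top_k)

    results = []
    for dist, idx in zip(distances[0], indices[0]):
        if idx < 0:  # FAISS pads with -1 when fewer than top_k vectors exist
            continue
        row = self.df.iloc[int(idx)]
        results.append({
            'title': row['title'],
            'description': row['description'],
            'url': row['url'],
            'distance': float(dist),  # smaller distance = closer match
        })
    return results
```

Note that on a cache hit `load_and_process_data` returns before cross-checking the cached index against the current DataFrame, so the sketch assumes the cache and `self.df` were built from the same `courses.pkl`.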