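"""Semantic course search: embeds course titles and descriptions with
sentence-transformers, indexes them in FAISS, and re-ranks hits with a
hybrid embedding / word-overlap relevance score."""
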
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
import faiss
import logging
from typing import Dict, Optional
from pathlib import Path

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class RAGSystem:
    def __init__(self):
        """Initialize the RAG system"""
        try:
            self.model = SentenceTransformer('all-MiniLM-L6-v2')
            self.embeddings = None
            self.index = None
            self.df = None
            logger.info("RAG system initialized successfully")
        except Exception as e:
            logger.error(f"Error initializing RAG system: {str(e)}")
            raise

    def load_and_process_data(self, df: pd.DataFrame, cache_dir: Optional[Path] = None):
        """Load and process the course data with caching support"""
        try:
            # Validate input
            if df is None or len(df) == 0:
                raise ValueError("Empty or None DataFrame provided")
            
            required_columns = ['title', 'description', 'curriculum', 'url']
            missing_columns = [col for col in required_columns if col not in df.columns]
            if missing_columns:
                raise ValueError(f"Missing required columns: {missing_columns}")
            
            self.df = df
            vector_dimension = 384  # dimension for all-MiniLM-L6-v2
            
            # Try loading from cache first
            if cache_dir is not None:
                cache_dir.mkdir(parents=True, exist_ok=True)
                embeddings_path = cache_dir / 'course_embeddings.npy'
                index_path = cache_dir / 'faiss_index.bin'
                
                if embeddings_path.exists() and index_path.exists():
                    logger.info("Loading cached embeddings and index...")
                    try:
                        self.embeddings = np.load(str(embeddings_path))
                        self.index = faiss.read_index(str(index_path))
                        logger.info("Successfully loaded cached data")
                        return
                    except Exception as e:
                        logger.warning(f"Failed to load cache: {e}. Computing new embeddings...")
            
            # Compute new embeddings
            logger.info("Computing course embeddings...")
            texts = [
                f"{row['title']}. {row['description']}"
                for _, row in df.iterrows()
            ]
            
            if not texts:
                raise ValueError("No texts to encode")
            
            self.embeddings = self.model.encode(
                texts,
                show_progress_bar=True,
                convert_to_numpy=True
            )
            
            if self.embeddings.size == 0:
                raise ValueError("Failed to generate embeddings")
            
            # Create and populate FAISS index
            self.index = faiss.IndexFlatL2(vector_dimension)
            self.index.add(self.embeddings.astype('float32'))
            
            # Save to cache if directory provided
            if cache_dir is not None:
                logger.info("Saving embeddings and index to cache...")
                np.save(str(embeddings_path), self.embeddings)
                faiss.write_index(self.index, str(index_path))
            
            logger.info(f"Successfully processed {len(df)} courses")
            
        except Exception as e:
            logger.error(f"Error processing data: {str(e)}")
            raise

    def search_courses(self, query: str, top_k: int = 5) -> Dict:
        """Search for courses using semantic search with improved ranking"""
        try:
            # Ensure the FAISS index is initialized
            if self.index is None:
                raise ValueError("FAISS index not initialized. Please load data first.")
            
            # Get query embedding
            query_embedding = self.model.encode([query], convert_to_numpy=True)
            
            # Retrieve extra candidates for re-ranking; cap the request at the
            # index size so FAISS does not pad the result with -1 indices
            k = min(top_k * 2, self.index.ntotal)
            D, I = self.index.search(query_embedding.reshape(1, -1), k)
            distances = D[0]
            indices = I[0]
            
            # Get results with additional metadata
            results = []
            for dist, idx in zip(distances, indices):
                if idx < 0:
                    continue  # skip any padding entries returned by FAISS
                course = self.df.iloc[idx].to_dict()
                
                # Calculate relevance score components
                title_similarity = self.calculate_text_similarity(query, course['title'])
                desc_similarity = self.calculate_text_similarity(query, course['description'])
                
                # Map the unbounded L2 distance into (0, 1] before mixing it
                # with the bounded word-overlap similarities; the naive
                # 1 - dist goes negative for distances above 1
                embedding_similarity = 1.0 / (1.0 + float(dist))
                
                # Combine scores with weights
                final_score = (
                    0.4 * embedding_similarity +
                    0.4 * title_similarity +
                    0.2 * desc_similarity
                )
                
                results.append({
                    **course,
                    'relevance_score': final_score
                })
            
            # Sort by final relevance score and take top_k
            results.sort(key=lambda x: x['relevance_score'], reverse=True)
            results = results[:top_k]
            
            return {
                'query': query,
                'results': results
            }
        
        except Exception as e:
            logger.error(f"Error in search_courses: {str(e)}")
            raise

    def calculate_text_similarity(self, text1: str, text2: str) -> float:
        """
        Calculate text similarity between two strings using word overlap
        
        Args:
            text1 (str): First text string
            text2 (str): Second text string
            
        Returns:
            float: Similarity score between 0 and 1
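
        Example:
            "python course" vs. "python basics" share 1 of 3 unique
            words, so the Jaccard similarity is 1/3 ≈ 0.33.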
        """
        try:
            # Convert to lowercase and split into words
            text1 = str(text1).lower()
            text2 = str(text2).lower()
            
            words1 = set(text1.split())
            words2 = set(text2.split())
            
            if not words1 or not words2:
                return 0.0
            
            # Jaccard similarity: |intersection| / |union|; union is nonzero
            # here because words1 and words2 are both nonempty
            intersection = len(words1.intersection(words2))
            union = len(words1.union(words2))
            
            return intersection / union
            
        except Exception as e:
            logger.error(f"Error calculating text similarity: {str(e)}")
            return 0.0
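

# Minimal usage sketch (illustrative): the DataFrame below is a hypothetical
# stand-in for a real course catalogue; it only needs the 'title',
# 'description', 'curriculum', and 'url' columns that
# load_and_process_data validates.
if __name__ == "__main__":
    courses = pd.DataFrame({
        'title': ['Intro to Python', 'Deep Learning Basics'],
        'description': ['Learn Python from scratch.',
                        'Neural networks with hands-on examples.'],
        'curriculum': ['Variables, loops, functions',
                       'Perceptrons, backprop, CNNs'],
        'url': ['https://example.com/python',
                'https://example.com/deep-learning'],
    })

    rag = RAGSystem()
    rag.load_and_process_data(courses, cache_dir=Path('.cache'))

    response = rag.search_courses("beginner neural networks", top_k=2)
    for hit in response['results']:
        print(f"{hit['relevance_score']:.3f}  {hit['title']}  {hit['url']}")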