import os
import re
import logging
import nltk
from io import BytesIO
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import PyPDF2
import docx2txt
from functools import lru_cache
logger = logging.getLogger(__name__)
# Try to import sentence-transformers
try:
    from sentence_transformers import SentenceTransformer
    HAVE_TRANSFORMERS = True
except ImportError:
    HAVE_TRANSFORMERS = False

# Try to download NLTK data if not already present
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    try:
        nltk.download('punkt', quiet=True)
    except Exception:
        pass

# Build the stopword set, falling back to a small built-in list if the NLTK data is unavailable
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    try:
        nltk.download('stopwords', quiet=True)
    except Exception:
        pass
try:
    from nltk.corpus import stopwords
    STOPWORDS = set(stopwords.words('english'))
except Exception:
    STOPWORDS = set(['the', 'and', 'a', 'in', 'to', 'of', 'is', 'it', 'that', 'for', 'with', 'as', 'on', 'by'])
class EnhancedRAGSearch:
    def __init__(self):
        self.file_texts = []
        self.chunks = []  # Document chunks for more targeted search
        self.chunk_metadata = []  # Metadata for each chunk
        self.file_metadata = []
        self.languages = []
        self.model = None
        # Try to load the sentence transformer model if available
        if HAVE_TRANSFORMERS:
            try:
                # Use a small, efficient model
                self.model = SentenceTransformer('all-MiniLM-L6-v2')
                self.use_transformer = True
                logger.info("Using sentence-transformers for RAG")
            except Exception as e:
                logger.warning(f"Error loading sentence-transformer: {e}")
                self.use_transformer = False
        else:
            self.use_transformer = False
        # Fallback to TF-IDF if transformers not available
        if not self.use_transformer:
            self.vectorizer = TfidfVectorizer(
                stop_words='english',
                ngram_range=(1, 2),  # Use bigrams for better context
                max_features=15000,  # Use more features for better representation
                min_df=1  # Include rare terms
            )
        self.vectors = None
        self.chunk_vectors = None
    def add_file(self, file_data, file_info):
        """Add a file to the search index with improved processing"""
        file_ext = os.path.splitext(file_info['filename'])[1].lower()
        text = self.extract_text(file_data, file_ext)
        if text:
            # Store the whole document text
            self.file_texts.append(text)
            self.file_metadata.append(file_info)
            # Try to detect language
            try:
                # Simple language detection based on stopwords
                words = re.findall(r'\b\w+\b', text.lower())
                sample = words[:100]
                english_stopwords_ratio = len([w for w in sample if w in STOPWORDS]) / max(1, len(sample))
                lang = 'en' if english_stopwords_ratio > 0.2 else 'unknown'
                self.languages.append(lang)
            except Exception:
                self.languages.append('en')  # Default to English
            # Create chunks for more granular search
            chunks = self.create_chunks(text)
            for chunk in chunks:
                self.chunks.append(chunk)
                self.chunk_metadata.append({
                    'file_info': file_info,
                    'chunk_size': len(chunk),
                    'file_index': len(self.file_texts) - 1
                })
            return True
        return False
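    # Note on add_file(): file_info is expected to be a dict containing at least
    # 'filename' (used above for extension detection) and 'url' (used later in
    # search() to de-duplicate results per file).
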
    def create_chunks(self, text, chunk_size=1000, overlap=200):
        """Split text into overlapping chunks for better search precision"""
        try:
            sentences = nltk.sent_tokenize(text)
            chunks = []
            current_chunk = ""
            for sentence in sentences:
                if len(current_chunk) + len(sentence) <= chunk_size:
                    current_chunk += sentence + " "
                else:
                    # Add current chunk if it has content
                    if current_chunk:
                        chunks.append(current_chunk.strip())
                    # Start new chunk with overlap from previous chunk
                    if len(current_chunk) > overlap:
                        # Carry the tail of the previous chunk forward, beginning at a
                        # word boundary inside the overlap region
                        overlap_text = current_chunk[-overlap:]
                        first_space = overlap_text.find(' ')
                        if first_space != -1:
                            current_chunk = overlap_text[first_space + 1:] + sentence + " "
                        else:
                            current_chunk = sentence + " "
                    else:
                        current_chunk = sentence + " "
            # Add the last chunk if it has content
            if current_chunk:
                chunks.append(current_chunk.strip())
            return chunks
        except Exception:
            # Fallback to simpler fixed-size chunking if sentence tokenization fails
            chunks = []
            for i in range(0, len(text), chunk_size - overlap):
                chunk = text[i:i + chunk_size]
                if chunk:
                    chunks.append(chunk)
            return chunks
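    # Note on create_chunks(): with the defaults, chunks are capped at roughly
    # chunk_size (1,000) characters and consecutive chunks share up to overlap (200)
    # characters starting at a word boundary, so text near a chunk border can still
    # be matched by the chunk-level search.
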
    def extract_text(self, file_data, file_ext):
        """Extract text from different file types with enhanced support"""
        try:
            if file_ext.lower() == '.pdf':
                reader = PyPDF2.PdfReader(BytesIO(file_data))
                text = ""
                for page in reader.pages:
                    extracted = page.extract_text()
                    if extracted:
                        text += extracted + "\n"
                return text
            elif file_ext.lower() in ['.docx', '.doc']:
                return docx2txt.process(BytesIO(file_data))
            elif file_ext.lower() in ['.txt', '.csv', '.json', '.html', '.htm']:
                # Handle UTF-8 and other common encodings: try strict decoding first,
                # then fall back to lossy UTF-8 decoding as a last resort
                try:
                    return file_data.decode('utf-8')
                except UnicodeDecodeError:
                    for enc in ['latin-1', 'iso-8859-1', 'windows-1252']:
                        try:
                            return file_data.decode(enc)
                        except UnicodeDecodeError:
                            pass
                    # Last resort fallback
                    return file_data.decode('utf-8', errors='ignore')
            elif file_ext.lower() in ['.pptx', '.ppt', '.xlsx', '.xls']:
                return f"[Content of {file_ext} file - install additional libraries for full text extraction]"
            else:
                return ""
        except Exception as e:
            logger.error(f"Error extracting text: {e}")
            return ""
    def build_index(self):
        """Build both document and chunk search indices"""
        if not self.file_texts:
            return False
        try:
            if self.use_transformer:
                # Use sentence transformer models for embeddings
                logger.info("Building document and chunk embeddings with transformer model...")
                self.vectors = self.model.encode(self.file_texts, show_progress_bar=False)
                # Build chunk-level index if we have chunks
                if self.chunks:
                    # Process in batches to avoid memory issues
                    batch_size = 32
                    chunk_vectors = []
                    for i in range(0, len(self.chunks), batch_size):
                        batch = self.chunks[i:i + batch_size]
                        batch_vectors = self.model.encode(batch, show_progress_bar=False)
                        chunk_vectors.append(batch_vectors)
                    self.chunk_vectors = np.vstack(chunk_vectors)
            else:
                # Build document-level index
                self.vectors = self.vectorizer.fit_transform(self.file_texts)
                # Build chunk-level index if we have chunks
                if self.chunks:
                    self.chunk_vectors = self.vectorizer.transform(self.chunks)
            return True
        except Exception as e:
            logger.error(f"Error building search index: {e}")
            return False
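    # Note on build_index(): the transformer path stores dense NumPy arrays returned
    # by model.encode(), while the TF-IDF path stores sparse matrices from the
    # vectorizer; sklearn's cosine_similarity in search() accepts either form.
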
    def expand_query(self, query):
        """Add related terms to query for better recall - mini LLM function"""
        # Dictionary of related terms for common keywords
        expansions = {
            "exam": ["test", "assessment", "quiz", "paper", "exam paper", "past paper", "past exam"],
            "test": ["exam", "quiz", "assessment", "paper"],
            "document": ["file", "paper", "report", "doc", "documentation"],
            "manual": ["guide", "instruction", "documentation", "handbook"],
            "tutorial": ["guide", "instructions", "how-to", "lesson"],
            "article": ["paper", "publication", "journal", "research"],
            "research": ["study", "investigation", "paper", "analysis"],
            "book": ["textbook", "publication", "volume", "edition"],
            "thesis": ["dissertation", "paper", "research", "study"],
            "report": ["document", "paper", "analysis", "summary"],
            "assignment": ["homework", "task", "project", "work"],
            "lecture": ["class", "presentation", "talk", "lesson"],
            "notes": ["annotations", "summary", "outline", "study material"],
            "syllabus": ["curriculum", "course outline", "program", "plan"],
            "paper": ["document", "article", "publication", "exam", "test"],
            "question": ["problem", "query", "exercise", "inquiry"],
            "solution": ["answer", "resolution", "explanation", "result"],
            "reference": ["source", "citation", "bibliography", "resource"],
            "analysis": ["examination", "study", "evaluation", "assessment"],
            "guide": ["manual", "instruction", "handbook", "tutorial"],
            "worksheet": ["exercise", "activity", "handout", "practice"],
            "review": ["evaluation", "assessment", "critique", "feedback"],
            "material": ["resource", "content", "document", "information"],
            "data": ["information", "statistics", "figures", "numbers"]
        }
        # Enhanced query expansion simulating a mini-LLM
        query_words = re.findall(r'\b\w+\b', query.lower())
        expanded_terms = set()
        # Directly add expansions from our dictionary
        for word in query_words:
            if word in expansions:
                expanded_terms.update(expansions[word])
        # Add common academic file formats if not already included
        if any(term in query.lower() for term in ["file", "document", "download", "paper"]):
            if not any(ext in query.lower() for ext in ["pdf", "docx", "ppt", "excel"]):
                expanded_terms.update(["pdf", "docx", "pptx", "xlsx"])
        # Add special academic terms when the query seems related to education
        if any(term in query.lower() for term in ["course", "university", "college", "school", "class"]):
            expanded_terms.update(["syllabus", "lecture", "notes", "textbook"])
        # Return original query plus expanded terms
        if expanded_terms:
            expanded_query = f"{query} {' '.join(expanded_terms)}"
            logger.info(f"Expanded query: '{query}' -> '{expanded_query}'")
            return expanded_query
        return query
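    # Example for expand_query() (illustrative): a query such as "past exam paper"
    # gains related terms like "test", "assessment", "quiz", "document",
    # "publication", plus format terms such as "pdf" and "docx".
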
    @lru_cache(maxsize=8)
    def search(self, query, top_k=5, search_chunks=True):
        """Enhanced search with both document and chunk-level search"""
        if self.vectors is None:
            return []
        # Simulate a small LLM by expanding the query with related terms
        expanded_query = self.expand_query(query)
        try:
            results = []
            if self.use_transformer:
                # Transform the query to an embedding
                query_vector = self.model.encode([expanded_query])[0]
                # First search at document level for higher-level matches
                if self.vectors is not None:
                    # Compute similarities between query and documents
                    doc_similarities = cosine_similarity(
                        query_vector.reshape(1, -1),
                        self.vectors
                    ).flatten()
                    top_doc_indices = doc_similarities.argsort()[-top_k:][::-1]
                    for i, idx in enumerate(top_doc_indices):
                        if doc_similarities[idx] > 0.2:  # Threshold to exclude irrelevant results
                            results.append({
                                'file_info': self.file_metadata[idx],
                                'score': float(doc_similarities[idx]),
                                'rank': i + 1,
                                'match_type': 'document',
                                'language': self.languages[idx] if idx < len(self.languages) else 'unknown'
                            })
                # Then search at chunk level for more specific matches if enabled
                if search_chunks and self.chunk_vectors is not None:
                    # Compute similarities between query and chunks
                    chunk_similarities = cosine_similarity(
                        query_vector.reshape(1, -1),
                        self.chunk_vectors
                    ).flatten()
                    top_chunk_indices = chunk_similarities.argsort()[-top_k * 2:][::-1]  # Get more chunk results
                    # Use a set to avoid duplicate file results
                    seen_files = set(r['file_info']['url'] for r in results)
                    for i, idx in enumerate(top_chunk_indices):
                        if chunk_similarities[idx] > 0.25:  # Higher threshold for chunks
                            file_index = self.chunk_metadata[idx]['file_index']
                            file_info = self.file_metadata[file_index]
                            # Only add if we haven't already included this file
                            if file_info['url'] not in seen_files:
                                seen_files.add(file_info['url'])
                                results.append({
                                    'file_info': file_info,
                                    'score': float(chunk_similarities[idx]),
                                    'rank': len(results) + 1,
                                    'match_type': 'chunk',
                                    'language': self.languages[file_index] if file_index < len(self.languages) else 'unknown',
                                    'chunk_preview': self.chunks[idx][:200] + "..." if len(self.chunks[idx]) > 200 else self.chunks[idx]
                                })
                            # Stop after we've found enough results
                            if len(results) >= top_k * 1.5:
                                break
            else:
                # Fallback to TF-IDF if transformers not available
                query_vector = self.vectorizer.transform([expanded_query])
                # First search at document level
                if self.vectors is not None:
                    doc_similarities = cosine_similarity(query_vector, self.vectors).flatten()
                    top_doc_indices = doc_similarities.argsort()[-top_k:][::-1]
                    for i, idx in enumerate(top_doc_indices):
                        if doc_similarities[idx] > 0.1:  # Threshold to exclude irrelevant results
                            results.append({
                                'file_info': self.file_metadata[idx],
                                'score': float(doc_similarities[idx]),
                                'rank': i + 1,
                                'match_type': 'document',
                                'language': self.languages[idx] if idx < len(self.languages) else 'unknown'
                            })
                # Then search at chunk level if enabled
                if search_chunks and self.chunk_vectors is not None:
                    chunk_similarities = cosine_similarity(query_vector, self.chunk_vectors).flatten()
                    top_chunk_indices = chunk_similarities.argsort()[-top_k * 2:][::-1]
                    # Avoid duplicates
                    seen_files = set(r['file_info']['url'] for r in results)
                    for i, idx in enumerate(top_chunk_indices):
                        if chunk_similarities[idx] > 0.15:
                            file_index = self.chunk_metadata[idx]['file_index']
                            file_info = self.file_metadata[file_index]
                            if file_info['url'] not in seen_files:
                                seen_files.add(file_info['url'])
                                results.append({
                                    'file_info': file_info,
                                    'score': float(chunk_similarities[idx]),
                                    'rank': len(results) + 1,
                                    'match_type': 'chunk',
                                    'language': self.languages[file_index] if file_index < len(self.languages) else 'unknown',
                                    'chunk_preview': self.chunks[idx][:200] + "..." if len(self.chunks[idx]) > 200 else self.chunks[idx]
                                })
                            if len(results) >= top_k * 1.5:
                                break
            # Sort combined results by score
            results.sort(key=lambda x: x['score'], reverse=True)
            # Re-rank and truncate
            for i, result in enumerate(results[:top_k]):
                result['rank'] = i + 1
            return results[:top_k]
        except Exception as e:
            logger.error(f"Error during search: {e}")
            return []
    def clear_cache(self):
        """Clear search cache and free memory"""
        if hasattr(self.search, 'cache_clear'):
            self.search.cache_clear()
        # Clear vectors to free memory
        self.vectors = None
        self.chunk_vectors = None
        # Force garbage collection
        import gc
        gc.collect()
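

# Usage sketch (illustrative). The sample path is hypothetical, and the metadata
# keys 'filename' and 'url' are inferred from how add_file() and search() use
# file_info above; adapt both to the calling application.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    searcher = EnhancedRAGSearch()
    sample_path = "example.pdf"  # hypothetical local file for demonstration
    if os.path.exists(sample_path):
        with open(sample_path, "rb") as f:
            searcher.add_file(f.read(), {"filename": sample_path, "url": f"file://{sample_path}"})
        if searcher.build_index():
            for hit in searcher.search("past exam papers", top_k=3):
                print(hit["rank"], round(hit["score"], 3), hit["match_type"], hit["file_info"]["filename"])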