# smart / rag_utils.py
# (Hugging Face Space file header: uploaded by Shakir60, commit 1413086 verified,
#  commit message "Update rag_utils.py")
# rag_utils.py
import logging
from typing import Any, Dict, List, Optional

import numpy as np
import streamlit as st
import torch
from tqdm import tqdm

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
# Set up logging
# Module-level logger; basicConfig at INFO so initialization and error
# messages from RAGSystem are visible by default.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class RAGSystem:
    """Retrieval-augmented generation over a construction-damage knowledge base.

    Builds a FAISS vector store from structured damage cases and answers
    damage-analysis queries with formatted technical / safety / repair sections.
    """

    def __init__(self):
        """Initialize embeddings and the text splitter.

        The vector store itself is created lazily by initialize_knowledge_base().

        Raises:
            Exception: re-raised after logging if embedding-model or splitter
                setup fails.
        """
        try:
            self.embeddings = HuggingFaceEmbeddings(
                model_name="sentence-transformers/all-mpnet-base-v2",
                # Bug fix: the original called st.cuda.is_available(), but
                # streamlit has no `cuda` attribute -- GPU detection belongs
                # to torch.
                model_kwargs={'device': 'cuda' if torch.cuda.is_available() else 'cpu'}
            )
            self.vector_store = None  # populated by initialize_knowledge_base()
            self.text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=500,
                chunk_overlap=50,
                separators=["\n\n", "\n", ". ", ", ", " ", ""]
            )
            logger.info("RAG system initialized successfully")
        except Exception as e:
            logger.error(f"Error initializing RAG system: {str(e)}")
            raise

    def _create_documents(self, knowledge_base: Dict) -> List["Document"]:
        """Create one Document per damage case with retrieval metadata.

        Args:
            knowledge_base: mapping of damage_type -> list of case dicts. Each
                case is expected to provide the keys 'severity', 'description',
                'location', 'required_expertise', 'repair_method' (list),
                'estimated_cost', 'timeframe', 'immediate_action', 'prevention'.

        Returns:
            List of Documents whose text combines the technical, repair and
            safety sections (the section headers are relied upon later by
            _format_response).

        Raises:
            Exception: re-raised after logging (e.g. KeyError on a missing
                case field).
        """
        documents = []
        try:
            for damage_type, cases in knowledge_base.items():
                for case in cases:
                    # Create a detailed document for each case
                    technical_info = f"""
                    Technical Analysis for {damage_type}:
                    Severity Level: {case['severity']}
                    Detailed Description: {case['description']}
                    Primary Location: {case['location']}
                    Required Expertise: {case['required_expertise']}
                    """
                    repair_info = f"""
                    Repair and Maintenance Information:
                    Repair Methods: {' -> '.join(case['repair_method'])}
                    Estimated Cost Range: {case['estimated_cost']}
                    Expected Timeframe: {case['timeframe']}
                    """
                    safety_info = f"""
                    Safety and Prevention Guidelines:
                    Immediate Actions Required: {case['immediate_action']}
                    Preventive Measures: {case['prevention']}
                    Critical Considerations: Special attention needed for {damage_type} in {case['location']}
                    """
                    # Combine all information
                    doc_text = f"{technical_info}\n{repair_info}\n{safety_info}"
                    # Create metadata for better retrieval
                    metadata = {
                        'damage_type': damage_type,
                        'severity': case['severity'],
                        'location': case['location'],
                        'document_type': 'construction_damage_analysis'
                    }
                    documents.append(Document(
                        page_content=doc_text,
                        metadata=metadata
                    ))
            logger.info(f"Created {len(documents)} documents from knowledge base")
            return documents
        except Exception as e:
            logger.error(f"Error creating documents: {str(e)}")
            raise

    def initialize_knowledge_base(self, knowledge_base: Dict):
        """Build the FAISS vector store from the construction knowledge base.

        Args:
            knowledge_base: same structure as accepted by _create_documents.

        Raises:
            Exception: re-raised after logging if document creation, splitting
                or FAISS indexing fails.
        """
        try:
            # Create documents, then chunk them for embedding
            documents = self._create_documents(knowledge_base)
            splits = self.text_splitter.split_documents(documents)
            # Create vector store
            self.vector_store = FAISS.from_documents(
                documents=splits,
                embedding=self.embeddings
            )
            logger.info("Knowledge base initialized successfully")
        except Exception as e:
            logger.error(f"Error initializing knowledge base: {str(e)}")
            raise

    def _format_response(self, docs: List["Document"], damage_type: str, confidence: float) -> Dict[str, List[str]]:
        """Split retrieved document text into structured response sections.

        Relies on the literal section headers written by _create_documents
        ("Technical Analysis", "Repair and Maintenance", "Safety and
        Prevention") to slice each document's page_content.

        Args:
            docs: retrieved documents (only .page_content is read).
            damage_type: label echoed into the technical-details entries.
            confidence: percentage echoed into the technical-details entries.

        Returns:
            Dict with 'technical_details', 'safety_considerations' and
            'expert_recommendations' lists (possibly empty).

        Raises:
            Exception: re-raised after logging on unexpected content.
        """
        response = {
            "technical_details": [],
            "safety_considerations": [],
            "expert_recommendations": []
        }
        try:
            for doc in docs:
                content = doc.page_content
                # Parse technical details (everything up to the repair section)
                if "Technical Analysis" in content:
                    response["technical_details"].append(
                        f"For {damage_type} (Confidence: {confidence:.1f}%):\n" +
                        content.split("Technical Analysis")[1].split("Repair")[0].strip()
                    )
                # Parse safety considerations (tail of the document)
                if "Safety and Prevention" in content:
                    response["safety_considerations"].append(
                        content.split("Safety and Prevention")[1].strip()
                    )
                # Parse repair recommendations (between repair and safety sections)
                if "Repair and Maintenance" in content:
                    response["expert_recommendations"].append(
                        content.split("Repair and Maintenance")[1].split("Safety")[0].strip()
                    )
            return response
        except Exception as e:
            logger.error(f"Error formatting response: {str(e)}")
            raise

    def get_enhanced_analysis(
        self,
        damage_type: str,
        confidence: float,
        custom_query: Optional[str] = None
    ) -> Dict[str, List[str]]:
        """Retrieve and format an analysis for a detected damage type.

        Args:
            damage_type: damage label to analyse.
            confidence: detection confidence percentage (echoed into output).
            custom_query: optional user query; when given it replaces the
                default analysis prompt.

        Returns:
            Structured response dict from _format_response; on any failure a
            dict with user-facing error strings (never raises to the caller).
        """
        try:
            if not self.vector_store:
                raise ValueError("Vector store not initialized")
            # Prepare query
            if custom_query:
                query = f"{custom_query} for {damage_type} damage"
            else:
                query = f"""
                Provide detailed analysis for {damage_type} damage with {confidence}% confidence level.
                Include technical assessment, safety considerations, and repair recommendations.
                """
            # Get relevant documents
            docs = self.vector_store.similarity_search(
                query=query,
                k=3,  # Get top 3 most relevant documents
                fetch_k=5  # Fetch top 5 for better diversity
            )
            # Format and return response
            return self._format_response(docs, damage_type, confidence)
        except Exception as e:
            logger.error(f"Error getting enhanced analysis: {str(e)}")
            return {
                "technical_details": [f"Error retrieving analysis: {str(e)}"],
                "safety_considerations": ["Please try again or contact support."],
                "expert_recommendations": ["System currently unavailable."]
            }

    def get_similar_cases(self, damage_type: str, confidence: float) -> List[Dict[str, Any]]:
        """Retrieve comparable cases of *other* damage types.

        Args:
            damage_type: current damage label; matching-type results are
                filtered out so only genuinely different cases are returned.
            confidence: accepted for interface symmetry; not used in the query.

        Returns:
            Up to 3 case summaries (type, severity, location, 200-char
            preview); empty list on any failure (never raises to the caller).
        """
        try:
            if not self.vector_store:
                raise ValueError("Vector store not initialized")
            query = f"Find similar cases of {damage_type} damage"
            docs = self.vector_store.similarity_search(query, k=3)
            similar_cases = []
            for doc in docs:
                if doc.metadata['damage_type'] != damage_type:  # Avoid same damage type
                    similar_cases.append({
                        'damage_type': doc.metadata['damage_type'],
                        'severity': doc.metadata['severity'],
                        'location': doc.metadata['location'],
                        'details': doc.page_content[:200] + '...'  # First 200 chars
                    })
            return similar_cases
        except Exception as e:
            logger.error(f"Error getting similar cases: {str(e)}")
            return []