# rag_utils.py
"""Retrieval-augmented (RAG) utilities for construction-damage analysis.

Builds a FAISS vector store over structured damage-case documents and exposes
similarity-search helpers for enhanced analysis and similar-case lookup.
"""
import logging
from typing import Any, Dict, List, Optional

import numpy as np  # noqa: F401 -- kept: removal could break downstream users of this module
import streamlit as st  # noqa: F401 -- kept: the hosting app may rely on this import
import torch
from tqdm import tqdm  # noqa: F401 -- kept: see note above

from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class RAGSystem:
    """Retrieval-augmented analysis over a construction-damage knowledge base."""

    def __init__(self) -> None:
        """Initialize RAG system with custom embeddings and configurations.

        Raises:
            Exception: re-raised after logging if the embedding model or the
                text splitter fails to construct.
        """
        try:
            # BUGFIX: was `st.cuda.is_available()` -- streamlit has no `cuda`
            # attribute (AttributeError at runtime); GPU detection belongs to torch.
            self.embeddings = HuggingFaceEmbeddings(
                model_name="sentence-transformers/all-mpnet-base-v2",
                model_kwargs={'device': 'cuda' if torch.cuda.is_available() else 'cpu'}
            )
            # Populated later by initialize_knowledge_base(); None until then.
            self.vector_store = None
            self.text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=500,
                chunk_overlap=50,
                separators=["\n\n", "\n", ". ", ", ", " ", ""]
            )
            logger.info("RAG system initialized successfully")
        except Exception as e:
            logger.error(f"Error initializing RAG system: {str(e)}")
            raise

    def _create_documents(self, knowledge_base: Dict) -> List[Document]:
        """Create documents from knowledge base with structured format.

        Args:
            knowledge_base: mapping of damage_type -> list of case dicts; each
                case is expected to carry the keys read below ('severity',
                'description', 'location', 'required_expertise',
                'repair_method', 'estimated_cost', 'timeframe',
                'immediate_action', 'prevention').

        Returns:
            One Document per case, with retrieval metadata attached.

        Raises:
            Exception: re-raised after logging (e.g. KeyError on a malformed case).
        """
        documents = []
        try:
            for damage_type, cases in knowledge_base.items():
                for case in cases:
                    # Create a detailed document for each case
                    technical_info = f"""
                    Technical Analysis for {damage_type}:
                    Severity Level: {case['severity']}
                    Detailed Description: {case['description']}
                    Primary Location: {case['location']}
                    Required Expertise: {case['required_expertise']}
                    """
                    repair_info = f"""
                    Repair and Maintenance Information:
                    Repair Methods: {' -> '.join(case['repair_method'])}
                    Estimated Cost Range: {case['estimated_cost']}
                    Expected Timeframe: {case['timeframe']}
                    """
                    safety_info = f"""
                    Safety and Prevention Guidelines:
                    Immediate Actions Required: {case['immediate_action']}
                    Preventive Measures: {case['prevention']}
                    Critical Considerations: Special attention needed for {damage_type} in {case['location']}
                    """
                    # Combine all information into a single page_content string.
                    # BUGFIX: the original assignment was split mid-statement
                    # (`doc_text =` with the f-string on the next physical line),
                    # which is a SyntaxError; rejoined here.
                    doc_text = f"{technical_info}\n{repair_info}\n{safety_info}"
                    # Create metadata for better retrieval
                    metadata = {
                        'damage_type': damage_type,
                        'severity': case['severity'],
                        'location': case['location'],
                        'document_type': 'construction_damage_analysis'
                    }
                    documents.append(Document(
                        page_content=doc_text,
                        metadata=metadata
                    ))
            logger.info(f"Created {len(documents)} documents from knowledge base")
            return documents
        except Exception as e:
            logger.error(f"Error creating documents: {str(e)}")
            raise

    def initialize_knowledge_base(self, knowledge_base: Dict) -> None:
        """Initialize vector store with construction knowledge.

        Builds documents, splits them into chunks, and embeds them into a
        FAISS index stored on ``self.vector_store``.

        Raises:
            Exception: re-raised after logging if any stage fails.
        """
        try:
            # Create documents
            documents = self._create_documents(knowledge_base)
            # Split documents into chunks
            splits = self.text_splitter.split_documents(documents)
            # Create vector store
            self.vector_store = FAISS.from_documents(
                documents=splits,
                embedding=self.embeddings
            )
            logger.info("Knowledge base initialized successfully")
        except Exception as e:
            logger.error(f"Error initializing knowledge base: {str(e)}")
            raise

    def _format_response(self, docs: List[Document], damage_type: str,
                         confidence: float) -> Dict[str, List[str]]:
        """Format retrieved documents into a structured response.

        Splits each document's text on the section headings written by
        ``_create_documents`` ("Technical Analysis", "Safety and Prevention",
        "Repair and Maintenance") and buckets the pieces.

        Raises:
            Exception: re-raised after logging on any parsing failure.
        """
        response = {
            "technical_details": [],
            "safety_considerations": [],
            "expert_recommendations": []
        }
        try:
            for doc in docs:
                content = doc.page_content
                # Parse technical details
                if "Technical Analysis" in content:
                    response["technical_details"].append(
                        f"For {damage_type} (Confidence: {confidence:.1f}%):\n" +
                        content.split("Technical Analysis")[1].split("Repair")[0].strip()
                    )
                # Parse safety considerations
                if "Safety and Prevention" in content:
                    response["safety_considerations"].append(
                        content.split("Safety and Prevention")[1].strip()
                    )
                # Parse repair recommendations
                if "Repair and Maintenance" in content:
                    response["expert_recommendations"].append(
                        content.split("Repair and Maintenance")[1].split("Safety")[0].strip()
                    )
            return response
        except Exception as e:
            logger.error(f"Error formatting response: {str(e)}")
            raise

    def get_enhanced_analysis(
        self,
        damage_type: str,
        confidence: float,
        custom_query: Optional[str] = None  # FIX: was annotated `str` despite None default
    ) -> Dict[str, List[str]]:
        """Get enhanced analysis with optional custom query support.

        Returns a dict of technical/safety/repair sections; on any failure
        returns a placeholder dict instead of raising (best-effort contract).
        """
        try:
            if not self.vector_store:
                raise ValueError("Vector store not initialized")
            # Prepare query
            if custom_query:
                query = f"{custom_query} for {damage_type} damage"
            else:
                query = f"""
                Provide detailed analysis for {damage_type} damage
                with {confidence}% confidence level.
                Include technical assessment, safety considerations,
                and repair recommendations.
                """
            # Get relevant documents
            docs = self.vector_store.similarity_search(
                query=query,
                k=3,  # Get top 3 most relevant documents
                fetch_k=5  # Fetch top 5 for better diversity
            )
            # Format and return response
            return self._format_response(docs, damage_type, confidence)
        except Exception as e:
            logger.error(f"Error getting enhanced analysis: {str(e)}")
            return {
                "technical_details": [f"Error retrieving analysis: {str(e)}"],
                "safety_considerations": ["Please try again or contact support."],
                "expert_recommendations": ["System currently unavailable."]
            }

    def get_similar_cases(self, damage_type: str,
                          confidence: float) -> List[Dict[str, Any]]:
        """Get similar damage cases for comparison.

        Returns up to 3 cases of a *different* damage type, or an empty list
        on any failure (best-effort contract; errors are logged, not raised).
        """
        try:
            if not self.vector_store:
                raise ValueError("Vector store not initialized")
            query = f"Find similar cases of {damage_type} damage"
            docs = self.vector_store.similarity_search(query, k=3)
            similar_cases = []
            for doc in docs:
                if doc.metadata['damage_type'] != damage_type:  # Avoid same damage type
                    similar_cases.append({
                        'damage_type': doc.metadata['damage_type'],
                        'severity': doc.metadata['severity'],
                        'location': doc.metadata['location'],
                        'details': doc.page_content[:200] + '...'  # First 200 chars
                    })
            return similar_cases
        except Exception as e:
            logger.error(f"Error getting similar cases: {str(e)}")
            return []