# Hugging Face Space: veerukhannan/advisor (status: Sleeping)
# File size: 6,196 bytes

import os
import chromadb
from sentence_transformers import SentenceTransformer
from loguru import logger

class SentenceTransformerEmbeddings:
    """Callable adapter that turns texts into embedding vectors.

    Wraps a ``SentenceTransformer`` model so an instance can be called
    directly with a list of strings.
    """

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        """Load the sentence-transformers model named *model_name*."""
        self.model = SentenceTransformer(model_name)

    def __call__(self, input: list[str]) -> list[list[float]]:
        """Encode *input* and return the result converted via ``tolist()``."""
        # NOTE(review): the parameter is named ``input`` (shadowing the
        # builtin) presumably because the consumer of this callable expects
        # that keyword -- confirm before renaming.
        return self.model.encode(input).tolist()

def _split_into_sections(document: str, index_content: list[str]) -> list[dict[str, str]]:
    """Split *document* into titled sections, using index entries as headings.

    A line is treated as a heading when any entry of *index_content* occurs
    in it as a substring. The non-empty lines that follow (each stripped)
    form that section's content, joined with newlines. Text before the first
    heading, and headings with no content lines, are not emitted.

    Returns:
        A list of ``{"title": ..., "content": ...}`` dicts in document order.
    """
    sections: list[dict[str, str]] = []
    title = ""
    body_lines: list[str] = []

    for raw_line in document.split('\n'):
        line = raw_line.strip()
        if any(entry in line for entry in index_content):
            # A new heading closes the section collected so far (if any).
            if title and body_lines:
                sections.append({"title": title, "content": "\n".join(body_lines)})
            title = line
            body_lines = []
        elif line:
            body_lines.append(line)

    # Flush the trailing section, which has no following heading to close it.
    if title and body_lines:
        sections.append({"title": title, "content": "\n".join(body_lines)})

    return sections


def initialize_chromadb() -> bool:
    """Initialize ChromaDB and load the document sections if needed.

    Looks for ``a2023-45.txt`` (the document) and ``index.txt`` (one heading
    per line) next to this file, and persists the database in a sibling
    ``chroma_db`` directory. If the ``legal_documents`` collection already
    holds records nothing is loaded; otherwise the document is split into
    sections and each section is added with its title, source file name and
    1-based section number as metadata.

    Returns:
        True if the collection has content when the function finishes,
        False if a required file is missing, no sections could be parsed,
        or any error occurred (errors are logged, not raised).
    """
    try:
        # Set up paths
        base_path = os.path.dirname(os.path.abspath(__file__))
        doc_path = os.path.join(base_path, 'a2023-45.txt')
        index_path = os.path.join(base_path, 'index.txt')
        chroma_path = os.path.join(base_path, 'chroma_db')

        # Check if required files exist
        if not os.path.exists(doc_path):
            logger.error(f"Document file not found at {doc_path}")
            return False
        if not os.path.exists(index_path):
            logger.error(f"Index file not found at {index_path}")
            return False

        # Ensure ChromaDB directory exists
        os.makedirs(chroma_path, exist_ok=True)

        chroma_client = chromadb.PersistentClient(path=chroma_path)

        # get_or_create_collection replaces the list/get/delete/create dance.
        # It avoids reading ``.name`` from list_collections() results, whose
        # element type is believed to have differed between Chroma releases,
        # and an existing-but-empty collection is simply reused.
        collection = chroma_client.get_or_create_collection(
            name="legal_documents",
            embedding_function=SentenceTransformerEmbeddings(),
        )
        if collection.count() > 0:
            logger.info("ChromaDB collection already exists and has content")
            return True

        logger.info("Loading documents into ChromaDB...")

        # Read and process documents
        with open(doc_path, 'r', encoding='utf-8') as f:
            document = f.read().strip()

        with open(index_path, 'r', encoding='utf-8') as f:
            index_content = [line.strip() for line in f if line.strip()]

        sections = _split_into_sections(document, index_content)
        if not sections:
            logger.error("No valid sections found in document")
            return False

        # Every parsed section has non-empty content, so numbering is dense.
        documents = [section["content"] for section in sections]
        metadatas = [
            {
                "title": section["title"],
                "source": "a2023-45.txt",
                "section_number": number,
            }
            for number, section in enumerate(sections, start=1)
        ]
        ids = [f"section_{number}" for number in range(1, len(sections) + 1)]

        collection.add(
            documents=documents,
            metadatas=metadatas,
            ids=ids
        )

        logger.info(f"Successfully loaded {len(documents)} sections into ChromaDB")
        return True

    except Exception as e:
        # Top-level boundary: report failure via the return value, but keep
        # the traceback in the log so the cause can be diagnosed.
        logger.exception(f"Error initializing ChromaDB: {str(e)}")
        return False

def test_chromadb_content() -> bool:
    """Check that ChromaDB holds the expected content and answers a query.

    Runs ``initialize_chromadb()`` first, then opens the persistent
    ``legal_documents`` collection from the ``chroma_db`` directory next to
    this file, checks that it is non-empty and issues one sample query.

    Returns:
        True if the collection is populated and the sample query returns at
        least one document; False otherwise (errors are logged, not raised).
    """
    try:
        # First ensure ChromaDB is initialized
        if not initialize_chromadb():
            return False

        # Set up ChromaDB path
        base_path = os.path.dirname(os.path.abspath(__file__))
        chroma_path = os.path.join(base_path, 'chroma_db')

        # Initialize ChromaDB
        chroma_client = chromadb.PersistentClient(path=chroma_path)

        # Get collection
        collection = chroma_client.get_collection(
            name="legal_documents",
            embedding_function=SentenceTransformerEmbeddings()
        )

        # Check collection size
        count = collection.count()
        if count == 0:
            logger.error("Collection is empty")
            return False

        logger.info(f"Found {count} documents in ChromaDB")

        # Test query to verify content
        test_results = collection.query(
            query_texts=["What are the general provisions?"],
            n_results=1
        )

        # query() returns one inner list per query text, so an empty result
        # looks like [[]], which is truthy. Inspect the inner list for the
        # single query we issued instead of the outer container.
        documents = test_results['documents']
        if not documents or not documents[0]:
            logger.error("Test query returned no results")
            return False

        logger.info("ChromaDB content verification successful")
        return True

    except Exception as e:
        # Top-level boundary: report failure via the return value, but keep
        # the traceback in the log so the cause can be diagnosed.
        logger.exception(f"Error testing ChromaDB: {str(e)}")
        return False

if __name__ == "__main__":
    # Script entry point: run the verification and print its outcome.
    verified = test_chromadb_content()
    print(
        "ChromaDB content verification successful"
        if verified
        else "ChromaDB content verification failed"
    )