Spaces:
Sleeping
Sleeping
import os | |
import chromadb | |
from sentence_transformers import SentenceTransformer | |
from loguru import logger | |
class SentenceTransformerEmbeddings:
    """Callable wrapper that turns texts into embedding vectors.

    Instances hold a ``SentenceTransformer`` model and can be called with a
    list of strings; this file passes an instance as the
    ``embedding_function`` of a ChromaDB collection.
    """

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        """Load the sentence-transformers model identified by *model_name*."""
        self.model = SentenceTransformer(model_name)

    def __call__(self, input: list[str]) -> list[list[float]]:
        """Encode *input* and return the result converted via ``tolist()``.

        The parameter name ``input`` is kept as-is because it is part of the
        callable's signature and may be passed by keyword.
        """
        return self.model.encode(input).tolist()
def test_chromadb_content():
    """Check that the local ChromaDB store holds queryable legal documents.

    Looks for a ``chroma_db`` directory next to this file, opens it with a
    persistent ChromaDB client, and verifies that the ``legal_documents``
    collection exists, is non-empty, and returns at least one hit for a
    sample query. A short preview of each hit is logged.

    Returns:
        bool: True when every check passes. False when a check fails or
        when any exception is raised (it is logged, not propagated).
    """
    try:
        # Set up ChromaDB path
        base_path = os.path.dirname(os.path.abspath(__file__))
        chroma_path = os.path.join(base_path, 'chroma_db')
        if not os.path.exists(chroma_path):
            logger.error(f"ChromaDB directory not found at {chroma_path}")
            return False

        # Initialize ChromaDB
        chroma_client = chromadb.PersistentClient(path=chroma_path)

        # Check if collection exists. Depending on the installed chromadb
        # version, list_collections() yields either Collection objects or
        # plain name strings, so accept both shapes.
        collection_names = {
            getattr(col, "name", col)
            for col in chroma_client.list_collections()
        }
        if "legal_documents" not in collection_names:
            logger.error("Legal documents collection not found in ChromaDB")
            return False

        # Get collection
        collection = chroma_client.get_collection(
            name="legal_documents",
            embedding_function=SentenceTransformerEmbeddings()
        )

        # Check collection size
        count = collection.count()
        if count == 0:
            logger.error("Collection is empty")
            return False
        logger.info(f"Found {count} documents in ChromaDB")

        # Test query to verify content
        test_results = collection.query(
            query_texts=["What are the general provisions?"],
            n_results=1
        )

        # query() returns one result list per query text, so the outer list
        # is non-empty even when nothing matched; the hits for our single
        # query are the first inner list.
        document_batches = test_results['documents']
        hits = document_batches[0] if document_batches else []
        if not hits:
            logger.error("Test query returned no results")
            return False

        # Print sample content
        logger.info("Sample content from ChromaDB:")
        for position, (doc, metadata) in enumerate(
            zip(hits, test_results['metadatas'][0]), start=1
        ):
            # A hit may carry no metadata or no title; that should not turn
            # a successful content check into a failure.
            title = (metadata or {}).get('title', 'N/A')
            logger.info(f"\nDocument {position}:")
            logger.info(f"Title: {title}")
            logger.info(f"Content preview: {doc[:200]}...")
        return True
    except Exception as e:
        # Top-level boundary of the check: report the failure with its
        # traceback instead of letting the script crash.
        logger.exception(f"Error testing ChromaDB: {e}")
        return False
if __name__ == "__main__":
    # Run the verification once and report the outcome on stdout.
    verified = test_chromadb_content()
    message = (
        "ChromaDB content verification successful"
        if verified
        else "ChromaDB content verification failed"
    )
    print(message)