Spaces:
Sleeping
Sleeping
File size: 2,635 Bytes
6a1ad16 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 |
import os
import chromadb
from sentence_transformers import SentenceTransformer
from loguru import logger
class SentenceTransformerEmbeddings:
    """Callable wrapper that turns texts into embedding vectors.

    Instances hold a ``SentenceTransformer`` in ``self.model`` and can be
    called like a function with a list of strings.
    """

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        """Create the underlying ``SentenceTransformer`` for ``model_name``."""
        self.model = SentenceTransformer(model_name)

    def __call__(self, input: list[str]) -> list[list[float]]:
        """Encode ``input`` with the model and return the result as nested lists.

        The parameter name ``input`` is part of the call signature and is
        kept unchanged so keyword callers continue to work.
        """
        # ``encode`` returns an array-like; ``tolist()`` converts it to
        # plain Python lists.
        return self.model.encode(input).tolist()
def test_chromadb_content():
    """Test if ChromaDB has the required content.

    Looks for a ``chroma_db`` directory next to this file, opens it with a
    persistent client, and checks that the ``legal_documents`` collection
    exists, is non-empty, and returns at least one hit for a sample query.
    A short preview of every hit is logged.

    Returns:
        bool: ``True`` when every check passes. ``False`` when a check fails
        or when any exception is raised; the exception is logged with its
        traceback and not propagated.
    """
    collection_name = "legal_documents"
    try:
        # Set up ChromaDB path
        base_path = os.path.dirname(os.path.abspath(__file__))
        chroma_path = os.path.join(base_path, 'chroma_db')
        if not os.path.exists(chroma_path):
            logger.error(f"ChromaDB directory not found at {chroma_path}")
            return False

        # Initialize ChromaDB
        chroma_client = chromadb.PersistentClient(path=chroma_path)

        # Check if collection exists. Depending on the chromadb release,
        # list_collections() yields Collection objects or plain name
        # strings; getattr falls back to the item itself for strings.
        existing_names = {
            getattr(col, "name", col)
            for col in chroma_client.list_collections()
        }
        if collection_name not in existing_names:
            logger.error("Legal documents collection not found in ChromaDB")
            return False

        # Get collection
        collection = chroma_client.get_collection(
            name=collection_name,
            embedding_function=SentenceTransformerEmbeddings()
        )

        # Check collection size
        count = collection.count()
        if count == 0:
            logger.error("Collection is empty")
            return False
        logger.info(f"Found {count} documents in ChromaDB")

        # Test query to verify content
        test_results = collection.query(
            query_texts=["What are the general provisions?"],
            n_results=1
        )

        # Query results are nested per query text, so an empty result looks
        # like [[]]; inspect the inner list for the single query issued.
        documents = test_results.get('documents') or []
        hits = documents[0] if documents else []
        if not hits:
            logger.error("Test query returned no results")
            return False

        metadatas = test_results.get('metadatas') or []
        if metadatas and metadatas[0]:
            hit_metadatas = metadatas[0]
        else:
            # No metadata returned: still preview every document.
            hit_metadatas = [None] * len(hits)

        # Print sample content
        logger.info("Sample content from ChromaDB:")
        for position, (doc, metadata) in enumerate(zip(hits, hit_metadatas), start=1):
            # A missing title or document body must not fail verification;
            # previewing is informational only.
            title = (metadata or {}).get('title', '<untitled>')
            preview = (doc or '')[:200]
            logger.info(f"\nDocument {position}:")
            logger.info(f"Title: {title}")
            logger.info(f"Content preview: {preview}...")
        return True
    except Exception as e:
        # Top-level boundary for this check: report failure as False, but
        # keep the traceback in the log so the cause is diagnosable.
        logger.exception(f"Error testing ChromaDB: {str(e)}")
        return False
# Script entry point: run the verification and report the outcome on stdout.
if __name__ == "__main__":
    print(
        "ChromaDB content verification successful"
        if test_chromadb_content()
        else "ChromaDB content verification failed"
    )