Spaces:
Sleeping
Sleeping
import os | |
import chromadb | |
from sentence_transformers import SentenceTransformer | |
from loguru import logger | |
class SentenceTransformerEmbeddings:
    """Callable wrapper that turns a SentenceTransformer model into an embedding function.

    Instances are called with a list of strings and return one embedding
    (a list of floats) per string.
    """

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        """Load the SentenceTransformer model identified by *model_name*."""
        # The model is loaded once here and reused for every call.
        self.model = SentenceTransformer(model_name)

    def __call__(self, input: list[str]) -> list[list[float]]:
        """Encode *input* with the wrapped model and return the vectors as nested lists.

        NOTE(review): the parameter is named ``input`` (shadowing the builtin);
        presumably this matches the embedding-function interface expected by
        ChromaDB -- confirm before renaming.
        """
        return self.model.encode(input).tolist()
def _split_into_sections(document: str, index_content: list[str]) -> list[dict[str, str]]:
    """Split *document* into titled sections, using index entries as heading markers.

    Each line of *document* is stripped. A line counts as a heading when any
    entry of *index_content* occurs in it as a substring; the heading line
    becomes the title of the section that follows. Non-empty lines between
    headings form the section content, joined with newlines.

    Text before the first heading, and headings that are not followed by any
    content, produce no section.

    Returns:
        A list of ``{"title": ..., "content": ...}`` dicts in document order.
    """
    sections: list[dict[str, str]] = []
    title = ""
    body_lines: list[str] = []

    for raw_line in document.split('\n'):
        line = raw_line.strip()
        if any(entry in line for entry in index_content):
            # A new heading closes the section collected so far (if any).
            if title and body_lines:
                sections.append({"title": title, "content": "\n".join(body_lines)})
            title = line
            body_lines = []
        elif line:
            body_lines.append(line)

    # Close the trailing section, which has no following heading to flush it.
    if title and body_lines:
        sections.append({"title": title, "content": "\n".join(body_lines)})

    return sections


def load_documents() -> bool:
    """Load and process documents into ChromaDB.

    Reads ``a2023-45.txt`` and ``index.txt`` from the directory containing
    this file, splits the document into sections at lines that contain an
    index entry, and stores the sections in a freshly created
    ``legal_documents`` collection of a persistent ChromaDB database located
    in the ``chroma_db`` sub-directory.

    The input files are read and parsed *before* the existing collection is
    deleted, so a missing file or a document that yields no sections leaves
    any previously built collection untouched.

    Returns:
        True when the sections were stored, False when nothing could be
        loaded (no sections found, or any exception was raised; exceptions
        are logged, not propagated).
    """
    try:
        # Set up paths
        base_path = os.path.dirname(os.path.abspath(__file__))
        doc_path = os.path.join(base_path, 'a2023-45.txt')
        index_path = os.path.join(base_path, 'index.txt')
        chroma_path = os.path.join(base_path, 'chroma_db')

        logger.info(f"Loading documents from {doc_path} and {index_path}")

        # Read and validate files first: failing here must not cost us the
        # collection that is already on disk.
        with open(doc_path, 'r', encoding='utf-8') as f:
            document = f.read().strip()
        with open(index_path, 'r', encoding='utf-8') as f:
            index_content = [line.strip() for line in f if line.strip()]

        # Process document into sections
        sections = _split_into_sections(document, index_content)
        if not sections:
            logger.error(
                "No sections found in document; existing collection left unchanged"
            )
            return False

        # Prepare data for ChromaDB (section numbers are 1-based)
        documents = [section["content"] for section in sections]
        metadatas = [
            {
                "title": section["title"],
                "source": "a2023-45.txt",
                "section_number": number,
            }
            for number, section in enumerate(sections, start=1)
        ]
        ids = [f"section_{number}" for number in range(1, len(sections) + 1)]

        # Ensure ChromaDB directory exists
        os.makedirs(chroma_path, exist_ok=True)

        # Initialize ChromaDB
        chroma_client = chromadb.PersistentClient(path=chroma_path)
        embedding_function = SentenceTransformerEmbeddings()

        # Create new collection (delete if exists).
        # NOTE(review): list_collections() appears to return collection objects
        # in some ChromaDB releases and plain names in others; getattr accepts
        # both -- confirm against the installed version.
        existing_names = [
            getattr(col, "name", col) for col in chroma_client.list_collections()
        ]
        if "legal_documents" in existing_names:
            chroma_client.delete_collection("legal_documents")
        collection = chroma_client.create_collection(
            name="legal_documents",
            embedding_function=embedding_function
        )

        # Add to ChromaDB
        collection.add(
            documents=documents,
            metadatas=metadatas,
            ids=ids
        )

        logger.info(f"Successfully loaded {len(documents)} sections into ChromaDB")
        return True
    except Exception as e:
        # Top-level boundary: log with traceback and report failure to the caller.
        logger.exception(f"Error loading documents: {e}")
        return False
if __name__ == "__main__":
    # Script entry point: run the loader once and report the outcome on stdout.
    outcome = (
        "Documents successfully loaded into ChromaDB"
        if load_documents()
        else "Failed to load documents into ChromaDB"
    )
    print(outcome)