import os
from typing import Dict, List

import chromadb
from chromadb.utils import embedding_functions
from tqdm import tqdm


class TextEmbedder:
    """Split a text file into fixed-size chunks and store them in ChromaDB.

    Each chunk is stored as one document in a single collection, embedded
    with the ``all-MiniLM-L6-v2`` sentence-transformer model and indexed
    with cosine distance (``hnsw:space = cosine``).
    """

    def __init__(self, collection_name: str = "text_collection"):
        """Create the ChromaDB client, embedding function and collection.

        Args:
            collection_name: Name of the collection documents are added to.
        """
        # Initialize ChromaDB client
        self.chroma_client = chromadb.Client()

        # Initialize embedding function
        self.embedding_function = (
            embedding_functions.SentenceTransformerEmbeddingFunction(
                model_name="all-MiniLM-L6-v2"
            )
        )

        # get_or_create_collection (rather than create_collection) so that
        # building a second TextEmbedder with the same collection name on a
        # client that already holds it does not fail.
        self.collection = self.chroma_client.get_or_create_collection(
            name=collection_name,
            embedding_function=self.embedding_function,
            metadata={"hnsw:space": "cosine"},
        )

    @staticmethod
    def _chunk_text(text: str, chunk_size: int) -> List[str]:
        """Return consecutive ``chunk_size``-character slices of ``text``."""
        return [
            text[start:start + chunk_size]
            for start in range(0, len(text), chunk_size)
        ]

    def process_files(
        self,
        text_file: str,
        index_file: str,
        chunk_size: int = 512,
        batch_size: int = 100,
    ) -> bool:
        """Chunk ``text_file`` and add every chunk to the collection.

        Chunk ``i`` is stored under the id ``doc_{i}`` with metadata holding
        the stripped ``i``-th line of ``index_file`` (or ``"Chunk {i+1}"``
        when the index file has fewer lines than there are chunks), the
        chunk number, and the base name of ``text_file`` as its source.

        Args:
            text_file: Path of the UTF-8 text file to embed.
            index_file: Path of the UTF-8 file whose lines label the chunks.
            chunk_size: Number of characters per chunk; must be positive.
            batch_size: Number of chunks sent per ``collection.add`` call;
                must be positive.

        Returns:
            True when all chunks were added, False if any error occurred
            (the error is printed, not raised).
        """
        try:
            # Validate inside the try block so bad arguments are reported
            # through the same False-returning path as every other failure.
            if chunk_size <= 0:
                raise ValueError(
                    f"chunk_size must be positive, got {chunk_size}"
                )
            if batch_size <= 0:
                raise ValueError(
                    f"batch_size must be positive, got {batch_size}"
                )

            # Read main text file
            print("Reading main text file...")
            with open(text_file, 'r', encoding='utf-8') as f:
                text_content = f.read()

            # Read index file
            print("Reading index file...")
            with open(index_file, 'r', encoding='utf-8') as f:
                index_lines = f.readlines()

            # Create chunks from text content
            chunks = self._chunk_text(text_content, chunk_size)
            print(f"Created {len(chunks)} chunks from text")

            # Record the file actually processed instead of a fixed name.
            source_name = os.path.basename(text_file)

            # Add documents to collection
            print("Adding documents to ChromaDB...")
            with tqdm(total=len(chunks)) as progress:
                for start in range(0, len(chunks), batch_size):
                    batch = chunks[start:start + batch_size]
                    ids: List[str] = []
                    metadatas: List[Dict[str, object]] = []
                    for i in range(start, start + len(batch)):
                        # Get corresponding index line if available
                        if i < len(index_lines):
                            index_text = index_lines[i].strip()
                        else:
                            index_text = f"Chunk {i+1}"
                        ids.append(f"doc_{i}")
                        metadatas.append({
                            "index": index_text,
                            "chunk_number": i,
                            "source": source_name,
                        })

                    # One add() per batch instead of one per chunk.
                    self.collection.add(
                        documents=batch,
                        ids=ids,
                        metadatas=metadatas,
                    )
                    progress.update(len(batch))

            print("Successfully processed all documents!")
            return True

        except Exception as e:
            # Top-level boundary: callers rely on the boolean result.
            print(f"Error processing files: {str(e)}")
            return False


def main():
    """Embed ``a2023-45.txt`` using ``index.txt`` as chunk labels."""
    # Initialize embedder
    embedder = TextEmbedder()

    # Process files
    success = embedder.process_files(
        text_file='a2023-45.txt',
        index_file='index.txt'
    )

    if success:
        print("Embedding process completed successfully!")
    else:
        print("Embedding process failed!")


if __name__ == "__main__":
    main()