Spaces:

veerukhannan
/

advisor

Sleeping

App Files Files Community

veerukhannan committed on Nov 23, 2024

Commit

bb05d9c

verified ·

1 Parent(s): 579fc75

Create add_embeddings.py

Browse files

Files changed (1) hide show

add_embeddings.py +84 -0

add_embeddings.py ADDED Viewed

	@@ -0,0 +1,84 @@

+import chromadb
+from chromadb.utils import embedding_functions
+from tqdm import tqdm
+import os
+from typing import List, Dict
class TextEmbedder:
    """Split a text file into fixed-size chunks and store them in ChromaDB.

    Documents are embedded with the ``all-MiniLM-L6-v2`` sentence-transformer
    model, and the collection is configured for cosine distance
    (``hnsw:space`` = ``cosine``).
    """

    # Number of chunks sent to ChromaDB per ``collection.add`` call. Batching
    # lets the embedding function encode many chunks at once instead of
    # paying the per-call overhead for every single chunk.
    _ADD_BATCH_SIZE = 100

    def __init__(self, collection_name: str = "text_collection"):
        """Create the ChromaDB client, embedding function and collection.

        Args:
            collection_name: Name of the collection to create or reuse.
        """
        # Initialize ChromaDB client
        self.chroma_client = chromadb.Client()
        # Initialize embedding function
        self.embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name="all-MiniLM-L6-v2"
        )
        # get_or_create_collection (rather than create_collection) so that a
        # second TextEmbedder with the same name reuses the collection
        # instead of failing because it already exists.
        self.collection = self.chroma_client.get_or_create_collection(
            name=collection_name,
            embedding_function=self.embedding_function,
            metadata={"hnsw:space": "cosine"}
        )

    @staticmethod
    def _split_into_chunks(text: str, chunk_size: int) -> List[str]:
        """Return consecutive slices of ``text``, each at most ``chunk_size`` characters.

        Raises:
            ValueError: If ``chunk_size`` is not a positive integer.
        """
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
        return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]

    def process_files(self, text_file: str, index_file: str, chunk_size: int = 512) -> bool:
        """Chunk ``text_file`` and add every chunk to the collection.

        The i-th line of ``index_file`` (stripped) is stored as the ``index``
        metadata of the i-th chunk; chunks beyond the last index line are
        labelled ``"Chunk <n>"`` (1-based). Each chunk gets the id
        ``doc_<i>`` (0-based) and the base name of ``text_file`` as its
        ``source`` metadata.

        Args:
            text_file: Path of the UTF-8 text file to embed.
            index_file: Path of the UTF-8 file holding one label per line.
            chunk_size: Maximum number of characters per chunk; must be > 0.

        Returns:
            True when all chunks were added, False if anything failed. The
            error is printed rather than raised.
        """
        try:
            # Read main text file
            print("Reading main text file...")
            with open(text_file, 'r', encoding='utf-8') as f:
                text_content = f.read()
            # Read index file
            print("Reading index file...")
            with open(index_file, 'r', encoding='utf-8') as f:
                index_lines = f.readlines()
            # Create chunks from text content (rejects chunk_size <= 0, which
            # would otherwise yield zero chunks and a misleading success).
            chunks = self._split_into_chunks(text_content, chunk_size)
            print(f"Created {len(chunks)} chunks from text")
            # Record the file that was actually processed, not a fixed name.
            source_name = os.path.basename(text_file)
            # Add documents to collection
            print("Adding documents to ChromaDB...")
            with tqdm(total=len(chunks)) as progress:
                for start in range(0, len(chunks), self._ADD_BATCH_SIZE):
                    batch = chunks[start:start + self._ADD_BATCH_SIZE]
                    chunk_numbers = range(start, start + len(batch))
                    self.collection.add(
                        documents=batch,
                        ids=[f"doc_{i}" for i in chunk_numbers],
                        metadatas=[
                            {
                                # Get corresponding index line if available
                                "index": index_lines[i].strip() if i < len(index_lines) else f"Chunk {i+1}",
                                "chunk_number": i,
                                "source": source_name,
                            }
                            for i in chunk_numbers
                        ],
                    )
                    # Keep the progress bar counted in chunks, not batches.
                    progress.update(len(batch))
            print("Successfully processed all documents!")
            return True
        except Exception as e:
            # Boundary handler: callers rely on the boolean result, so report
            # the failure instead of propagating it.
            print(f"Error processing files: {str(e)}")
            return False
def main() -> int:
    """Embed ``a2023-45.txt`` (labelled by ``index.txt``) and report the outcome.

    Returns:
        0 when ``process_files`` reported success, 1 otherwise, so the value
        can be used directly as a process exit status.
    """
    # Initialize embedder
    embedder = TextEmbedder()
    # Process files
    success = embedder.process_files(
        text_file='a2023-45.txt',
        index_file='index.txt'
    )
    if success:
        print("Embedding process completed successfully!")
        return 0
    print("Embedding process failed!")
    return 1


if __name__ == "__main__":
    # Exit non-zero on failure so shells and CI can detect a failed run.
    raise SystemExit(main())