Spaces:

veerukhannan
/

advisor

Sleeping

App Files Files Community

veerukhannan committed on Nov 23, 2024

Commit

5531e47

verified ·

1 Parent(s): c784c97

Delete add_embeddings.py

Browse files

Files changed (1) hide show

add_embeddings.py +0 -84

add_embeddings.py DELETED Viewed

@@ -1,84 +0,0 @@
-import chromadb
-from chromadb.utils import embedding_functions
-from tqdm import tqdm
-import os
-from typing import List, Dict
-class TextEmbedder:
-    """Chunk a text file and store the chunks in a ChromaDB collection.
-
-    Documents are embedded with the ``all-MiniLM-L6-v2`` sentence-transformer
-    model, and the collection is configured with ``hnsw:space`` set to
-    ``cosine``.
-    """
-
-    def __init__(self, collection_name: str = "text_collection"):
-        """Create the ChromaDB client, the embedding function and the collection.
-
-        Args:
-            collection_name: Name of the collection that receives the chunks.
-        """
-        self.chroma_client = chromadb.Client()
-        self.embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
-            model_name="all-MiniLM-L6-v2"
-        )
-        # get_or_create_collection rather than create_collection: building a
-        # second embedder with the same collection name would otherwise fail
-        # because the collection already exists.
-        self.collection = self.chroma_client.get_or_create_collection(
-            name=collection_name,
-            embedding_function=self.embedding_function,
-            metadata={"hnsw:space": "cosine"}
-        )
-
-    @staticmethod
-    def _chunk_text(text: str, chunk_size: int) -> List[str]:
-        """Split *text* into consecutive pieces of at most *chunk_size* characters."""
-        return [text[pos:pos + chunk_size] for pos in range(0, len(text), chunk_size)]
-
-    @staticmethod
-    def _index_label(index_lines: List[str], position: int) -> str:
-        """Return the stripped index line at *position*, or "Chunk <n>" when there is none."""
-        if position < len(index_lines):
-            return index_lines[position].strip()
-        return f"Chunk {position + 1}"
-
-    def process_files(self, text_file: str, index_file: str, chunk_size: int = 512,
-                      batch_size: int = 100) -> bool:
-        """Split ``text_file`` into fixed-size chunks and add them to the collection.
-
-        Chunk ``i`` is stored with id ``doc_<i>`` and labelled with line ``i`` of
-        ``index_file`` when that line exists, otherwise with ``"Chunk <i+1>"``.
-        NOTE(review): pairing chunks with index lines purely by position is
-        kept from the original code - confirm this is the intended mapping.
-
-        Args:
-            text_file: Path of the UTF-8 text file to embed.
-            index_file: Path of the UTF-8 file whose lines label the chunks.
-            chunk_size: Maximum number of characters per chunk; must be positive.
-            batch_size: Number of chunks sent to the collection per ``add``
-                call; must be positive.
-
-        Returns:
-            True when every chunk was added. False when any error occurred;
-            the error is printed instead of being raised.
-        """
-        try:
-            # A zero step makes range() fail with an unrelated message and a
-            # negative step silently yields no chunks, so reject both up front.
-            if chunk_size <= 0 or batch_size <= 0:
-                raise ValueError("chunk_size and batch_size must be positive integers")
-            print("Reading main text file...")
-            with open(text_file, 'r', encoding='utf-8') as f:
-                text_content = f.read()
-            print("Reading index file...")
-            with open(index_file, 'r', encoding='utf-8') as f:
-                index_lines = f.readlines()
-            chunks = self._chunk_text(text_content, chunk_size)
-            print(f"Created {len(chunks)} chunks from text")
-            # Record the file that was actually processed instead of a fixed name.
-            source_name = os.path.basename(text_file)
-            print("Adding documents to ChromaDB...")
-            # Add in batches so several chunks are handed over per call
-            # instead of one call per chunk.
-            for start in tqdm(range(0, len(chunks), batch_size)):
-                batch = chunks[start:start + batch_size]
-                positions = range(start, start + len(batch))
-                self.collection.add(
-                    documents=batch,
-                    ids=[f"doc_{pos}" for pos in positions],
-                    metadatas=[
-                        {
-                            "index": self._index_label(index_lines, pos),
-                            "chunk_number": pos,
-                            "source": source_name,
-                        }
-                        for pos in positions
-                    ],
-                )
-            print("Successfully processed all documents!")
-            return True
-        except Exception as e:
-            # Boundary handler: callers rely on a boolean result, not an exception.
-            print(f"Error processing files: {e}")
-            return False
-def main():
-    """Embed the bundled text file and print whether the run succeeded."""
-    # Default collection name; construction sets up the ChromaDB client
-    # and collection.
-    text_embedder = TextEmbedder()
-    # process_files reports failure through its boolean result.
-    finished_ok = text_embedder.process_files(text_file='a2023-45.txt', index_file='index.txt')
-    if not finished_ok:
-        print("Embedding process failed!")
-        return
-    print("Embedding process completed successfully!")
-# Only run the embedding job when executed as a script, not on import.
-if __name__ == "__main__":
-    main()