File size: 2,827 Bytes
bb05d9c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import chromadb
from chromadb.utils import embedding_functions
from tqdm import tqdm
import os
from typing import List, Dict

class TextEmbedder:
    """Split a text file into fixed-size chunks and store them in ChromaDB.

    On construction a ChromaDB client, a sentence-transformer embedding
    function and a collection are created; ``process_files`` then fills
    that collection.
    """

    def __init__(self, collection_name: str = "text_collection",
                 model_name: str = "all-MiniLM-L6-v2"):
        """Create the client, the embedding function and the collection.

        Args:
            collection_name: Name of the ChromaDB collection to create.
            model_name: Sentence-transformer model used for embeddings.
        """
        # Initialize ChromaDB client
        self.chroma_client = chromadb.Client()

        # Initialize embedding function
        self.embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name=model_name
        )

        # Create collection; the metadata asks for cosine space in the
        # HNSW index.
        self.collection = self.chroma_client.create_collection(
            name=collection_name,
            embedding_function=self.embedding_function,
            metadata={"hnsw:space": "cosine"},
        )

    def process_files(self, text_file: str, index_file: str,
                      chunk_size: int = 512, batch_size: int = 100) -> bool:
        """Chunk ``text_file`` and add every chunk to the collection.

        The text is cut into consecutive slices of ``chunk_size``
        characters. Chunk ``i`` is stored under the id ``doc_{i}`` with
        metadata holding the stripped ``i``-th line of ``index_file`` (or
        ``"Chunk {i+1}"`` when the index file has fewer lines), the chunk
        number, and the base name of ``text_file`` as its source.

        Args:
            text_file: Path of the UTF-8 text file to embed.
            index_file: Path of the UTF-8 file with one label per chunk.
            chunk_size: Number of characters per chunk; must be positive.
            batch_size: Number of chunks sent per ``collection.add`` call;
                must be positive.

        Returns:
            True when all chunks were added, False when the arguments are
            invalid or any error occurred (the error is printed).
        """
        # A non-positive chunk_size would either crash inside range() or
        # silently produce zero chunks and still report success.
        if chunk_size <= 0 or batch_size <= 0:
            print("Error processing files: chunk_size and batch_size must be positive")
            return False

        try:
            # Read main text file
            print("Reading main text file...")
            with open(text_file, 'r', encoding='utf-8') as f:
                text_content = f.read()

            # Read index file
            print("Reading index file...")
            with open(index_file, 'r', encoding='utf-8') as f:
                index_lines = [line.strip() for line in f]

            # Create chunks from text content
            chunks = [
                text_content[start:start + chunk_size]
                for start in range(0, len(text_content), chunk_size)
            ]
            print(f"Created {len(chunks)} chunks from text")

            # Record the file that was actually processed, not a fixed name.
            source_name = os.path.basename(text_file)

            # Add documents to collection in batches so the embedding
            # function receives several documents per call.
            print("Adding documents to ChromaDB...")
            for start in tqdm(range(0, len(chunks), batch_size)):
                batch = chunks[start:start + batch_size]
                positions = range(start, start + len(batch))
                self.collection.add(
                    documents=batch,
                    ids=[f"doc_{i}" for i in positions],
                    metadatas=[
                        {
                            "index": index_lines[i] if i < len(index_lines) else f"Chunk {i+1}",
                            "chunk_number": i,
                            "source": source_name,
                        }
                        for i in positions
                    ],
                )

            print("Successfully processed all documents!")
            return True

        except Exception as e:
            # Boundary handler: callers rely on the boolean result, so any
            # failure (I/O, embedding, database) is reported and mapped to
            # False instead of propagating.
            print(f"Error processing files: {e}")
            return False

def main():
    """Embed 'a2023-45.txt' using 'index.txt' for labels and report the outcome."""
    # Build the embedder and run the whole pipeline in one go.
    outcome = TextEmbedder().process_files(
        text_file='a2023-45.txt',
        index_file='index.txt',
    )

    # process_files reports failure through its return value, not an exception.
    verdict = (
        "Embedding process completed successfully!"
        if outcome
        else "Embedding process failed!"
    )
    print(verdict)

# Run the embedding pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()