veerukhannan committed on
Commit
bb05d9c
·
verified ·
1 Parent(s): 579fc75

Create add_embeddings.py

Browse files
Files changed (1) hide show
  1. add_embeddings.py +84 -0
add_embeddings.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import chromadb
2
+ from chromadb.utils import embedding_functions
3
+ from tqdm import tqdm
4
+ import os
5
+ from typing import List, Dict
6
+
7
class TextEmbedder:
    """Chunk a text file and store the chunks in a ChromaDB collection.

    Documents are embedded with the ``all-MiniLM-L6-v2`` sentence-transformer
    embedding function and the collection is configured with
    ``{"hnsw:space": "cosine"}``.
    """

    # Number of chunks handed to ChromaDB per ``add`` call. Sending batches
    # instead of single documents avoids one embedding/insert round trip
    # per chunk.
    _BATCH_SIZE = 100

    def __init__(self, collection_name: str = "text_collection") -> None:
        """Create the client, the embedding function and the collection.

        Args:
            collection_name: Name of the ChromaDB collection to write to.
        """
        # NOTE(review): ``chromadb.Client()`` is presumably the default
        # in-memory client -- confirm whether the embeddings are expected
        # to outlive this process.
        self.chroma_client = chromadb.Client()

        self.embedding_function = (
            embedding_functions.SentenceTransformerEmbeddingFunction(
                model_name="all-MiniLM-L6-v2"
            )
        )

        # get_or_create: building a second embedder for the same name
        # reuses the existing collection instead of failing on creation.
        self.collection = self.chroma_client.get_or_create_collection(
            name=collection_name,
            embedding_function=self.embedding_function,
            metadata={"hnsw:space": "cosine"},
        )

    @staticmethod
    def _chunk_text(text: str, chunk_size: int) -> List[str]:
        """Split *text* into consecutive pieces of at most *chunk_size* chars.

        Raises:
            ValueError: If *chunk_size* is not a positive integer.
        """
        if chunk_size <= 0:
            raise ValueError(
                f"chunk_size must be a positive integer, got {chunk_size}"
            )
        return [
            text[start:start + chunk_size]
            for start in range(0, len(text), chunk_size)
        ]

    def process_files(self, text_file: str, index_file: str, chunk_size: int = 512) -> bool:
        """Read, chunk and add the text file to the collection.

        The text is cut into fixed-size character chunks. Chunk ``i`` is
        stored under the id ``doc_<i>`` with metadata holding its chunk
        number, the base name of *text_file* as ``source`` and, as
        ``index``, the stripped ``i``-th line of *index_file* (or
        ``"Chunk <i+1>"`` when the index file has fewer lines than there
        are chunks).

        Args:
            text_file: Path of the UTF-8 text file to embed.
            index_file: Path of the UTF-8 file with one label per line.
            chunk_size: Maximum number of characters per chunk.

        Returns:
            True when every chunk was added, False when any error occurred
            (the error is printed, not raised).
        """
        try:
            print("Reading main text file...")
            with open(text_file, 'r', encoding='utf-8') as f:
                text_content = f.read()

            print("Reading index file...")
            with open(index_file, 'r', encoding='utf-8') as f:
                index_lines = [line.strip() for line in f]

            chunks = self._chunk_text(text_content, chunk_size)
            print(f"Created {len(chunks)} chunks from text")

            # Record the file that was actually processed rather than a
            # fixed name, so the metadata stays correct for other inputs.
            source_name = os.path.basename(text_file)

            print("Adding documents to ChromaDB...")
            for start in tqdm(range(0, len(chunks), self._BATCH_SIZE)):
                batch = chunks[start:start + self._BATCH_SIZE]
                numbers = range(start, start + len(batch))
                self.collection.add(
                    documents=batch,
                    ids=[f"doc_{n}" for n in numbers],
                    metadatas=[
                        {
                            "index": (
                                index_lines[n]
                                if n < len(index_lines)
                                else f"Chunk {n + 1}"
                            ),
                            "chunk_number": n,
                            "source": source_name,
                        }
                        for n in numbers
                    ],
                )

            print("Successfully processed all documents!")
            return True

        except Exception as e:
            # Top-level boundary of the pipeline: report and signal failure
            # through the return value, as callers expect a bool.
            print(f"Error processing files: {e}")
            return False
67
+
68
def main() -> int:
    """Embed ``a2023-45.txt`` (labelled by ``index.txt``) into ChromaDB.

    Returns:
        0 when processing succeeded and 1 when it failed, so the value can
        be used directly as the process exit status.
    """
    embedder = TextEmbedder()

    success = embedder.process_files(
        text_file='a2023-45.txt',
        index_file='index.txt',
    )

    if success:
        print("Embedding process completed successfully!")
        return 0

    print("Embedding process failed!")
    return 1


if __name__ == "__main__":
    # Propagate failure to the calling shell instead of always exiting 0.
    raise SystemExit(main())