veerukhannan committed on
Commit
5531e47
·
verified ·
1 Parent(s): c784c97

Delete add_embeddings.py

Browse files
Files changed (1) hide show
  1. add_embeddings.py +0 -84
add_embeddings.py DELETED
@@ -1,84 +0,0 @@
1
- import chromadb
2
- from chromadb.utils import embedding_functions
3
- from tqdm import tqdm
4
- import os
5
- from typing import List, Dict
6
-
7
class TextEmbedder:
    """Split a text file into fixed-size chunks and store them in ChromaDB.

    Each chunk is added to a single collection together with metadata that
    records its position, an optional label taken from an index file, and
    the name of the file it came from.
    """

    def __init__(self, collection_name: str = "text_collection"):
        """Set up the ChromaDB client, embedding function and collection.

        Args:
            collection_name: Name of the collection that receives the chunks.
        """
        # NOTE(review): chromadb.Client() is presumably the in-memory client,
        # so stored embeddings would not outlive this process -- confirm.
        self.chroma_client = chromadb.Client()

        # Sentence-transformer model used to embed every document.
        self.embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name="all-MiniLM-L6-v2"
        )

        # The collection is created with the "hnsw:space" metadata set to
        # "cosine".
        self.collection = self.chroma_client.create_collection(
            name=collection_name,
            embedding_function=self.embedding_function,
            metadata={"hnsw:space": "cosine"}
        )

    @staticmethod
    def _chunk_text(text: str, chunk_size: int) -> List[str]:
        """Return consecutive ``chunk_size``-character slices of ``text``."""
        return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

    @staticmethod
    def _build_batch(chunks: List[str], index_lines: List[str], source: str,
                     start: int, batch_size: int) -> Dict[str, List]:
        """Build the ``collection.add`` keyword arguments for one batch.

        Covers chunks ``start`` .. ``start + batch_size - 1`` (clipped to the
        end of ``chunks``). A chunk is labelled with the stripped index line
        at the same position, or ``"Chunk <n>"`` (1-based) when the index
        file has no line for it.
        """
        stop = min(start + batch_size, len(chunks))
        positions = range(start, stop)
        return {
            "documents": chunks[start:stop],
            "ids": [f"doc_{i}" for i in positions],
            "metadatas": [
                {
                    "index": (
                        index_lines[i].strip()
                        if i < len(index_lines)
                        else f"Chunk {i+1}"
                    ),
                    "chunk_number": i,
                    "source": source,
                }
                for i in positions
            ],
        }

    def process_files(self, text_file: str, index_file: str, chunk_size: int = 512,
                      batch_size: int = 100) -> bool:
        """Process main text file and index file.

        Reads ``text_file``, cuts it into ``chunk_size``-character chunks and
        adds them to the collection in batches of ``batch_size`` documents.

        Args:
            text_file: Path of the UTF-8 text file to embed.
            index_file: Path of a UTF-8 file whose i-th line labels chunk i.
            chunk_size: Number of characters per chunk; must be positive.
            batch_size: Number of chunks sent per ``collection.add`` call;
                must be positive.

        Returns:
            True when every chunk was added, False when any error occurred
            (the error message is printed).
        """
        try:
            # Reject sizes that would otherwise fail obscurely (0) or
            # silently store nothing while reporting success (negative).
            if chunk_size <= 0:
                raise ValueError(f"chunk_size must be positive, got {chunk_size}")
            if batch_size <= 0:
                raise ValueError(f"batch_size must be positive, got {batch_size}")

            # Read main text file
            print("Reading main text file...")
            with open(text_file, 'r', encoding='utf-8') as f:
                text_content = f.read()

            # Read index file
            print("Reading index file...")
            with open(index_file, 'r', encoding='utf-8') as f:
                index_lines = f.readlines()

            # Create chunks from text content
            chunks = self._chunk_text(text_content, chunk_size)
            print(f"Created {len(chunks)} chunks from text")

            # Record the file that was actually processed rather than a
            # fixed file name.
            source = os.path.basename(text_file)

            # Add documents to collection, one batch per call
            print("Adding documents to ChromaDB...")
            for start in tqdm(range(0, len(chunks), batch_size)):
                batch = self._build_batch(
                    chunks, index_lines, source, start, batch_size
                )
                self.collection.add(**batch)

            print("Successfully processed all documents!")
            return True

        except Exception as e:
            # Boundary handler: callers rely on a boolean result, so every
            # failure is reported and mapped to False.
            print(f"Error processing files: {str(e)}")
            return False
67
-
68
def main():
    """Embed ``a2023-45.txt`` using ``index.txt`` and print the outcome."""
    # Build the embedder and run the whole pipeline in one go.
    outcome = TextEmbedder().process_files(
        text_file='a2023-45.txt',
        index_file='index.txt',
    )

    # process_files reports success as a boolean; turn it into a message.
    message = (
        "Embedding process completed successfully!"
        if outcome
        else "Embedding process failed!"
    )
    print(message)


if __name__ == "__main__":
    main()