# Residue from the hosting site's file viewer, kept as a comment so it is not parsed as code:
# Space status "Sleeping"; file size 2,827 bytes; commit bb05d9c; line-number gutter 1-84 omitted.
import chromadb
from chromadb.utils import embedding_functions
from tqdm import tqdm
import os
from typing import List, Dict
class TextEmbedder:
    """Split a text file into fixed-size chunks and store them in ChromaDB.

    Each chunk is stored as one document with the id ``doc_<n>`` and is
    embedded by the collection's SentenceTransformer embedding function
    ("all-MiniLM-L6-v2"). The collection is configured for cosine distance.
    """

    # Number of chunks sent to ChromaDB per call. Batching lets the embedding
    # function process several documents at once instead of one per call.
    _BATCH_SIZE = 64

    def __init__(self, collection_name: str = "text_collection"):
        """Create the ChromaDB client, embedding function and collection.

        Args:
            collection_name: Name of the collection to open. An existing
                collection with this name is reused rather than treated as
                an error.
        """
        # Initialize ChromaDB client
        self.chroma_client = chromadb.Client()
        # Initialize embedding function
        self.embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name="all-MiniLM-L6-v2"
        )
        # get_or_create_collection (rather than create_collection) so that a
        # second TextEmbedder using the same name does not fail because the
        # collection already exists.
        self.collection = self.chroma_client.get_or_create_collection(
            name=collection_name,
            embedding_function=self.embedding_function,
            metadata={"hnsw:space": "cosine"},
        )

    @staticmethod
    def _split_text(text: str, chunk_size: int) -> List[str]:
        """Return consecutive ``chunk_size``-character slices of ``text``.

        The final slice may be shorter; empty text yields an empty list.
        ``chunk_size`` must be positive (validated by the caller).
        """
        return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]

    def process_files(self, text_file: str, index_file: str, chunk_size: int = 512) -> bool:
        """Chunk ``text_file`` and store the chunks with labels from ``index_file``.

        Chunk ``i`` is labelled with line ``i`` of the index file (stripped of
        surrounding whitespace); chunks beyond the end of the index file are
        labelled ``"Chunk <i+1>"``. The ``source`` metadata is the base name
        of ``text_file``.

        Args:
            text_file: Path of the UTF-8 text file to embed.
            index_file: Path of the UTF-8 file holding one label per line.
            chunk_size: Number of characters per chunk; must be positive.

        Returns:
            True when every chunk was stored, False if any error occurred
            (the error is printed, not raised).
        """
        try:
            # A negative size would silently yield zero chunks and report
            # success, so reject non-positive sizes explicitly.
            if chunk_size <= 0:
                raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

            # Read main text file
            print("Reading main text file...")
            with open(text_file, 'r', encoding='utf-8') as f:
                text_content = f.read()

            # Read index file
            print("Reading index file...")
            with open(index_file, 'r', encoding='utf-8') as f:
                index_lines = [line.strip() for line in f]

            # Create chunks from text content
            chunks = self._split_text(text_content, chunk_size)
            print(f"Created {len(chunks)} chunks from text")

            # Record where the chunks actually came from instead of a fixed name.
            source_name = os.path.basename(text_file)

            # Add documents to collection
            print("Adding documents to ChromaDB...")
            with tqdm(total=len(chunks)) as progress:
                for start in range(0, len(chunks), self._BATCH_SIZE):
                    batch = chunks[start:start + self._BATCH_SIZE]
                    positions = range(start, start + len(batch))
                    # upsert so that re-processing into a reused collection
                    # refreshes documents that share an id. Chunks left over
                    # from an earlier, longer run are not removed.
                    self.collection.upsert(
                        documents=batch,
                        ids=[f"doc_{i}" for i in positions],
                        metadatas=[
                            {
                                "index": index_lines[i] if i < len(index_lines) else f"Chunk {i+1}",
                                "chunk_number": i,
                                "source": source_name,
                            }
                            for i in positions
                        ],
                    )
                    progress.update(len(batch))

            print("Successfully processed all documents!")
            return True
        except Exception as e:
            # Boundary of the pipeline: callers rely on a boolean result, so
            # report the failure and signal it through the return value.
            print(f"Error processing files: {e}")
            return False
def main():
    """Embed 'a2023-45.txt' using labels from 'index.txt' and print the outcome."""
    # Build the embedder and run the pipeline in one step; the boolean result
    # tells us which status line to print.
    succeeded = TextEmbedder().process_files(
        text_file='a2023-45.txt',
        index_file='index.txt',
    )
    outcome = (
        "Embedding process completed successfully!"
        if succeeded
        else "Embedding process failed!"
    )
    print(outcome)
# Run the embedding pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()