Spaces:
Sleeping
Sleeping
File size: 1,637 Bytes
819bacd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 |
import json
from typing import List
from sentence_transformers import SentenceTransformer
from src.db_local_storage.files_db import VECTOR_FILES_DIRECTORY
from src.db_local_storage.vector_files_db import vector_files_db as EMBEDDING_DATA
class CreateEmbeddingsFeature:
    """Chunk a document's text, embed each chunk, and record the results
    in the in-memory EMBEDDING_DATA store.

    NOTE(review): persistence to VECTOR_FILES_DIRECTORY is not implemented —
    data lives only in the imported EMBEDDING_DATA dict. TODO: decide whether
    embeddings should be flushed to disk after each document.
    """

    # Shared, lazily-loaded model so repeated calls don't reload the
    # weights from disk on every request.
    _model = None

    @staticmethod
    def chunk_text(text: str, chunk_size: int = 512) -> List[str]:
        """Split *text* into consecutive chunks of at most *chunk_size*
        characters. Returns an empty list for empty input; the final chunk
        may be shorter than *chunk_size*.
        """
        return [text[i : i + chunk_size] for i in range(0, len(text), chunk_size)]

    @classmethod
    def _get_model(cls):
        """Return the shared SentenceTransformer, loading it on first use."""
        if cls._model is None:
            # TODO: Check model
            cls._model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
        return cls._model

    @staticmethod
    async def create_embeddings(text: str, filename: str) -> List:
        """Embed *text* chunk-by-chunk and store the result under a new
        document key in EMBEDDING_DATA.

        :param text: raw document text to embed.
        :param filename: original file name, kept in the document metadata.
        :returns: the list of embedding entries added for this document
            (each entry holds the embedding vector plus chunk metadata).
        """
        model = CreateEmbeddingsFeature._get_model()
        chunks = CreateEmbeddingsFeature.chunk_text(text)

        # `doc_id` instead of `id` — avoid shadowing the builtin.
        doc_id = len(EMBEDDING_DATA) + 1
        document_index = f"document_{doc_id}"
        EMBEDDING_DATA[document_index] = {
            "metadata": {"id": doc_id, "filename": filename, "chunks": len(chunks)},
            "data": [],
        }

        for i, chunk in enumerate(chunks):
            # .tolist() so the stored vector is a plain (JSON-serializable) list.
            embedding = model.encode(chunk).tolist()
            EMBEDDING_DATA[document_index]["data"].append(
                {
                    "embedding": embedding,
                    "metadata": {
                        "chunk_index": i,
                        "original_text": chunk,
                        "document_id": document_index,
                    },
                }
            )

        # Was `return` (None) despite the `-> List` annotation; return the
        # entries actually created so callers can use/inspect them.
        return EMBEDDING_DATA[document_index]["data"]
|