import json
from typing import List

from app.db_local_storage.vector_files_db import vector_files_db as EMBEDDING_DATA
from app.modules.model import model, qa_pipeline


class CreateEmbeddingsFeature:
    """Split a document into chunks and store one embedding per chunk."""

    @staticmethod
    async def chunk_text(text: str, chunk_size: int = 512) -> List[str]:
        """Split ``text`` into consecutive slices of ``chunk_size`` characters.

        The final chunk may be shorter than ``chunk_size``. An empty ``text``
        yields an empty list.

        Args:
            text: The text to split.
            chunk_size: Maximum number of characters per chunk; must be >= 1.

        Returns:
            The chunks, in their original order.

        Raises:
            ValueError: If ``chunk_size`` is smaller than 1.
        """
        # A negative step would make range() empty and silently drop the whole
        # text; a zero step raises an unrelated-looking range() error. Reject
        # both up front with a clear message.
        if chunk_size < 1:
            raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

        return [
            text[start : start + chunk_size]
            for start in range(0, len(text), chunk_size)
        ]

    @staticmethod
    async def create_embeddings(text: str, filename: str) -> None:
        """Embed ``text`` chunk by chunk and record it in ``EMBEDDING_DATA``.

        The text is split with :meth:`chunk_text` (default chunk size), each
        chunk is passed to ``model.encode`` and the result is converted with
        ``.tolist()``. A new ``"document_<id>"`` entry is then stored with:

        * ``"metadata"``: ``{"id", "filename", "chunks"}`` where ``"chunks"``
          is the number of chunks.
        * ``"data"``: one dict per chunk holding ``"embedding"`` and a
          ``"metadata"`` dict (``"chunk_index"``, ``"original_text"``,
          ``"document_id"``).

        Args:
            text: Full document text to embed.
            filename: Name recorded in the document metadata.

        Returns:
            None. The result is only written to ``EMBEDDING_DATA``.
        """
        chunks = await CreateEmbeddingsFeature.chunk_text(text)

        # Start from the historical numbering (entry count + 1), but skip ahead
        # if that key is already taken: after an entry has been removed from
        # the store, the count-based id can collide with a surviving document
        # and would otherwise overwrite it.
        doc_id = len(EMBEDDING_DATA) + 1
        document_key = f"document_{doc_id}"
        while document_key in EMBEDDING_DATA:
            doc_id += 1
            document_key = f"document_{doc_id}"

        # Build every entry before touching the store so that a failure while
        # encoding does not leave a partially populated document behind.
        entries = [
            {
                "embedding": model.encode(chunk).tolist(),
                "metadata": {
                    "chunk_index": index,
                    "original_text": chunk,
                    "document_id": document_key,
                },
            }
            for index, chunk in enumerate(chunks)
        ]

        EMBEDDING_DATA[document_key] = {
            "metadata": {"id": doc_id, "filename": filename, "chunks": len(chunks)},
            "data": entries,
        }