File size: 1,250 Bytes
819bacd
 
 
fe7c659
714be4e
819bacd
 
 
 
 
714be4e
819bacd
 
 
 
 
 
bc0b69d
819bacd
 
 
 
 
 
 
 
 
bc0b69d
819bacd
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import json
from typing import List

from app.db_local_storage.vector_files_db import vector_files_db as EMBEDDING_DATA
from app.modules.model import model, qa_pipeline


class CreateEmbeddingsFeature:
    """Chunk a text and store one embedding per chunk in ``EMBEDDING_DATA``."""

    @staticmethod
    async def chunk_text(text: str, chunk_size: int = 512) -> List[str]:
        """Split *text* into consecutive slices of at most *chunk_size* characters.

        The slices do not overlap and, concatenated in order, reproduce
        *text*. An empty *text* yields an empty list.

        Args:
            text: The string to split.
            chunk_size: Maximum number of characters per chunk; must be > 0.

        Returns:
            The chunks in their original order.

        Raises:
            ValueError: If *chunk_size* is not positive. A negative step would
                otherwise make ``range`` empty and silently drop the text.
        """
        if chunk_size <= 0:
            raise ValueError(
                f"chunk_size must be a positive integer, got {chunk_size}"
            )
        return [
            text[start : start + chunk_size]
            for start in range(0, len(text), chunk_size)
        ]

    @staticmethod
    async def create_embeddings(text: str, filename: str) -> None:
        """Embed every chunk of *text* and register it as a new document.

        A new ``document_<n>`` entry is added to ``EMBEDDING_DATA`` holding
        the document metadata (id, filename, chunk count) and, under
        ``"data"``, one entry per chunk with its embedding, chunk index,
        original text and owning document key.

        Args:
            text: Full text of the document to embed.
            filename: Name recorded in the document metadata.

        Returns:
            None. The result is stored in ``EMBEDDING_DATA`` as a side effect.
        """
        chunks = await CreateEmbeddingsFeature.chunk_text(text)

        # Start from the entry count, then skip forward past keys that are
        # already taken: after a removal, ``len + 1`` alone could name an
        # existing document and overwrite it.
        # NOTE(review): assumes EMBEDDING_DATA is dict-like (supports ``in``).
        document_number = len(EMBEDDING_DATA) + 1
        document_key = f"document_{document_number}"
        while document_key in EMBEDDING_DATA:
            document_number += 1
            document_key = f"document_{document_number}"

        # Encode everything before touching the store so that a failure in
        # ``model.encode`` does not leave a partially filled document behind.
        # ``encode`` is expected to return an object exposing ``.tolist()``.
        entries = [
            {
                "embedding": model.encode(chunk).tolist(),
                "metadata": {
                    "chunk_index": chunk_index,
                    "original_text": chunk,
                    "document_id": document_key,
                },
            }
            for chunk_index, chunk in enumerate(chunks)
        ]

        EMBEDDING_DATA[document_key] = {
            "metadata": {
                "id": document_number,
                "filename": filename,
                "chunks": len(chunks),
            },
            "data": entries,
        }