File size: 2,502 Bytes
d7d0d8e
 
934d38c
25cf1b8
1fba972
 
7fc5d88
 
 
eb82136
 
 
0bbe0b5
 
7fc5d88
1fba972
0bbe0b5
4664e01
d0350c6
0bbe0b5
934d38c
 
eb82136
934d38c
 
 
 
 
 
 
25cf1b8
934d38c
eb82136
934d38c
 
0bbe0b5
eb82136
 
 
 
934d38c
7fc5d88
934d38c
1fba972
2a4cafb
1fba972
 
7fc5d88
d7d0d8e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
934d38c
0bbe0b5
1fba972
2a4cafb
1fba972
934d38c
1fba972
 
 
 
d7d0d8e
 
 
 
934d38c
 
eb82136
53cc68a
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
from langchain_pinecone import PineconeVectorStore
from pinecone import ServerlessSpec
import pinecone
import os



def initDocument():
    """Build a PineconeVectorStore over the "transcript-bits" index.

    Reads PINECONE_API_KEY / PINECONE_ENVIRONMENT from the environment,
    creates the index (768-dim, cosine, AWS serverless) if it does not
    already exist, and wraps it with HuggingFace gte-base embeddings.

    Returns:
        PineconeVectorStore: store ready for add/search/delete calls.
    """
    # get api key from app.pinecone.io
    PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "")
    # find your environment next to the api key in pinecone console
    PINECONE_ENV = os.environ.get("PINECONE_ENVIRONMENT")

    index_name = "transcript-bits"
    model_name = "thenlper/gte-base"
    embeddings = HuggingFaceEmbeddings(model_name=model_name)

    pc = pinecone.Pinecone(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)

    # Bug fix: in the v3 SDK (pinecone.Pinecone / ServerlessSpec, which this
    # file already uses) pc.Index(name) is lazy and does not raise at
    # construction time, and pinecone.core.client.exceptions no longer
    # exists — the old try/except could never create a missing index and
    # would itself crash with AttributeError. Check existence explicitly.
    if index_name not in pc.list_indexes().names():
        pc.create_index(
            name=index_name,
            dimension=768,  # embedding size of thenlper/gte-base
            metric="cosine",
            spec=ServerlessSpec(cloud="aws", region="us-east-1"),
        )
    index = pc.Index(index_name)

    vector_store = PineconeVectorStore(index=index, embedding=embeddings)
    return vector_store




async def delete_documents(task_id):
    """Delete every vector whose metadata ``task_id`` equals *task_id*."""
    store = initDocument()
    store.delete(filter={"task_id": {"$eq": task_id}})



def generateChunks(chunks, task_id, n=100):
    """Batch transcript segments into Documents of up to *n* segments each.

    Args:
        chunks: sequence of dicts with "text", "start" and "end" keys
            (assumed ordered by time — TODO confirm against caller).
        task_id: identifier stored in each Document's metadata for
            later filtered search/delete.
        n: maximum number of segments combined into one Document.

    Returns:
        list[Document]: one Document per batch; page_content is the
        concatenated texts (each prefixed with a space, matching the
        original output), metadata carries the batch's start/end times
        and the task_id.
    """
    result = []
    for i in range(0, len(chunks), n):
        group = chunks[i : i + n]
        # Bug fix: the original located the first/last items via
        # chunk.index(item), which is O(n^2) and breaks on duplicate
        # segments — if the last item equals an earlier one, "end" was
        # never set and Document creation raised KeyError. The batch is
        # ordered, so take the endpoints directly.
        text = "".join(" " + item["text"] for item in group)
        result.append(
            Document(
                page_content=text,
                metadata={
                    "start": group[0]["start"],
                    "end": group[-1]["end"],
                    "task_id": task_id,
                },
            )
        )
    return result


def search(query: str, task_id: str):
    """Return the top-3 transcript chunks similar to *query* for *task_id*.

    Each result is a dict with "text", "start" and "end" keys taken from
    the matched Document's content and metadata.
    """
    store = initDocument()
    task_filter = {"task_id": {"$eq": task_id}}
    matches = store.similarity_search(query, k=3, filter=task_filter)

    results = []
    for doc in matches:
        results.append(
            {
                "text": doc.page_content,
                "start": doc.metadata["start"],
                "end": doc.metadata["end"],
            }
        )
    return results



def encode(temp: list[Document]):
    """Embed the given documents and upsert them into the vector store."""
    initDocument().add_documents(temp)