File size: 3,100 Bytes
d7d0d8e
 
1fba972
41d120a
1fba972
 
7fc5d88
 
 
d7d0d8e
0bbe0b5
 
 
 
 
7fc5d88
1fba972
0bbe0b5
 
 
41d120a
0bbe0b5
 
41d120a
 
0bbe0b5
7fc5d88
1fba972
 
2a4cafb
1fba972
 
7fc5d88
d7d0d8e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0bbe0b5
 
 
 
 
 
 
 
 
 
 
084147e
41d120a
0bbe0b5
 
1fba972
2a4cafb
1fba972
acfe093
1fba972
 
 
 
d7d0d8e
 
 
 
53cc68a
 
 
 
 
 
 
 
 
 
084147e
41d120a
53cc68a
5781d73
1fba972
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
from langchain.vectorstores import Pinecone
from pinecone import PodSpec
import pinecone
import os




async def delete_documents(task_id):
    """Delete every vector in the "transcript-bits" index tagged with *task_id*.

    Args:
        task_id: value matched against the ``task_id`` metadata field of the
            stored chunks; all matching vectors are removed.

    NOTE(review): declared ``async`` but contains no ``await`` — the Pinecone
    calls below are synchronous and will block the event loop. Kept ``async``
    so existing ``await delete_documents(...)`` callers keep working.
    """
    # get api key from app.pinecone.io
    PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
    # find your environment next to the api key in pinecone console
    PINECONE_ENV = os.environ.get("PINECONE_ENVIRONMENT")

    index_name = "transcript-bits"
    model_name = "thenlper/gte-base"
    embeddings = HuggingFaceEmbeddings(model_name=model_name)

    # Fix: dropped the unused ``vector_index`` local and the stray
    # ``spec=PodSpec()`` argument — PodSpec configures index *creation* and is
    # not a client-constructor setting. Client instantiation is kept for its
    # connection/initialization side effects.
    pinecone.Pinecone(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)
    docsearch = Pinecone.from_existing_index(index_name, embeddings)

    # Remove every chunk belonging to this task.
    docsearch.delete(
        filter={
            "task_id": {"$eq": task_id},
        }
    )



def generateChunks(chunks, task_id, n=100):
    """Group transcript segments into ``Document`` objects of *n* segments each.

    Args:
        chunks: sequence of dicts, each with "start", "end" and "text" keys.
        task_id: id stored in every Document's metadata so the chunks can be
            filtered per task later (see the ``$eq`` filters in this module).
        n: number of consecutive segments merged into one Document.

    Returns:
        list[Document]: one Document per group; metadata carries the group's
        start time (first segment), end time (last segment) and the task_id.
    """
    result = []
    for i in range(0, len(chunks), n):
        group = chunks[i : i + n]
        # Fix: the original detected the first/last segment via
        # ``chunk.index(item)``, which is O(n) per item and breaks when equal
        # segments repeat — a duplicated last segment resolves to its earlier
        # occurrence, so "end" was never assigned and Document construction
        # raised KeyError. Index the ends of the group directly instead.
        # The leading space reproduces the original concatenation exactly.
        text = "".join(" " + item["text"] for item in group)
        result.append(
            Document(
                page_content=text,
                metadata={
                    "start": group[0]["start"],
                    "end": group[-1]["end"],
                    "task_id": task_id,
                },
            )
        )
    return result


def search(query: str, task_id: str):
    """Return the top-3 transcript chunks matching *query* for one task.

    Args:
        query: free-text query embedded with the gte-base model.
        task_id: restricts the similarity search to vectors whose metadata
            ``task_id`` equals this value.

    Returns:
        list[dict]: up to 3 entries with "text", "start" and "end" taken from
        each match's page content and metadata.
    """
    # get api key from app.pinecone.io
    PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
    # find your environment next to the api key in pinecone console
    PINECONE_ENV = os.environ.get("PINECONE_ENVIRONMENT")

    index_name = "transcript-bits"
    model_name = "thenlper/gte-base"
    embeddings = HuggingFaceEmbeddings(model_name=model_name)

    # Fix: dropped the unused ``vector_index`` local; client instantiation is
    # kept for its connection/initialization side effects.
    pinecone.Pinecone(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)
    docsearch = Pinecone.from_existing_index(index_name, embeddings)

    # Only consider vectors belonging to this task.
    filtering_conditions = {"task_id": {"$eq": task_id}}
    data = docsearch.similarity_search(query, k=3, filter=filtering_conditions)
    return [
        {"text": d.page_content, "start": d.metadata["start"], "end": d.metadata["end"]}
        for d in data
    ]



def encode(temp: list[Document]):
    """Embed *temp* documents with gte-base and upsert them into Pinecone.

    Args:
        temp: Documents (as produced by ``generateChunks``) to add to the
            "transcript-bits" index.
    """
    # get api key from app.pinecone.io
    PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
    # find your environment next to the api key in pinecone console
    PINECONE_ENV = os.environ.get("PINECONE_ENVIRONMENT")

    index_name = "transcript-bits"
    model_name = "thenlper/gte-base"
    embeddings = HuggingFaceEmbeddings(model_name=model_name)

    # Fix: dropped the unused ``vector_index`` local and the stale
    # commented-out return; client instantiation is kept for its
    # connection/initialization side effects.
    pinecone.Pinecone(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)
    docsearch = Pinecone.from_existing_index(index_name, embeddings)
    docsearch.add_documents(temp)