File size: 3,071 Bytes
d7d0d8e
 
1fba972
 
 
7fc5d88
 
 
d7d0d8e
0bbe0b5
 
 
 
 
7fc5d88
1fba972
0bbe0b5
 
 
 
 
 
 
 
7fc5d88
1fba972
 
2a4cafb
1fba972
 
7fc5d88
d7d0d8e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0bbe0b5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1fba972
2a4cafb
1fba972
d396a7d
1fba972
 
 
 
d7d0d8e
 
 
 
53cc68a
 
 
 
 
 
 
 
 
 
 
 
 
5781d73
1fba972
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
from langchain.vectorstores import Pinecone
import pinecone
import os




async def delete_documents(task_id):
    """Delete every vector tagged with *task_id* from the Pinecone index.

    Args:
        task_id: Value stored under the ``task_id`` metadata key when the
            vectors were added (see ``generateChunks``); all matching
            vectors are removed.

    NOTE(review): declared ``async`` but contains no ``await`` — every call
    below is blocking network I/O. Consider running it in an executor or
    dropping ``async`` if no caller awaits it.
    """
    # get api key from app.pinecone.io
    PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
    # find your environment next to the api key in pinecone console
    PINECONE_ENV = os.environ.get("PINECONE_ENVIRONMENT")

    index_name = "transcript-bits"
    model_name = "thenlper/gte-base"
    # Embeddings are required by the vectorstore wrapper even for deletes.
    embeddings = HuggingFaceEmbeddings(model_name=model_name)

    pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)
    docsearch = Pinecone.from_existing_index(index_name, embeddings)

    # Metadata filter: only vectors belonging to this task are deleted.
    docsearch.delete(
        filter={
            "task_id": {"$eq": task_id},
        }
    )



def generateChunks(chunks, task_id, n=100):
    """Batch transcript segments into langchain Documents of *n* segments.

    Args:
        chunks: Sequence of segment dicts, each with ``"text"``, ``"start"``
            and ``"end"`` keys (presumably timestamps — TODO confirm units).
        task_id: Identifier stored in each Document's metadata so the batch
            can later be filtered/deleted per task.
        n: Number of segments combined into one Document (default 100).

    Returns:
        list[Document]: one Document per batch; ``page_content`` is the
        segment texts joined with a leading space before each (preserves the
        original concatenation format), metadata carries the start of the
        first segment and the end of the last.
    """
    result = []
    for i in range(0, len(chunks), n):
        batch = chunks[i : i + n]
        # Join with a leading space per segment, matching the original
        # ``+= " " + text`` accumulation exactly.
        text = "".join(" " + item["text"] for item in batch)
        # First/last elements give the time span directly; the previous
        # ``chunk.index(item)`` scan was O(n^2) and mis-identified the
        # first/last element whenever duplicate segment dicts appeared.
        result.append(
            Document(
                page_content=text,
                metadata={
                    "start": batch[0]["start"],
                    "end": batch[-1]["end"],
                    "task_id": task_id,
                },
            )
        )
    return result


def search(query: str, task_id: str):
    """Similarity-search the Pinecone index, restricted to one task.

    Args:
        query: Free-text query embedded with the same model used at indexing
            time ("thenlper/gte-base").
        task_id: Only vectors whose ``task_id`` metadata equals this value
            are considered.

    Returns:
        list[dict]: up to 10 hits as ``{"text", "start", "end"}`` dicts,
        drawn from each matched Document's content and metadata.
    """
    # get api key from app.pinecone.io
    PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
    # find your environment next to the api key in pinecone console
    PINECONE_ENV = os.environ.get("PINECONE_ENVIRONMENT")

    index_name = "transcript-bits"
    model_name = "thenlper/gte-base"
    embeddings = HuggingFaceEmbeddings(model_name=model_name)

    pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)
    docsearch = Pinecone.from_existing_index(index_name, embeddings)

    # Scope the search to this task's vectors only.
    filtering_conditions = {
        "task_id": {"$eq": task_id},
    }
    data = docsearch.similarity_search(query, k=10, filter=filtering_conditions)
    return [
        {"text": d.page_content, "start": d.metadata["start"], "end": d.metadata["end"]}
        for d in data
    ]



def encode(temp: list[Document]):
    """Embed *temp* documents and upsert them into the Pinecone index.

    Args:
        temp: Documents (typically produced by ``generateChunks``) to embed
            with "thenlper/gte-base" and add to the "transcript-bits" index.

    Returns:
        None. The documents are persisted as a side effect.
    """
    # get api key from app.pinecone.io
    PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
    # find your environment next to the api key in pinecone console
    PINECONE_ENV = os.environ.get("PINECONE_ENVIRONMENT")

    index_name = "transcript-bits"
    model_name = "thenlper/gte-base"
    embeddings = HuggingFaceEmbeddings(model_name=model_name)

    pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)
    docsearch = Pinecone.from_existing_index(index_name, embeddings)
    docsearch.add_documents(temp)