import os

import pinecone
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone

# Get your API key from app.pinecone.io
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
# Find your environment next to the API key in the Pinecone console
PINECONE_ENV = os.environ.get("PINECONE_ENVIRONMENT")


index_name = "transcript-bits"
model_name = "thenlper/gte-base"
embeddings = HuggingFaceEmbeddings(model_name=model_name)

# Connect to the existing Pinecone index and wrap it in a LangChain vector store.
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)
vector_index = pinecone.Index(index_name=index_name)
docsearch = Pinecone.from_existing_index(index_name, embeddings)


async def delete_documents(task_id):
    # Delete every vector whose metadata task_id matches the given id.
    # (The original compared against the literal string "task_id".)
    docsearch.delete(
        filter={
            "task_id": {"$eq": task_id},
        }
    )



def generateChunks(chunks, task_id, n=100):
    """Group transcript segments into Documents of up to n segments each."""
    combined = [chunks[i : i + n] for i in range(0, len(chunks), n)]
    result = []
    for chunk in combined:
        data = {"text": ""}
        # Use enumerate rather than chunk.index(item), which returns the first
        # occurrence and breaks when two segments have identical content.
        for i, item in enumerate(chunk):
            if i == 0:
                data["start"] = item["start"]
            if i == len(chunk) - 1:
                data["end"] = item["end"]
            data["text"] += " " + item["text"]

        result.append(
            Document(
                page_content=data["text"],
                metadata={"start": data["start"], "end": data["end"], "task_id": task_id},
            )
        )
    return result


def search(query: str, task_id: str):
    # Restrict the similarity search to chunks belonging to this task
    # (the original filtered on the literal string "task_id").
    filtering_conditions = {
        "task_id": {"$eq": task_id},
    }
    data = docsearch.similarity_search(query, k=10, filter=filtering_conditions)
    return [
        {"text": d.page_content, "start": d.metadata["start"], "end": d.metadata["end"]}
        for d in data
    ]



def encode(temp: list[Document]):
    # Embed and upsert the documents into the index. Pass the list directly;
    # wrapping it as [temp] would hand add_documents a list of lists.
    docsearch.add_documents(temp)
    # return embeddings.embed_documents(texts=[d.page_content for d in temp])
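

# --- Example usage ---
# A minimal sketch of the chunk -> embed -> search -> delete flow. The sample
# segments and task id below are hypothetical; running this assumes the
# "transcript-bits" index already exists and the Pinecone env vars are set.
if __name__ == "__main__":
    import asyncio

    segments = [
        {"start": 0.0, "end": 4.2, "text": "Welcome to the show."},
        {"start": 4.2, "end": 9.7, "text": "Today we cover vector search."},
    ]
    task_id = "demo-task"

    # Chunk the transcript segments and upsert them into the index.
    encode(generateChunks(segments, task_id, n=100))

    # Query only the chunks belonging to this task.
    for hit in search("vector search", task_id):
        print(hit["start"], hit["end"], hit["text"])

    # delete_documents is async, so drive it with an event loop.
    asyncio.run(delete_documents(task_id))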