# Pinecone vector-store helpers for transcript chunks (delete / chunk / search / encode).
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
from langchain.vectorstores import Pinecone
from pinecone import PodSpec
import pinecone
import os
async def delete_documents(task_id):
    """Delete every vector whose metadata ``task_id`` equals *task_id*.

    Args:
        task_id: Value matched (``$eq``) against the ``task_id`` metadata
            field of documents previously indexed by :func:`encode`.
    """
    # get api key from app.pinecone.io
    PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
    # find your environment next to the api key in pinecone console
    PINECONE_ENV = os.environ.get("PINECONE_ENVIRONMENT")
    index_name = "transcript-bits"
    model_name = "thenlper/gte-base"
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    # NOTE(review): the original built a PodSpec and passed it to the client
    # constructor; ``spec`` belongs to ``create_index``, not ``Pinecone(...)``,
    # and the ``Index`` handle it created was never used — both dropped.
    pc = pinecone.Pinecone(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)
    docsearch = Pinecone.from_existing_index(index_name, embeddings)
    docsearch.delete(
        filter={
            "task_id": {"$eq": task_id},
        }
    )
def generateChunks(chunks, task_id, n=100):
    """Batch transcript items into ``Document`` objects of at most *n* items.

    Args:
        chunks: Sequence of dicts, each with ``"text"``, ``"start"`` and
            ``"end"`` keys (transcript segments in order).
        task_id: Stored in each Document's metadata for later filtering.
        n: Maximum number of transcript items per Document (default 100).

    Returns:
        List of Documents; each carries the concatenated text of its group
        and the ``start`` of the first / ``end`` of the last item.
    """
    result = []
    for i in range(0, len(chunks), n):
        group = chunks[i : i + n]
        # BUGFIX: the original used ``chunk.index(item)`` to detect the first
        # and last element — O(n) per item and wrong when equal items repeat
        # (index returns the first occurrence). Address by position instead.
        # The leading space per item is preserved from the original.
        text = "".join(" " + item["text"] for item in group)
        result.append(
            Document(
                page_content=text,
                metadata={
                    "start": group[0]["start"],
                    "end": group[-1]["end"],
                    "task_id": task_id,
                },
            )
        )
    return result
def search(query: str, task_id: str):
    """Return the top-3 transcript chunks matching *query* for one task.

    Args:
        query: Free-text query embedded with the same model used at indexing.
        task_id: Restricts results to documents whose metadata ``task_id``
            equals this value.

    Returns:
        List of up to 3 dicts with ``text``, ``start`` and ``end`` keys,
        taken from each hit's page content and metadata.
    """
    # get api key from app.pinecone.io
    PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
    # find your environment next to the api key in pinecone console
    PINECONE_ENV = os.environ.get("PINECONE_ENVIRONMENT")
    index_name = "transcript-bits"
    model_name = "thenlper/gte-base"
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    # NOTE(review): the unused ``Index`` handle the original created here was
    # removed; the langchain wrapper resolves the index by name itself.
    pc = pinecone.Pinecone(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)
    docsearch = Pinecone.from_existing_index(index_name, embeddings)
    filtering_conditions = {
        "task_id": {"$eq": task_id},
    }
    data = docsearch.similarity_search(query, k=3, filter=filtering_conditions)
    return [
        {"text": d.page_content, "start": d.metadata["start"], "end": d.metadata["end"]}
        for d in data
    ]
def encode(temp: list[Document]):
    """Embed and upsert the given Documents into the transcript index.

    Args:
        temp: Documents (e.g. from :func:`generateChunks`) to embed with the
            gte-base model and add to the ``transcript-bits`` Pinecone index.
    """
    PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
    # find your environment next to the api key in pinecone console
    PINECONE_ENV = os.environ.get("PINECONE_ENVIRONMENT")
    index_name = "transcript-bits"
    model_name = "thenlper/gte-base"
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    # NOTE(review): unused ``Index`` handle and a commented-out embedding call
    # were removed; ``add_documents`` performs the embedding itself.
    pc = pinecone.Pinecone(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)
    docsearch = Pinecone.from_existing_index(index_name, embeddings)
    docsearch.add_documents(temp)