# NOTE(review): removed non-Python residue (byte count, git-blame hashes,
# line-number gutter) that was accidentally pasted above the imports and
# would have been a syntax error.
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
from langchain.vectorstores import Pinecone
import pinecone
import os
async def delete_documents(task_id):
    """Delete every vector tagged with *task_id* from the Pinecone index.

    Connects to the ``transcript-bits`` index using credentials from the
    environment and removes all vectors whose ``task_id`` metadata matches.

    Args:
        task_id: Value previously stored in each vector's ``task_id``
            metadata field (see ``generateChunks``/``encode``).
    """
    # get api key from app.pinecone.io
    PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
    # find your environment next to the api key in pinecone console
    PINECONE_ENV = os.environ.get("PINECONE_ENVIRONMENT")
    index_name = "transcript-bits"
    model_name = "thenlper/gte-base"
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)
    docsearch = Pinecone.from_existing_index(index_name, embeddings)
    # Metadata filter: only vectors stamped with this task_id are removed.
    docsearch.delete(
        filter={
            "task_id": {"$eq": task_id},
        }
    )
def generateChunks(chunks, task_id, n=100):
    """Group transcript segments into ``Document`` chunks of up to *n* segments.

    Each input segment is a dict with ``"text"``, ``"start"`` and ``"end"``
    keys.  Segments are batched in order; each batch becomes one Document
    whose text is the segments' texts joined with spaces (the original
    implementation produced a leading space, preserved here), and whose
    metadata carries the batch's first ``start``, last ``end``, and *task_id*.

    Bug fixed: the original used ``chunk.index(item)`` to detect the first
    and last segments.  ``list.index`` matches by equality, so when a batch
    contained duplicate segment dicts the last segment could be mistaken
    for an earlier one, ``data["end"]`` was never set, and the Document
    constructor raised ``KeyError``.  It was also O(n^2).  Positional access
    (``batch[0]`` / ``batch[-1]``) is both correct and linear.

    Args:
        chunks: Ordered list of segment dicts.
        task_id: Identifier stored in every Document's metadata.
        n: Maximum number of segments per Document (default 100).

    Returns:
        List of ``Document`` objects; empty list when *chunks* is empty.
    """
    result = []
    for i in range(0, len(chunks), n):
        batch = chunks[i : i + n]
        # Join with a leading space per segment, matching the original
        # `data["text"] += " " + item["text"]` accumulation exactly.
        text = "".join(" " + seg["text"] for seg in batch)
        result.append(
            Document(
                page_content=text,
                metadata={
                    "start": batch[0]["start"],
                    "end": batch[-1]["end"],
                    "task_id": task_id,
                },
            )
        )
    return result
def search(query: str, task_id: str):
    """Similarity-search the Pinecone index, restricted to one task's vectors.

    Embeds *query* with the ``thenlper/gte-base`` model and returns the top
    10 matches whose ``task_id`` metadata equals *task_id*.

    Args:
        query: Free-text search query.
        task_id: Only vectors stamped with this task id are considered.

    Returns:
        List of dicts with ``"text"``, ``"start"`` and ``"end"`` keys,
        one per matching chunk.
    """
    # get api key from app.pinecone.io
    PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
    # find your environment next to the api key in pinecone console
    PINECONE_ENV = os.environ.get("PINECONE_ENVIRONMENT")
    index_name = "transcript-bits"
    model_name = "thenlper/gte-base"
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)
    docsearch = Pinecone.from_existing_index(index_name, embeddings)
    # Restrict the search to this task's vectors via a metadata filter.
    filtering_conditions = {
        "task_id": {"$eq": task_id},
    }
    data = docsearch.similarity_search(query, k=10, filter=filtering_conditions)
    return [
        {"text": d.page_content, "start": d.metadata["start"], "end": d.metadata["end"]}
        for d in data
    ]
def encode(temp: list[Document]) -> None:
    """Embed and upsert the given Documents into the Pinecone index.

    Embeds each Document's ``page_content`` with ``thenlper/gte-base`` and
    adds the vectors (with their metadata) to the ``transcript-bits`` index.

    Args:
        temp: Documents to index, typically produced by ``generateChunks``.
    """
    # get api key from app.pinecone.io
    PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
    # find your environment next to the api key in pinecone console
    PINECONE_ENV = os.environ.get("PINECONE_ENVIRONMENT")
    index_name = "transcript-bits"
    model_name = "thenlper/gte-base"
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)
    docsearch = Pinecone.from_existing_index(index_name, embeddings)
    docsearch.add_documents(temp)