tranny / App /Embedding /utils /Initialize.py
Mbonea's picture
Wider window
d396a7d
raw
history blame
3.07 kB
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
from langchain.vectorstores import Pinecone
import pinecone
import os
async def delete_documents(task_id):
    """Delete the vectors tagged with ``task_id`` from the Pinecone index.

    Builds the embedding model and the Pinecone client on every call,
    attaches to the "transcript-bits" index and issues a delete whose
    metadata filter requires ``task_id`` to equal the given value.

    Note: the function is declared ``async`` but contains no ``await``.

    Args:
        task_id: Value compared (``$eq``) against the ``task_id`` metadata
            field of the stored vectors.
    """
    # Credentials are read from the environment. The API key is issued at
    # app.pinecone.io; the environment name is shown next to it in the
    # Pinecone console.
    api_key = os.environ.get("PINECONE_API_KEY")
    environment = os.environ.get("PINECONE_ENVIRONMENT")

    target_index = "transcript-bits"
    embedder = HuggingFaceEmbeddings(model_name="thenlper/gte-base")

    pinecone.init(api_key=api_key, environment=environment)
    # NOTE(review): this handle is not used below; the call is kept so the
    # sequence of client calls stays exactly as before.
    _index_handle = pinecone.Index(index_name=target_index)

    store = Pinecone.from_existing_index(target_index, embedder)
    task_filter = {"task_id": {"$eq": task_id}}
    store.delete(filter=task_filter)
def generateChunks(chunks, task_id, n=100):
    """Merge transcript segments into windows of ``n`` and wrap each as a Document.

    Consecutive segments are grouped ``n`` at a time (the final window may be
    shorter). Each window becomes one ``Document`` whose text is the
    concatenation of the segment texts and whose metadata records the
    ``start`` of the first segment, the ``end`` of the last segment and the
    given ``task_id``.

    Args:
        chunks: Sequence of mappings, each providing ``"start"``, ``"end"``
            and ``"text"`` keys.
        task_id: Identifier stored in every document's metadata.
        n: Number of segments per window.

    Returns:
        A list of ``Document`` objects, one per window; empty when ``chunks``
        is empty.

    Raises:
        KeyError: If a segment lacks a required key.
        ValueError: If ``n`` is zero (raised by ``range``).
    """
    result = []
    for offset in range(0, len(chunks), n):
        window = chunks[offset : offset + n]

        # Use positions, not ``window.index(item)``: ``index`` returns the
        # first *equal* element, so a window whose last segment duplicated an
        # earlier one never got its "end" recorded (and the lookup made each
        # window quadratic).
        start = window[0]["start"]
        end = window[-1]["end"]

        # Every segment text is prefixed with a single space, so the page
        # content keeps the leading space callers have always received.
        text = "".join(" " + item["text"] for item in window)

        result.append(
            Document(
                page_content=text,
                metadata={"start": start, "end": end, "task_id": task_id},
            )
        )
    return result
def search(query: str, task_id: str):
    """Run a similarity search restricted to the vectors of one task.

    Builds the embedding model and the Pinecone client on every call,
    attaches to the "transcript-bits" index and requests the 10 nearest
    matches for ``query`` among vectors whose ``task_id`` metadata equals
    the given value.

    Args:
        query: Text to search for.
        task_id: Value compared (``$eq``) against the ``task_id`` metadata
            field.

    Returns:
        A list of dicts with keys ``"text"``, ``"start"`` and ``"end"``, one
        per returned document, in the order the vector store returned them.
    """
    # Credentials are read from the environment. The API key is issued at
    # app.pinecone.io; the environment name is shown next to it in the
    # Pinecone console.
    api_key = os.environ.get("PINECONE_API_KEY")
    environment = os.environ.get("PINECONE_ENVIRONMENT")

    target_index = "transcript-bits"
    embedder = HuggingFaceEmbeddings(model_name="thenlper/gte-base")

    pinecone.init(api_key=api_key, environment=environment)
    # NOTE(review): this handle is not used below; the call is kept so the
    # sequence of client calls stays exactly as before.
    _index_handle = pinecone.Index(index_name=target_index)

    store = Pinecone.from_existing_index(target_index, embedder)
    matches = store.similarity_search(
        query,
        k=10,
        filter={"task_id": {"$eq": task_id}},
    )

    hits = []
    for doc in matches:
        hits.append(
            {
                "text": doc.page_content,
                "start": doc.metadata["start"],
                "end": doc.metadata["end"],
            }
        )
    return hits
def encode(temp: list[Document]):
    """Embed the given documents and add them to the Pinecone index.

    Builds the embedding model and the Pinecone client on every call,
    attaches to the "transcript-bits" index and passes ``temp`` to the
    vector store's ``add_documents``. Nothing is returned.

    Args:
        temp: Documents to store.
    """
    # Credentials are read from the environment; the environment name is
    # shown next to the API key in the Pinecone console.
    api_key = os.environ.get("PINECONE_API_KEY")
    environment = os.environ.get("PINECONE_ENVIRONMENT")

    target_index = "transcript-bits"
    embedder = HuggingFaceEmbeddings(model_name="thenlper/gte-base")

    pinecone.init(api_key=api_key, environment=environment)
    # NOTE(review): this handle is not used below; the call is kept so the
    # sequence of client calls stays exactly as before.
    _index_handle = pinecone.Index(index_name=target_index)

    store = Pinecone.from_existing_index(target_index, embedder)
    store.add_documents(temp)
    # NOTE(review): an earlier variant apparently returned the raw vectors
    # instead of storing them:
    #   embedder.embed_documents(texts=[d.page_content for d in temp])