tranny / App /Embedding /utils /Initialize.py
Mbonea's picture
did it work?
934d38c
raw
history blame
2.5 kB
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
from langchain_pinecone import PineconeVectorStore
from pinecone import ServerlessSpec
import pinecone
import os
def initDocument():
    """Build the Pinecone-backed vector store used for transcript chunks.

    Reads the Pinecone credentials from the environment, instantiates the
    HuggingFace embedding model and opens the "transcript-bits" index. When
    opening the index raises ``NotFoundException`` the index is created
    first and then opened again.

    Returns:
        A ``PineconeVectorStore`` wired to the index and the embedding model.
    """
    # The API key is issued at app.pinecone.io; an unset variable yields "".
    api_key = os.environ.get("PINECONE_API_KEY", "")
    # The environment name is listed next to the API key in the console.
    environment = os.environ.get("PINECONE_ENVIRONMENT")

    target_index = "transcript-bits"
    embedder = HuggingFaceEmbeddings(model_name="thenlper/gte-base")
    client = pinecone.Pinecone(api_key=api_key, environment=environment)

    try:
        handle = client.Index(target_index)
    except pinecone.core.client.exceptions.NotFoundException:
        # NOTE(review): 768 presumably matches the embedding model's output
        # size -- confirm before switching models.
        client.create_index(
            name=target_index,
            dimension=768,
            metric="cosine",
            spec=ServerlessSpec(cloud="aws", region="us-east-1"),
        )
        handle = client.Index(target_index)

    return PineconeVectorStore(index=handle, embedding=embedder)
async def delete_documents(task_id):
    """Remove the stored vectors whose ``task_id`` metadata equals *task_id*.

    Args:
        task_id: Value matched against the "task_id" metadata field.
    """
    store = initDocument()
    # NOTE(review): the index is created with a ServerlessSpec; confirm that
    # deleting by metadata filter is supported for that index type.
    task_filter = {"task_id": {"$eq": task_id}}
    store.delete(filter=task_filter)
def generateChunks(chunks, task_id, n=100):
    """Merge transcript segments into Documents of at most *n* segments each.

    Args:
        chunks: Sliceable sequence of mappings, each providing the keys
            "start", "end" and "text".
        task_id: Identifier stored in the metadata of every Document.
        n: Maximum number of consecutive segments merged into one Document.

    Returns:
        A list of ``Document`` objects, one per group of up to *n* segments.
        ``page_content`` is the concatenation of each segment's text preceded
        by a single space; the metadata carries the "start" of the group's
        first segment, the "end" of its last segment and *task_id*. An empty
        *chunks* yields an empty list.
    """
    result = []
    for offset in range(0, len(chunks), n):
        group = chunks[offset : offset + n]
        # Boundaries are taken by position. Looking segments up by value
        # (list.index) is quadratic and picks the wrong position when two
        # segments compare equal, which could leave "end" unset.
        result.append(
            Document(
                # Each text keeps its leading space so the stored content is
                # unchanged from what was produced before.
                page_content="".join(" " + item["text"] for item in group),
                metadata={
                    "start": group[0]["start"],
                    "end": group[-1]["end"],
                    "task_id": task_id,
                },
            )
        )
    return result
def search(query: str, task_id: str):
    """Return the three stored chunks most similar to *query* for one task.

    Args:
        query: Text passed to the vector store's similarity search.
        task_id: Only chunks whose "task_id" metadata equals this value
            are considered.

    Returns:
        A list of dicts with the keys "text", "start" and "end", one per
        matching document.
    """
    store = initDocument()
    matches = store.similarity_search(
        query,
        k=3,
        filter={"task_id": {"$eq": task_id}},
    )

    hits = []
    for match in matches:
        hits.append(
            {
                "text": match.page_content,
                "start": match.metadata["start"],
                "end": match.metadata["end"],
            }
        )
    return hits
def encode(temp: list[Document]):
    """Embed the given Documents and add them to the vector store.

    Args:
        temp: Documents to store.
    """
    store = initDocument()
    store.add_documents(temp)
    # return embeddings.embed_documents(texts = [d.page_content for d in temp])