"""Helpers for storing and searching transcript chunks in a Pinecone index."""

from functools import lru_cache
import os

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
from langchain.vectorstores import Pinecone
import pinecone

_INDEX_NAME = "transcript-bits"
_MODEL_NAME = "thenlper/gte-base"


@lru_cache(maxsize=1)
def _get_embeddings():
    """Return the shared embedding object, constructing it on first use."""
    # Cached so repeated calls reuse one instance instead of rebuilding the
    # embedding model for every delete/search/encode request.
    return HuggingFaceEmbeddings(model_name=_MODEL_NAME)


def _get_docsearch():
    """Initialise the Pinecone client and return a vector store for the index."""
    embeddings = _get_embeddings()
    # The API key comes from app.pinecone.io; the environment name is shown
    # next to the key in the Pinecone console.
    pinecone.init(
        api_key=os.environ.get("PINECONE_API_KEY"),
        environment=os.environ.get("PINECONE_ENVIRONMENT"),
    )
    return Pinecone.from_existing_index(_INDEX_NAME, embeddings)


async def delete_documents(task_id) -> None:
    """Delete every stored document whose ``task_id`` metadata equals *task_id*.

    NOTE: the body contains no ``await``; the vector-store call runs
    synchronously inside the coroutine.

    Args:
        task_id: Value matched against the ``task_id`` metadata field.
    """
    _get_docsearch().delete(
        filter={
            "task_id": {"$eq": task_id},
        }
    )


def generateChunks(chunks, task_id, n=100) -> list[Document]:
    """Merge consecutive transcript segments into documents of up to *n* segments.

    Args:
        chunks: Sliceable sequence of mappings, each with ``"start"``,
            ``"end"`` and ``"text"`` keys.
        task_id: Stored in every document's metadata under ``"task_id"``.
        n: Maximum number of segments merged into one document.

    Returns:
        One ``Document`` per group. Its text is the group's segment texts,
        each preceded by a single space; its metadata carries the first
        segment's ``"start"``, the last segment's ``"end"`` and *task_id*.
        An empty *chunks* yields an empty list.

    Raises:
        ValueError: If *n* is 0 (``range`` rejects a zero step).
        KeyError: If a segment lacks one of the required keys.
    """
    result = []
    for offset in range(0, len(chunks), n):
        group = chunks[offset : offset + n]
        # Every segment is prefixed with a space (including the first) so the
        # generated page_content stays identical to what was produced before.
        text = "".join(" " + item["text"] for item in group)
        result.append(
            Document(
                page_content=text,
                # Take the boundaries positionally: looking segments up by
                # equality mis-handles duplicates and is quadratic.
                metadata={
                    "start": group[0]["start"],
                    "end": group[-1]["end"],
                    "task_id": task_id,
                },
            )
        )
    return result


def search(query: str, task_id: str) -> list[dict]:
    """Run a similarity search for *query* restricted to one task.

    Args:
        query: Text passed to the vector store's ``similarity_search``.
        task_id: Only documents whose ``task_id`` metadata equals this value
            are considered.

    Returns:
        Up to 10 dicts with ``"text"``, ``"start"`` and ``"end"`` keys, taken
        from each hit's page content and metadata.
    """
    filtering_conditions = {
        "task_id": {"$eq": task_id},
    }
    hits = _get_docsearch().similarity_search(
        query, k=10, filter=filtering_conditions
    )
    return [
        {
            "text": hit.page_content,
            "start": hit.metadata["start"],
            "end": hit.metadata["end"],
        }
        for hit in hits
    ]


def encode(temp: list[Document]) -> None:
    """Add the given documents to the vector store.

    Args:
        temp: Documents to add, e.g. the output of ``generateChunks``.
    """
    _get_docsearch().add_documents(temp)