|
from langchain.embeddings import HuggingFaceEmbeddings |
|
from langchain.docstore.document import Document |
|
from langchain.vectorstores import Pinecone |
|
import pinecone |
|
import os |
|
|
|
|
|
|
|
|
|
async def delete_documents(task_id):
    """Delete every vector in the "transcript-bits" Pinecone index whose
    metadata ``task_id`` equals *task_id*.

    NOTE(review): the Pinecone/langchain calls below are synchronous and
    will block the event loop; the function is ``async`` only to keep the
    caller-facing interface unchanged. Consider running it in an executor.
    """
    PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
    PINECONE_ENV = os.environ.get("PINECONE_ENVIRONMENT")

    index_name = "transcript-bits"
    model_name = "thenlper/gte-base"
    # Embeddings are required by the langchain wrapper even though a
    # metadata-filtered delete never embeds anything.
    embeddings = HuggingFaceEmbeddings(model_name=model_name)

    pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)
    docsearch = Pinecone.from_existing_index(index_name, embeddings)

    # Remove all chunks previously stored for this task.
    docsearch.delete(
        filter={
            "task_id": {"$eq": task_id},
        }
    )
|
|
|
|
|
|
|
def generateChunks(chunks, task_id, n=100):
    """Group transcript segments into langchain ``Document`` objects.

    Splits *chunks* (a sequence of dicts with ``"text"``, ``"start"`` and
    ``"end"`` keys) into groups of up to *n* consecutive segments. Each
    group becomes one ``Document`` whose ``page_content`` is the segments'
    texts joined with a leading space, and whose metadata carries the
    group's first ``start``, last ``end`` and the given *task_id*.

    Fix: the original used ``chunk.index(item)`` to detect the first/last
    element, which is O(n) per item and wrong when a group contains
    duplicate segment dicts — ``index`` returns the first occurrence, so
    ``data["end"]`` was never assigned and building the Document raised
    ``KeyError``. Direct ``group[0]`` / ``group[-1]`` access is both
    correct and linear.
    """
    result = []
    for i in range(0, len(chunks), n):
        group = chunks[i : i + n]
        # Preserve the original join behavior: every segment is prefixed
        # with a single space, including the first one.
        text = "".join(" " + item["text"] for item in group)
        result.append(
            Document(
                page_content=text,
                metadata={
                    "start": group[0]["start"],
                    "end": group[-1]["end"],
                    "task_id": task_id,
                },
            )
        )
    return result
|
|
|
|
|
def search(query: str, task_id: str):
    """Similarity-search the "transcript-bits" Pinecone index.

    Embeds *query* with the gte-base model, restricts results to vectors
    whose metadata ``task_id`` matches, and returns up to 10 hits as a
    list of ``{"text", "start", "end"}`` dicts.
    """
    PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
    PINECONE_ENV = os.environ.get("PINECONE_ENVIRONMENT")

    index_name = "transcript-bits"
    model_name = "thenlper/gte-base"
    embeddings = HuggingFaceEmbeddings(model_name=model_name)

    pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)
    docsearch = Pinecone.from_existing_index(index_name, embeddings)

    # Only return chunks belonging to this task.
    filtering_conditions = {
        "task_id": {"$eq": task_id},
    }
    data = docsearch.similarity_search(query, k=10, filter=filtering_conditions)
    return [
        {"text": d.page_content, "start": d.metadata["start"], "end": d.metadata["end"]}
        for d in data
    ]
|
|
|
|
|
|
|
def encode(temp: list[Document]) -> None:
    """Embed and upsert *temp* documents into the "transcript-bits" index.

    Each ``Document``'s ``page_content`` is embedded with the gte-base
    model and stored in Pinecone along with its metadata.
    """
    PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
    PINECONE_ENV = os.environ.get("PINECONE_ENVIRONMENT")

    index_name = "transcript-bits"
    model_name = "thenlper/gte-base"
    embeddings = HuggingFaceEmbeddings(model_name=model_name)

    pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)
    docsearch = Pinecone.from_existing_index(index_name, embeddings)
    docsearch.add_documents(temp)
|
|
|
|