import os

# HuggingFaceEmbeddings and Document moved out of the legacy `langchain`
# namespace; the community/core imports below avoid deprecation warnings.
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.documents import Document
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec
|
|
def initDocument():
    # Serverless indexes are addressed by API key alone, so the legacy
    # PINECONE_ENVIRONMENT setting is not needed here.
    PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "")

    index_name = "transcript-bits"
    model_name = "thenlper/gte-base"

    # gte-base emits 768-dimensional vectors; the index dimension below must match.
    embeddings = HuggingFaceEmbeddings(model_name=model_name)

    pc = Pinecone(api_key=PINECONE_API_KEY)

    # Create the index on first use instead of catching a not-found error,
    # which would depend on the client's internal exception hierarchy.
    if index_name not in pc.list_indexes().names():
        pc.create_index(
            name=index_name,
            dimension=768,
            metric="cosine",
            spec=ServerlessSpec(cloud="aws", region="us-east-1"),
        )
    index = pc.Index(index_name)

    vector_store = PineconeVectorStore(index=index, embedding=embeddings)
    return vector_store
|
|
async def delete_documents(task_id):
    vector_store = initDocument()

    # Drop every vector tagged with this task_id. Note that deleting by
    # metadata filter is a pod-based index feature; serverless indexes may
    # reject it, in which case an ID-based delete is the fallback.
    vector_store.delete(
        filter={
            "task_id": {"$eq": task_id},
        }
    )
|
|
def generateChunks(chunks, task_id, n=100):
    """Merge transcript segments into batches of n and wrap them as Documents."""
    combined = [chunks[i : i + n] for i in range(0, len(chunks), n)]
    result = []
    for chunk in combined:
        # Read the window boundaries positionally; chunk.index(item) is
        # quadratic and returns the wrong position for duplicate segments.
        data = {
            "text": " ".join(item["text"] for item in chunk),
            "start": chunk[0]["start"],
            "end": chunk[-1]["end"],
        }
        temp = Document(
            page_content=data["text"],
            metadata={"start": data["start"], "end": data["end"], "task_id": task_id},
        )
        result.append(temp)
    return result
|
|
def search(query: str, task_id: str):
    vector_store = initDocument()

    # Only consider vectors that belong to this transcript/task.
    filtering_conditions = {
        "task_id": {"$eq": task_id},
    }
    data = vector_store.similarity_search(query, k=3, filter=filtering_conditions)
    return [
        {"text": d.page_content, "start": d.metadata["start"], "end": d.metadata["end"]}
        for d in data
    ]
|
|
def encode(temp: list[Document]):
    # Embed the documents and upsert them into the Pinecone index.
    vector_store = initDocument()
    vector_store.add_documents(temp)
|
|
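# A minimal usage sketch (not part of the original module): it assumes
# transcript segments shaped like {"text", "start", "end"}, as consumed by
# generateChunks above, and a PINECONE_API_KEY already set in the environment.
if __name__ == "__main__":
    segments = [
        {"text": "Welcome to the show.", "start": 0.0, "end": 2.5},
        {"text": "Today we talk about vector search.", "start": 2.5, "end": 6.0},
    ]
    docs = generateChunks(segments, task_id="demo-task", n=100)
    encode(docs)  # embed and upsert into the "transcript-bits" index
    for hit in search("vector search", task_id="demo-task"):
        print(hit["start"], hit["end"], hit["text"])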