from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.documents import Document
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec
import os

def initDocument():
    """Connect to Pinecone and return a LangChain vector store over the index."""
    # Get your API key from app.pinecone.io. The serverless client only needs
    # the key; the legacy "environment" setting no longer applies.
    PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "")

    index_name = "transcript-bits"
    model_name = "thenlper/gte-base"
    embeddings = HuggingFaceEmbeddings(model_name=model_name)

    pc = Pinecone(api_key=PINECONE_API_KEY)

    # pc.Index() does not verify that the index exists, so check explicitly and
    # create it on first use; gte-base produces 768-dimensional embeddings.
    if index_name not in pc.list_indexes().names():
        pc.create_index(
            name=index_name,
            dimension=768,
            metric="cosine",
            spec=ServerlessSpec(cloud="aws", region="us-east-1"),
        )
    index = pc.Index(index_name)

    return PineconeVectorStore(index=index, embedding=embeddings)

async def delete_documents(task_id):
    """Remove every vector whose metadata task_id matches.

    Note: Pinecone serverless indexes do not support delete-by-metadata-filter;
    on serverless, list the matching IDs first and delete by ID instead.
    """
    vector_store = initDocument()
    vector_store.delete(
        filter={
            "task_id": {"$eq": task_id},
        }
    )

def generateChunks(chunks, task_id, n=100):
    """Batch transcript segments into groups of n and wrap each group in a Document.

    Each segment is expected to look like {"text": ..., "start": ..., "end": ...}.
    """
    combined = [chunks[i : i + n] for i in range(0, len(chunks), n)]
    result = []
    for chunk in combined:
        # chunk.index(item) misfires when identical segments repeat, so take the
        # time boundaries straight from the first and last items in the batch.
        result.append(
            Document(
                page_content=" ".join(item["text"] for item in chunk),
                metadata={
                    "start": chunk[0]["start"],
                    "end": chunk[-1]["end"],
                    "task_id": task_id,
                },
            )
        )
    return result

def search(query: str, task_id: str):
    """Return the top 3 chunks matching `query`, restricted to one task's vectors."""
    vector_store = initDocument()
    filtering_conditions = {
        "task_id": {"$eq": task_id},
    }
    data = vector_store.similarity_search(query, k=3, filter=filtering_conditions)
    return [
        {"text": d.page_content, "start": d.metadata["start"], "end": d.metadata["end"]}
        for d in data
    ]

def encode(temp: list[Document]):
    """Embed the documents and upsert them into the Pinecone index."""
    vector_store = initDocument()
    vector_store.add_documents(temp)

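# A minimal end-to-end sketch, assuming Whisper-style transcript segments of the
# form {"text": ..., "start": ..., "end": ...}. The segments and task_id below
# are illustrative placeholders, not data from the original project; running it
# requires a valid PINECONE_API_KEY in the environment.
if __name__ == "__main__":
    segments = [
        {"text": "Hello and welcome.", "start": 0.0, "end": 2.5},
        {"text": "Today we look at vector search.", "start": 2.5, "end": 6.0},
    ]
    docs = generateChunks(segments, task_id="demo-task")
    encode(docs)  # embed and upsert into the "transcript-bits" index
    print(search("vector search", task_id="demo-task"))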