import json
import os
import pprint

import aiohttp
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
from langchain.vectorstores import MongoDBAtlasVectorSearch
from motor.motor_asyncio import AsyncIOMotorClient
from pymongo import MongoClient

completion_base = os.environ.get("completion_base")
openai_api_key = os.environ.get("openai_api_key")
mongoDB = os.environ.get("MONGO_DB")
template = """### Given the following context
### Context
{context}
### Use it to explain the question: {question}
"""


async def fetch_data(question, context):
    """Ask the chat-completion endpoint to explain `question` using `context`."""
    url = completion_base
    payload = json.dumps(
        {
            "messages": [
                {
                    "role": "system",
                    "content": "### You provide explanations based on the provided context",
                },
                {
                    "role": "user",
                    "content": template.format(context=context, question=question),
                },
            ],
            "model": "gpt-3.5-turbo",
            "temperature": 1,
            "presence_penalty": 0,
            "top_p": 0.95,
            "frequency_penalty": 0,
            "stream": False,
        }
    )
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {openai_api_key}",
    }
    async with aiohttp.ClientSession() as session:
        async with session.post(url, headers=headers, data=payload) as response:
            body = await response.json()
            return body["choices"][0]["message"]["content"]


async def delete_documents(task_id):
    """Remove every stored chunk that belongs to the given task."""
    client = AsyncIOMotorClient(mongoDB)
    db = client["transcriptions"]
    collection = db["videos"]
    result = await collection.delete_many({"task_id": task_id})
    print(f"Deleted {result.deleted_count} document(s)")


# mongo_client = MongoClient(mongoDB)
# model_name = "BAAI/bge-base-en"
# collection = mongo_client["transcriptions"]["videos"]
# embeddings = HuggingFaceEmbeddings(model_name=model_name)
# index_name = "test_embeddings"
# vectorstore = MongoDBAtlasVectorSearch(collection, embeddings, index_name=index_name)


def generateChunks(chunks, task_id, n=100):
    """Group transcript segments into Documents of up to `n` segments each."""
    combined = [chunks[i : i + n] for i in range(0, len(chunks), n)]
    result = []
    for chunk in combined:
        # The chunk's start/end come from its first and last segments;
        # list.index() breaks on duplicate segments, so use positions directly.
        data = {"text": "", "start": chunk[0]["start"], "end": chunk[-1]["end"]}
        for item in chunk:
            data["text"] += " " + item["text"]
        temp = Document(
            page_content=data["text"],
            metadata={"start": data["start"], "end": data["end"], "task_id": task_id},
        )
        result.append(temp)
    return result


def search(query: str, task_id: str):
    """Vector-search the stored transcript chunks that belong to `task_id`."""
    mongo_client = MongoClient(mongoDB)
    model_name = "BAAI/bge-base-en"
    collection = mongo_client["transcriptions"]["videos"]
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    index_name = "test_embedding"
    k = 5
    vectorstore = MongoDBAtlasVectorSearch(
        collection,
        embedding=embeddings,
        index_name=index_name,
    )
    # similarity_search takes k / pre_filter / post_filter_pipeline directly;
    # the pre_filter restricts results to this task's documents.
    data = vectorstore.similarity_search(
        query=query,
        k=k,
        pre_filter={"text": {"path": "task_id", "query": task_id}},
        post_filter_pipeline=[{"$limit": k}],  # limit results to top k
    )
    # data = [d.dict() for d in data]
    # print(data[0].metadata.exclude({'_id','embedding'}))
    # pprint.pprint(data[0].metadata)
    return [
        {"text": d.page_content, "start": d.metadata["start"], "end": d.metadata["end"]}
        for d in data
    ]
    # agent = vectorstore.as_retriever()
    # return agent.get_relevant_documents


def encode(temp: list[Document]):
    """Embed the Documents and store them in the Atlas vector-search collection."""
    mongo_client = MongoClient(mongoDB)
    model_name = "BAAI/bge-base-en"
    collection = mongo_client["transcriptions"]["videos"]
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    index_name = "test_embedding"
    # from_documents is a classmethod: it embeds the documents and inserts them
    # into the collection, so no separate vectorstore instance is needed.
    MongoDBAtlasVectorSearch.from_documents(
        temp, embedding=embeddings, collection=collection, index_name=index_name
    )
    # return embeddings.embed_documents(texts=[d.page_content for d in temp])
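

# A minimal usage sketch, not part of the original module: it assumes transcript
# segments shaped like {"text": ..., "start": ..., "end": ...}, a reachable Atlas
# cluster with the "test_embedding" search index, and the completion_base /
# openai_api_key / MONGO_DB environment variables set. The "demo-task" id and the
# sample segments are hypothetical.
if __name__ == "__main__":
    import asyncio

    segments = [
        {"text": "Hello and welcome to the video.", "start": 0.0, "end": 2.5},
        {"text": "Today we talk about vector search.", "start": 2.5, "end": 5.0},
    ]
    docs = generateChunks(segments, task_id="demo-task", n=100)
    encode(docs)  # embed the chunks and store them in Atlas
    hits = search("What is the video about?", task_id="demo-task")
    context = " ".join(h["text"] for h in hits)
    answer = asyncio.run(fetch_data("What is the video about?", context))
    print(answer)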