import json
import os

import aiohttp
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import MongoDBAtlasVectorSearch
from motor.motor_asyncio import AsyncIOMotorClient
from pymongo import MongoClient

# Configuration is read from the environment.
completion_base = os.environ.get("completion_base")
openai_api_key = os.environ.get("openai_api_key")
mongoDB = os.environ.get("MONGO_DB")

# Prompt template: retrieved transcript chunks are injected as context and
# the model is asked to explain the user's question against them.
template = """### Given the following context

### Context

{context}

### Use it to explain the question: {question}
"""


async def fetch_data(question, context):
    """Ask the chat-completion endpoint to answer `question` using `context`."""
    url = completion_base

    payload = json.dumps(
        {
            "messages": [
                {
                    "role": "system",
                    "content": "### You provide explanations based on the provided context",
                },
                {
                    "role": "user",
                    "content": template.format(context=context, question=question),
                },
            ],
            "model": "gpt-3.5-turbo",
            "temperature": 1,
            "presence_penalty": 0,
            "top_p": 0.95,
            "frequency_penalty": 0,
            "stream": False,
        }
    )

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {openai_api_key}",
    }

    # Non-blocking HTTP call; returns only the assistant's reply text.
    async with aiohttp.ClientSession() as session:
        async with session.post(url, headers=headers, data=payload) as response:
            body = await response.json()
            return body["choices"][0]["message"]["content"]


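# A minimal usage sketch for fetch_data, assuming `completion_base` points at
# an OpenAI-compatible /chat/completions URL. The question and context strings
# are illustrative only:
#
#     import asyncio
#
#     answer = asyncio.run(
#         fetch_data("What is discussed here?", "retrieved transcript text")
#     )
#     print(answer)

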
async def delete_documents(task_id):
    """Remove every stored chunk that belongs to one transcription task."""
    client = AsyncIOMotorClient(mongoDB)
    db = client["transcriptions"]
    collection = db["videos"]

    result = await collection.delete_many({"task_id": task_id})
    print(f"Deleted {result.deleted_count} document(s)")


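# Hypothetical cleanup call; delete_documents is a coroutine, so it needs an
# event loop (the task id below is illustrative):
#
#     asyncio.run(delete_documents("demo-task"))

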
def generateChunks(chunks, task_id, n=100):
    """Merge transcript segments into batches of `n` and wrap them as Documents."""
    combined = [chunks[i : i + n] for i in range(0, len(chunks), n)]
    result = []
    for chunk in combined:
        data = {"text": ""}
        for i, item in enumerate(chunk):
            # Keep the start time of the first segment and the end time of the
            # last one; enumerate avoids list.index(), which misfires when two
            # segments compare equal.
            if i == 0:
                data["start"] = item["start"]
            if i == len(chunk) - 1:
                data["end"] = item["end"]
            data["text"] += " " + item["text"]

        temp = Document(
            page_content=data["text"],
            metadata={"start": data["start"], "end": data["end"], "task_id": task_id},
        )
        result.append(temp)
    return result


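# Example of the input generateChunks expects: Whisper-style transcript
# segments with start/end timestamps. The sample data is illustrative, not
# from the source:
#
#     segments = [
#         {"start": 0.0, "end": 2.5, "text": "Hello and welcome."},
#         {"start": 2.5, "end": 5.0, "text": "Today we cover vector search."},
#     ]
#     docs = generateChunks(segments, task_id="abc123", n=100)

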
def search(query: str, task_id: str):
    """Vector-search the stored chunks, restricted to a single task_id."""
    mongo_client = MongoClient(mongoDB)
    model_name = "BAAI/bge-base-en"
    collection = mongo_client["transcriptions"]["videos"]
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    index_name = "test_embedding"
    k = 5
    vectorstore = MongoDBAtlasVectorSearch(
        collection,
        embedding=embeddings,
        index_name=index_name,
    )

    # similarity_search takes k, pre_filter and post_filter_pipeline directly;
    # search_kwargs is a retriever option and would be ignored here.
    data = vectorstore.similarity_search(
        query=query,
        k=k,
        pre_filter={"text": {"path": "task_id", "query": task_id}},
        post_filter_pipeline=[{"$limit": k}],
    )

    return [
        {"text": d.page_content, "start": d.metadata["start"], "end": d.metadata["end"]}
        for d in data
    ]


def encode(temp: list[Document]):
    """Embed the Documents and persist them to the Atlas vector collection."""
    mongo_client = MongoClient(mongoDB)
    model_name = "BAAI/bge-base-en"
    collection = mongo_client["transcriptions"]["videos"]
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    index_name = "test_embedding"
    # from_documents is a classmethod, so constructing a throwaway instance
    # first was redundant; one call embeds and inserts the documents.
    MongoDBAtlasVectorSearch.from_documents(
        temp, embedding=embeddings, collection=collection, index_name=index_name
    )
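
# A hypothetical end-to-end run, assuming MONGO_DB and the completion
# credentials are set and an Atlas vector index named "test_embedding" exists
# on transcriptions.videos. The segments and task id below are illustrative.
if __name__ == "__main__":
    import asyncio

    segments = [
        {"start": 0.0, "end": 4.2, "text": "We introduce MongoDB Atlas vector search."},
        {"start": 4.2, "end": 9.8, "text": "Then we embed transcripts with BGE."},
    ]
    task_id = "demo-task"

    docs = generateChunks(segments, task_id)  # batch the transcript segments
    encode(docs)  # embed and store them in Atlas
    hits = search("What is this video about?", task_id)
    context = " ".join(hit["text"] for hit in hits)  # stitch retrieved chunks
    answer = asyncio.run(fetch_data("What is this video about?", context))
    print(answer)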