tranny/App/Embedding/utils/Initialize.py
import json
import os

import aiohttp
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import MongoDBAtlasVectorSearch
from motor.motor_asyncio import AsyncIOMotorClient
from pymongo import MongoClient

# Configuration is read from the environment.
completion_base = os.environ.get("completion_base")  # chat-completions endpoint URL
openai_api_key = os.environ.get("openai_api_key")
mongoDB = os.environ.get("MONGO_DB")  # MongoDB Atlas connection string
template = """### Given the following context
### Context
{context}
### Use it to explain the question: {question}
"""
async def fetch_data(question, context):
    """Ask the chat-completions endpoint to explain `question` using `context`."""
    url = completion_base
    payload = json.dumps(
        {
            "messages": [
                {
                    "role": "system",
                    "content": "### You provide explanations based on the provided context",
                },
                {
                    "role": "user",
                    "content": template.format(context=context, question=question),
                },
            ],
            "model": "gpt-3.5-turbo",
            "temperature": 1,
            "presence_penalty": 0,
            "top_p": 0.95,
            "frequency_penalty": 0,
            "stream": False,
        }
    )
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {openai_api_key}",
    }
    async with aiohttp.ClientSession() as session:
        async with session.post(url, headers=headers, data=payload) as response:
            body = await response.json()
    return body["choices"][0]["message"]["content"]
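

# Usage sketch (hypothetical caller, not part of this module's API): gather
# context for a task with search() below, then pass it to fetch_data.
#
#     hits = search("what is discussed?", task_id)
#     context = " ".join(h["text"] for h in hits)
#     answer = await fetch_data("what is discussed?", context)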
async def delete_documents(task_id):
    """Delete every stored chunk belonging to the given transcription task."""
    client = AsyncIOMotorClient(mongoDB)
    db = client["transcriptions"]
    collection = db["videos"]
    result = await collection.delete_many({"task_id": task_id})
    print(f"Deleted {result.deleted_count} document(s)")
def generateChunks(chunks, task_id, n=100):
    """Merge transcript segments into Documents of up to n segments each.

    Each segment is expected to be a dict with "text", "start" and "end" keys;
    the merged Document keeps the start of its first segment and the end of
    its last one, so search hits can be mapped back to timestamps.
    """
    combined = [chunks[i : i + n] for i in range(0, len(chunks), n)]
    result = []
    for chunk in combined:
        result.append(
            Document(
                page_content=" ".join(item["text"] for item in chunk),
                metadata={
                    "start": chunk[0]["start"],
                    "end": chunk[-1]["end"],
                    "task_id": task_id,
                },
            )
        )
    return result
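

# Example input shape (hypothetical transcript segments):
#
#     segments = [
#         {"text": "hello there", "start": 0.0, "end": 1.2},
#         {"text": "and welcome", "start": 1.2, "end": 2.4},
#     ]
#     encode(generateChunks(segments, task_id="abc123", n=100))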
def search(query: str, task_id: str):
    """Vector-search the stored chunks of one task and return the top matches."""
    mongo_client = MongoClient(mongoDB)
    model_name = "BAAI/bge-base-en"
    collection = mongo_client["transcriptions"]["videos"]
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    index_name = "test_embedding"
    k = 5
    vectorstore = MongoDBAtlasVectorSearch(
        collection,
        embedding=embeddings,
        index_name=index_name,
    )
    data = vectorstore.similarity_search(
        query=query,
        k=k,
        # Atlas Search "text" operator filter: only consider chunks whose
        # task_id matches, before scoring.
        pre_filter={"text": {"path": "task_id", "query": task_id}},
        post_filter_pipeline=[{"$limit": k}],  # keep only the top k results
    )
    return [
        {"text": d.page_content, "start": d.metadata["start"], "end": d.metadata["end"]}
        for d in data
    ]
def encode(temp: list[Document]):
    """Embed the given Documents and insert them into the Atlas collection."""
    mongo_client = MongoClient(mongoDB)
    model_name = "BAAI/bge-base-en"
    collection = mongo_client["transcriptions"]["videos"]
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    index_name = "test_embedding"
    vectorstore = MongoDBAtlasVectorSearch(
        collection, embeddings, index_name=index_name
    )
    # add_documents embeds and inserts into the existing store; calling the
    # classmethod from_documents on the instance would build a second,
    # redundant vectorstore.
    vectorstore.add_documents(temp)
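

# End-to-end demo (hypothetical; assumes the env vars above are set and the
# Atlas vector index "test_embedding" exists):
#
#     import asyncio
#
#     async def demo(segments, task_id):
#         encode(generateChunks(segments, task_id))
#         hits = search("summarize the intro", task_id)
#         context = " ".join(h["text"] for h in hits)
#         print(await fetch_data("summarize the intro", context))
#         await delete_documents(task_id)
#
#     # asyncio.run(demo(segments, "abc123"))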