import json
import os

import aiohttp
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import MongoDBAtlasVectorSearch
from motor.motor_asyncio import AsyncIOMotorClient
from pymongo import MongoClient

# Configuration is read from the environment.
completion_base = os.environ.get("completion_base")
openai_api_key = os.environ.get("openai_api_key")
mongoDB = os.environ.get("MONGO_DB")

# Prompt template: retrieved transcript chunks are injected as context and
# the model is asked to explain the user's question against them.
template = """### Given the following context

### Context

{context}

### Use it to explain the question: {question}
"""


async def fetch_data(question, context):
    """Ask the chat-completion endpoint to answer `question` using `context`."""
    url = completion_base

    payload = json.dumps(
        {
            "messages": [
                {
                    "role": "system",
                    "content": "### You provide explanations based on the provided context",
                },
                {
                    "role": "user",
                    "content": template.format(context=context, question=question),
                },
            ],
            "model": "gpt-3.5-turbo",
            "temperature": 1,
            "presence_penalty": 0,
            "top_p": 0.95,
            "frequency_penalty": 0,
            "stream": False,
        }
    )

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {openai_api_key}",
    }

    # Non-blocking HTTP call; returns only the assistant's reply text.
    async with aiohttp.ClientSession() as session:
        async with session.post(url, headers=headers, data=payload) as response:
            body = await response.json()
            return body["choices"][0]["message"]["content"]


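# A minimal usage sketch for fetch_data, assuming `completion_base` points at
# an OpenAI-compatible /chat/completions URL. The question and context strings
# are illustrative only:
#
#     import asyncio
#
#     answer = asyncio.run(
#         fetch_data("What is discussed here?", "retrieved transcript text")
#     )
#     print(answer)

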
async def delete_documents(task_id):
    """Remove every stored chunk that belongs to one transcription task."""
    client = AsyncIOMotorClient(mongoDB)
    db = client["transcriptions"]
    collection = db["videos"]

    result = await collection.delete_many({"task_id": task_id})
    print(f"Deleted {result.deleted_count} document(s)")


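# Hypothetical cleanup call; delete_documents is a coroutine, so it needs an
# event loop (the task id below is illustrative):
#
#     asyncio.run(delete_documents("demo-task"))

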
def generateChunks(chunks, task_id, n=100):
    """Merge transcript segments into batches of `n` and wrap them as Documents."""
    combined = [chunks[i : i + n] for i in range(0, len(chunks), n)]
    result = []
    for chunk in combined:
        data = {"text": ""}
        for i, item in enumerate(chunk):
            # Keep the start time of the first segment and the end time of the
            # last one; enumerate avoids list.index(), which misfires when two
            # segments compare equal.
            if i == 0:
                data["start"] = item["start"]
            if i == len(chunk) - 1:
                data["end"] = item["end"]
            data["text"] += " " + item["text"]

        temp = Document(
            page_content=data["text"],
            metadata={"start": data["start"], "end": data["end"], "task_id": task_id},
        )
        result.append(temp)
    return result


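# Example of the input generateChunks expects: Whisper-style transcript
# segments with start/end timestamps. The sample data is illustrative, not
# from the source:
#
#     segments = [
#         {"start": 0.0, "end": 2.5, "text": "Hello and welcome."},
#         {"start": 2.5, "end": 5.0, "text": "Today we cover vector search."},
#     ]
#     docs = generateChunks(segments, task_id="abc123", n=100)

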
def search(query: str, task_id: str):
    """Vector-search the stored chunks, restricted to a single task_id."""
    mongo_client = MongoClient(mongoDB)
    model_name = "BAAI/bge-base-en"
    collection = mongo_client["transcriptions"]["videos"]
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    index_name = "test_embedding"
    k = 5
    vectorstore = MongoDBAtlasVectorSearch(
        collection,
        embedding=embeddings,
        index_name=index_name,
    )

    # similarity_search takes k, pre_filter and post_filter_pipeline directly;
    # search_kwargs is a retriever option and would be ignored here.
    data = vectorstore.similarity_search(
        query=query,
        k=k,
        pre_filter={"text": {"path": "task_id", "query": task_id}},
        post_filter_pipeline=[{"$limit": k}],
    )

    return [
        {"text": d.page_content, "start": d.metadata["start"], "end": d.metadata["end"]}
        for d in data
    ]


def encode(temp: list[Document]):
    """Embed the Documents and persist them to the Atlas vector collection."""
    mongo_client = MongoClient(mongoDB)
    model_name = "BAAI/bge-base-en"
    collection = mongo_client["transcriptions"]["videos"]
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    index_name = "test_embedding"
    # from_documents is a classmethod, so constructing a throwaway instance
    # first was redundant; one call embeds and inserts the documents.
    MongoDBAtlasVectorSearch.from_documents(
        temp, embedding=embeddings, collection=collection, index_name=index_name
    )
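
# A hypothetical end-to-end run, assuming MONGO_DB and the completion
# credentials are set and an Atlas vector index named "test_embedding" exists
# on transcriptions.videos. The segments and task id below are illustrative.
if __name__ == "__main__":
    import asyncio

    segments = [
        {"start": 0.0, "end": 4.2, "text": "We introduce MongoDB Atlas vector search."},
        {"start": 4.2, "end": 9.8, "text": "Then we embed transcripts with BGE."},
    ]
    task_id = "demo-task"

    docs = generateChunks(segments, task_id)  # batch the transcript segments
    encode(docs)  # embed and store them in Atlas
    hits = search("What is this video about?", task_id)
    context = " ".join(hit["text"] for hit in hits)  # stitch retrieved chunks
    answer = asyncio.run(fetch_data("What is this video about?", context))
    print(answer)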