# Description: Pinecone Serverless Class for Resonate
# Reference: https://www.pinecone.io/docs/
import datetime
import json
import os
import time
import uuid

import pandas as pd
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings
from pinecone import Pinecone, ServerlessSpec

def load_json_config(json_file_path="./config/config.json"):
    """Load the JSON configuration file used to initialize PineconeServerless."""
    with open(json_file_path, "r") as file:
        data = json.load(file)
    return data


class PineconeServerless:
    def __init__(self) -> None:
        print("Pinecone Serverless Initializing")
        json_config = load_json_config()
        # load_dotenv("./config/.env")  # Uncomment to load keys from a local .env file.
        self.PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
        self.OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
        if self.PINECONE_API_KEY is None:
            # Fail fast: every subsequent call needs an authenticated client.
            raise ValueError("PINECONE_API_KEY is not set in the environment")
        self.pinecone = Pinecone(api_key=self.PINECONE_API_KEY)
        self._init_config(json_config)
        self.meeting_title = None
        self.base_data_path = "./data/jsonMetaDataFiles/"
        self.master_json_file = f"{self.base_data_path}{self.master_json_filename}.json"
        self._create_master_json()
        self._create_index()
        self.response = None
        print("Pinecone Serverless Initialized")

    def _init_config(self, json_config) -> None:
        # Expose every config key as a lowercase instance attribute.
        for key, value in json_config.items():
            setattr(self, key.lower(), value)
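
    # Illustrative shape of ./config/config.json, inferred from the attributes
    # this class reads; key case does not matter because _init_config lowercases
    # it, and every value below is an assumption, not the project's real config:
    #
    # {
    #     "PINECONE_INDEX_NAME": "resonate-meetings",
    #     "PINECONE_NAMESPACE": "default",
    #     "PINECONE_METRIC": "cosine",
    #     "PINECONE_VECTOR_DIMENSION": 1536,
    #     "PINECONE_CLOUD_PROVIDER": "aws",
    #     "PINECONE_REGION": "us-west-2",
    #     "PINECONE_UPSERT_BATCH_LIMIT": 100,
    #     "PINECONE_TOP_K_RESULTS": 5,
    #     "PINECONE_DELTA_WINDOW": 2,
    #     "EMBEDDING_PROVIDER": "OpenAI",
    #     "EMBEDDING_MODEL_NAME": "text-embedding-ada-002",
    #     "MASTER_JSON_FILENAME": "master_meeting_details"
    # }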

    def check_index_already_exists(self) -> bool:
        # list_indexes() returns an IndexList object; compare against its names.
        return self.pinecone_index_name in self.pinecone.list_indexes().names()

    def _get_vector_embedder(self):
        if self.embedding_provider == "OpenAI":
            return OpenAIEmbeddings(model=self.embedding_model_name)
        raise ValueError(f"Unsupported embedding provider: {self.embedding_provider}")

    def _get_index(self):
        return self.pinecone.Index(self.pinecone_index_name)

    def _create_index(self) -> None:
        """Creates a new serverless index in Pinecone if it does not already exist."""
        pinecone_indexes_list = [
            index.get("name")
            for index in self.pinecone.list_indexes().get("indexes", [])
        ]
        if self.pinecone_index_name not in pinecone_indexes_list:
            try:
                self.pinecone.create_index(
                    name=self.pinecone_index_name,
                    metric=self.pinecone_metric,
                    dimension=self.pinecone_vector_dimension,
                    spec=ServerlessSpec(
                        cloud=self.pinecone_cloud_provider,
                        region=self.pinecone_region,
                        # pod_type="p1.x1",  # Future use
                    ),
                )
                # Block until the new index reports ready.
                while not self.pinecone.describe_index(self.pinecone_index_name).status["ready"]:
                    time.sleep(5)
            except Exception as e:
                print("Index creation failed: ", e)

    def describe_index_stats(self) -> dict:
        try:
            index = self._get_index()
            return index.describe_index_stats()
        except Exception as e:
            print("Index does not exist: ", e)
            return {}

    def _delete_index(self) -> None:
        try:
            self.pinecone.delete_index(self.pinecone_index_name)
        except Exception as e:
            print("Index does not exist: ", e)

    def _create_master_json(self) -> None:
        """Create the master JSON file if it does not already exist."""
        os.makedirs(self.base_data_path, exist_ok=True)
        if not os.path.exists(self.master_json_file):
            data = {
                "index": self.pinecone_index_name,
                "namespace": self.pinecone_namespace,
                "last_conversation_no": 0,
                "meeting_uuids": [],
                "meetings": [],
            }
            with open(self.master_json_file, "w") as f:
                json.dump(data, f, indent=4)
            print(f"Created {self.master_json_file}")

    def _update_master_json(
        self,
        meeting_uuid: str,
        meeting_title: str,
        last_conversation_no: int,
        meeting_video_file: bool,
        time_stamp: str,
    ) -> dict:
        """Return the master JSON data updated with the new meeting's details;
        the caller writes it back to disk."""
        with open(self.master_json_file, "r") as f:
            data = json.load(f)
        data["meeting_uuids"] = list(set(data["meeting_uuids"] + [meeting_uuid]))
        data["last_conversation_no"] = last_conversation_no
        data["meetings"].append(
            {
                "meeting_uuid": meeting_uuid,
                "meeting_title": meeting_title,
                "meeting_date": time_stamp,
                "meeting_video_file": meeting_video_file,
            }
        )
        return data

    def _get_meeting_members(self, transcript: pd.DataFrame) -> list[str]:
        return list(transcript["speaker_label"].unique())

    def _create_new_meeting_json(
        self,
        meeting_uuid: str,
        meeting_title: str,
        last_conversation_no: int,
        meeting_members: list[str],
        meeting_video_file: bool,
        time_stamp: str,
        meeting_summary: str,
    ) -> None:
        """Write a per-meeting JSON file holding the meeting's details."""
        data = {
            "index": self.pinecone_index_name,
            "namespace": self.pinecone_namespace,
            "meeting_title": meeting_title,
            "meeting_uuid": meeting_uuid,
            "meeting_date": time_stamp,
            "last_conversation_no": last_conversation_no,
            "meeting_video_file": meeting_video_file,
            "meeting_members": meeting_members,
            "meeting_summary": meeting_summary,
        }
        meeting_details_file = os.path.join(self.base_data_path, f"{meeting_uuid}.json")
        with open(meeting_details_file, "w") as f:
            json.dump(data, f, indent=4)

    def _get_last_conversation_no(self) -> int:
        with open(self.master_json_file, "r") as f:
            data = json.load(f)
        return data["last_conversation_no"]

    def _set_new_meeting_json(
        self,
        meeting_uuid: str,
        meeting_title: str,
        last_conversation_no: int,
        meeting_members: list[str],
        meeting_video_file: bool,
        meeting_summary: str,
    ) -> None:
        """Write the per-meeting JSON file and update the master JSON file
        with the new meeting's details."""
        time_stamp = str(datetime.datetime.now().strftime("%Y%m%d%H%M%S"))
        self._create_new_meeting_json(
            meeting_uuid,
            meeting_title,
            last_conversation_no,
            meeting_members,
            meeting_video_file,
            time_stamp,
            meeting_summary,
        )
        data = self._update_master_json(
            meeting_uuid,
            meeting_title,
            last_conversation_no,
            meeting_video_file,
            time_stamp,
        )
        with open(self.master_json_file, "w") as f:
            json.dump(data, f, indent=4)

    def _convert_to_hr_min_sec(self, time_in_minutes) -> str:
        # Convert a duration in (possibly fractional) minutes to "HH:MM:SS",
        # e.g. 90.5 minutes -> "01:30:30".
        hours = int(time_in_minutes // 60)
        minutes = int(time_in_minutes % 60)
        seconds = int((time_in_minutes - int(time_in_minutes)) * 60)
        return f"{hours:02d}:{minutes:02d}:{seconds:02d}"

    def pinecone_upsert(
        self,
        transcript: pd.DataFrame,
        meeting_uuid: str = "",
        meeting_video_file: bool = False,
        meeting_title: str = "Unnamed",
        meeting_summary: str = "",
    ) -> None:
        """Embed the transcript and upsert it into Pinecone in batches.

        Vector IDs are sequential integers that continue from the master file's
        last_conversation_no, which lets query_delta_conversations() fetch
        neighboring conversations by ID arithmetic.
        """
        print("Upserting transcript into Pinecone...")
        texts = []
        metadatas = []
        last_conversation_no = int(self._get_last_conversation_no())
        embed = self._get_vector_embedder()
        meeting_members = self._get_meeting_members(transcript)
        index = self._get_index()
        for _, record in transcript.iterrows():
            start_time = self._convert_to_hr_min_sec(record["start_time"])
            metadata = {
                "speaker": record["speaker_label"],
                "start_time": start_time,
                "text": record["text"],
                "meeting_uuid": meeting_uuid,
            }
            texts.append(record["text"])
            metadatas.append(metadata)
            if len(texts) >= self.pinecone_upsert_batch_limit:
                ids = [
                    str(i + 1)
                    for i in range(last_conversation_no, last_conversation_no + len(texts))
                ]
                last_conversation_no += len(texts)
                embeds = embed.embed_documents(texts)
                try:
                    index.upsert(
                        vectors=list(zip(ids, embeds, metadatas)),
                        namespace=self.pinecone_namespace,
                    )
                except Exception as e:
                    print("Error upserting into Pinecone: ", e)
                texts = []
                metadatas = []
        # Upsert the remaining texts.
        if len(texts) > 0:
            ids = [
                str(i + 1)
                for i in range(last_conversation_no, last_conversation_no + len(texts))
            ]
            last_conversation_no += len(texts)
            embeds = embed.embed_documents(texts)
            try:
                index.upsert(
                    vectors=list(zip(ids, embeds, metadatas)),
                    namespace=self.pinecone_namespace,
                )
            except Exception as e:
                print("Error upserting into Pinecone: ", e)
        self._set_new_meeting_json(
            meeting_uuid,
            meeting_title,
            last_conversation_no,
            meeting_members,
            meeting_video_file,
            meeting_summary,
        )
        print("Upserted transcript into Pinecone")

    def _extract_id_from_response(self, response) -> list[int]:
        # Matched vector IDs are stored as strings; cast them back to int.
        if response:
            return [int(match["id"]) for match in response["matches"]]
        return []

    def query_pinecone(
        self,
        query: str,
        in_filter: list[str] | None = None,
        complete_db_flag: bool = False,
    ) -> list:
        """Query Pinecone for the given query string; in_filter restricts the
        search to the given meeting_uuids, and if complete_db_flag is True the
        entire namespace is searched (use this when clustering is disabled)."""
        try:
            index = self._get_index()
            embed = self._get_vector_embedder()
            query_filter = (
                None if complete_db_flag else {"meeting_uuid": {"$in": in_filter or []}}
            )
            self.response = index.query(
                vector=embed.embed_documents([query])[0],
                namespace=self.pinecone_namespace,
                top_k=self.pinecone_top_k_results,
                include_metadata=True,
                filter=query_filter,
            )
            return self.response
        except Exception as e:
            print("Error querying Pinecone: ", e)
            return []
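
    # Typical retrieval flow (sketch): call query_pinecone() first, which caches
    # the raw response on self.response, then call query_delta_conversations()
    # to pull the conversation window around every match.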

    def query_delta_conversations(self) -> dict[str, pd.DataFrame]:
        """Fetch the conversation window (+/- pinecone_delta_window) around each
        match in the last query response, grouped per meeting."""
        ids = self._extract_id_from_response(self.response)
        last_conversation_no = self._get_last_conversation_no()
        index = self._get_index()
        conversation = {}
        for id in ids:
            # Clamp the window to the valid ID range [1, last_conversation_no].
            left = max(id - self.pinecone_delta_window, 1)
            right = min(id + self.pinecone_delta_window, last_conversation_no)
            window = [str(i) for i in range(left, right + 1)]
            try:
                print("Contextual Window Conversation IDs: ", window)
                fetch_response = index.fetch(
                    ids=window, namespace=self.pinecone_namespace
                )
                conversation[id] = fetch_response
            except Exception as e:
                print("Error fetching from Pinecone for id:", id, "Error:", e)
                continue
        return self._parse_fetch_conversations(conversation)

    def _parse_fetch_conversations(self, conversation) -> dict[str, pd.DataFrame]:
        """Flatten the fetched windows into one DataFrame per meeting_uuid."""
        data_rows = []
        for primary_hit_id, primary_hit_data in conversation.items():
            for _, vector_data in primary_hit_data["vectors"].items():
                id = vector_data["id"]
                meeting_uuid = vector_data["metadata"]["meeting_uuid"]
                speaker = vector_data["metadata"]["speaker"]
                start_time = vector_data["metadata"]["start_time"]
                text = vector_data["metadata"]["text"]
                data_rows.append(
                    (primary_hit_id, id, meeting_uuid, speaker, start_time, text)
                )
        columns = ["primary_id", "id", "meeting_uuid", "speaker", "start_time", "text"]
        delta_conversation_df = pd.DataFrame(data_rows, columns=columns)
        # IDs arrive as strings; sort numerically so "10" does not precede "2".
        delta_conversation_df["id"] = delta_conversation_df["id"].astype(int)
        delta_conversation_df = delta_conversation_df.sort_values(by=["id"])
        delta_conversation_df = delta_conversation_df.drop_duplicates(subset=["id"])
        # Create a separate DataFrame for each meeting_uuid.
        grouped_dfs = {
            group_name: group.reset_index(drop=True)
            for group_name, group in delta_conversation_df.groupby("meeting_uuid")
        }
        return grouped_dfs


if __name__ == "__main__":
    pinecone = PineconeServerless()
    print(pinecone.describe_index_stats())
    for i in range(1, 3):
        print(i)
        transcript = pd.read_csv(f"./data/transcriptFiles/healthcare_{i}.csv")
        transcript.dropna(inplace=True)
        pinecone.pinecone_upsert(
            transcript,
            meeting_uuid=str(uuid.uuid4()),
            meeting_video_file=False,
            meeting_title=f"Healthcare Meeting {i}",
            meeting_summary=f"Healthcare Meeting Summary Meeting {i}",
        )
        time.sleep(5)
    print(pinecone.describe_index_stats())
    query = "I am one of the directors in Wappingers Central School District."
    response1 = pinecone.query_pinecone(query, complete_db_flag=True)
    print(response1)
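
    # Follow-up sketch: pull the conversation windows around each hit, grouped
    # per meeting (assumes the query above returned matches).
    grouped = pinecone.query_delta_conversations()
    for meeting_uuid, df in grouped.items():
        print(meeting_uuid)
        print(df[["speaker", "start_time", "text"]])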