# Description: Pinecone Serverless Class for Resonate
# Reference: https://www.pinecone.io/docs/
import datetime
import json
import os
import time
import uuid

import pandas as pd
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings
from pinecone import Pinecone, ServerlessSpec

def load_json_config(json_file_path="./config/config.json"):
    """Load the JSON configuration file used to initialize PineconeServerless."""
    with open(json_file_path, "r") as file:
        data = json.load(file)
    return data


class PineconeServerless:
    def __init__(self) -> None:
        print("Pinecone Serverless Initializing")
        json_config = load_json_config()
        # load_dotenv("./config/.env")  # Uncomment to load keys from a local .env file.
        self.PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
        self.OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
        if self.PINECONE_API_KEY is None:
            # Fail fast: every subsequent call needs an authenticated client.
            raise ValueError("PINECONE_API_KEY is not set in the environment")
        self.pinecone = Pinecone(api_key=self.PINECONE_API_KEY)
        self._init_config(json_config)
        self.meeting_title = None
        self.base_data_path = "./data/jsonMetaDataFiles/"
        self.master_json_file = f"{self.base_data_path}{self.master_json_filename}.json"
        self._create_master_json()
        self._create_index()
        self.response = None
        print("Pinecone Serverless Initialized")

    def _init_config(self, json_config) -> None:
        # Expose every config key as a lowercase instance attribute.
        for key, value in json_config.items():
            setattr(self, key.lower(), value)
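
    # Illustrative shape of ./config/config.json, inferred from the attributes
    # this class reads; key case does not matter because _init_config lowercases
    # it, and every value below is an assumption, not the project's real config:
    #
    # {
    #     "PINECONE_INDEX_NAME": "resonate-meetings",
    #     "PINECONE_NAMESPACE": "default",
    #     "PINECONE_METRIC": "cosine",
    #     "PINECONE_VECTOR_DIMENSION": 1536,
    #     "PINECONE_CLOUD_PROVIDER": "aws",
    #     "PINECONE_REGION": "us-west-2",
    #     "PINECONE_UPSERT_BATCH_LIMIT": 100,
    #     "PINECONE_TOP_K_RESULTS": 5,
    #     "PINECONE_DELTA_WINDOW": 2,
    #     "EMBEDDING_PROVIDER": "OpenAI",
    #     "EMBEDDING_MODEL_NAME": "text-embedding-ada-002",
    #     "MASTER_JSON_FILENAME": "master_meeting_details"
    # }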

    def check_index_already_exists(self) -> bool:
        # list_indexes() returns an IndexList object; compare against its names.
        return self.pinecone_index_name in self.pinecone.list_indexes().names()

    def _get_vector_embedder(self):
        if self.embedding_provider == "OpenAI":
            return OpenAIEmbeddings(model=self.embedding_model_name)
        raise ValueError(f"Unsupported embedding provider: {self.embedding_provider}")

    def _get_index(self):
        return self.pinecone.Index(self.pinecone_index_name)

    def _create_index(self) -> None:
        """Creates a new serverless index in Pinecone if it does not already exist."""
        pinecone_indexes_list = [
            index.get("name")
            for index in self.pinecone.list_indexes().get("indexes", [])
        ]
        if self.pinecone_index_name not in pinecone_indexes_list:
            try:
                self.pinecone.create_index(
                    name=self.pinecone_index_name,
                    metric=self.pinecone_metric,
                    dimension=self.pinecone_vector_dimension,
                    spec=ServerlessSpec(
                        cloud=self.pinecone_cloud_provider,
                        region=self.pinecone_region,
                        # pod_type="p1.x1",  # Future use
                    ),
                )
                # Block until the new index reports ready.
                while not self.pinecone.describe_index(self.pinecone_index_name).status["ready"]:
                    time.sleep(5)
            except Exception as e:
                print("Index creation failed: ", e)

    def describe_index_stats(self) -> dict:
        try:
            index = self._get_index()
            return index.describe_index_stats()
        except Exception as e:
            print("Index does not exist: ", e)
            return {}

    def _delete_index(self) -> None:
        try:
            self.pinecone.delete_index(self.pinecone_index_name)
        except Exception as e:
            print("Index does not exist: ", e)

    def _create_master_json(self) -> None:
        """Create the master JSON file if it does not already exist."""
        os.makedirs(self.base_data_path, exist_ok=True)
        if not os.path.exists(self.master_json_file):
            data = {
                "index": self.pinecone_index_name,
                "namespace": self.pinecone_namespace,
                "last_conversation_no": 0,
                "meeting_uuids": [],
                "meetings": [],
            }
            with open(self.master_json_file, "w") as f:
                json.dump(data, f, indent=4)
            print(f"Created {self.master_json_file}")

    def _update_master_json(
        self,
        meeting_uuid: str,
        meeting_title: str,
        last_conversation_no: int,
        meeting_video_file: bool,
        time_stamp: str,
    ) -> dict:
        """Return the master JSON data updated with the new meeting's details;
        the caller writes it back to disk."""
        with open(self.master_json_file, "r") as f:
            data = json.load(f)
        data["meeting_uuids"] = list(set(data["meeting_uuids"] + [meeting_uuid]))
        data["last_conversation_no"] = last_conversation_no
        data["meetings"].append(
            {
                "meeting_uuid": meeting_uuid,
                "meeting_title": meeting_title,
                "meeting_date": time_stamp,
                "meeting_video_file": meeting_video_file,
            }
        )
        return data

    def _get_meeting_members(self, transcript: pd.DataFrame) -> list[str]:
        return list(transcript["speaker_label"].unique())

    def _create_new_meeting_json(
        self,
        meeting_uuid: str,
        meeting_title: str,
        last_conversation_no: int,
        meeting_members: list[str],
        meeting_video_file: bool,
        time_stamp: str,
        meeting_summary: str,
    ) -> None:
        """Write a per-meeting JSON file holding the meeting's details."""
        data = {
            "index": self.pinecone_index_name,
            "namespace": self.pinecone_namespace,
            "meeting_title": meeting_title,
            "meeting_uuid": meeting_uuid,
            "meeting_date": time_stamp,
            "last_conversation_no": last_conversation_no,
            "meeting_video_file": meeting_video_file,
            "meeting_members": meeting_members,
            "meeting_summary": meeting_summary,
        }
        meeting_details_file = os.path.join(self.base_data_path, f"{meeting_uuid}.json")
        with open(meeting_details_file, "w") as f:
            json.dump(data, f, indent=4)

    def _get_last_conversation_no(self) -> int:
        with open(self.master_json_file, "r") as f:
            data = json.load(f)
        return data["last_conversation_no"]

    def _set_new_meeting_json(
        self,
        meeting_uuid: str,
        meeting_title: str,
        last_conversation_no: int,
        meeting_members: list[str],
        meeting_video_file: bool,
        meeting_summary: str,
    ) -> None:
        """Write the per-meeting JSON file and update the master JSON file
        with the new meeting's details."""
        time_stamp = str(datetime.datetime.now().strftime("%Y%m%d%H%M%S"))
        self._create_new_meeting_json(
            meeting_uuid,
            meeting_title,
            last_conversation_no,
            meeting_members,
            meeting_video_file,
            time_stamp,
            meeting_summary,
        )
        data = self._update_master_json(
            meeting_uuid,
            meeting_title,
            last_conversation_no,
            meeting_video_file,
            time_stamp,
        )
        with open(self.master_json_file, "w") as f:
            json.dump(data, f, indent=4)

    def _convert_to_hr_min_sec(self, time_in_minutes) -> str:
        # Convert a duration in (possibly fractional) minutes to "HH:MM:SS",
        # e.g. 90.5 minutes -> "01:30:30".
        hours = int(time_in_minutes // 60)
        minutes = int(time_in_minutes % 60)
        seconds = int((time_in_minutes - int(time_in_minutes)) * 60)
        return f"{hours:02d}:{minutes:02d}:{seconds:02d}"

    def pinecone_upsert(
        self,
        transcript: pd.DataFrame,
        meeting_uuid: str = "",
        meeting_video_file: bool = False,
        meeting_title: str = "Unnamed",
        meeting_summary: str = "",
    ) -> None:
        """Embed the transcript and upsert it into Pinecone in batches.

        Vector IDs are sequential integers that continue from the master file's
        last_conversation_no, which lets query_delta_conversations() fetch
        neighboring conversations by ID arithmetic.
        """
        print("Upserting transcript into Pinecone...")
        texts = []
        metadatas = []
        last_conversation_no = int(self._get_last_conversation_no())
        embed = self._get_vector_embedder()
        meeting_members = self._get_meeting_members(transcript)
        index = self._get_index()
        for _, record in transcript.iterrows():
            start_time = self._convert_to_hr_min_sec(record["start_time"])
            metadata = {
                "speaker": record["speaker_label"],
                "start_time": start_time,
                "text": record["text"],
                "meeting_uuid": meeting_uuid,
            }
            texts.append(record["text"])
            metadatas.append(metadata)
            if len(texts) >= self.pinecone_upsert_batch_limit:
                ids = [
                    str(i + 1)
                    for i in range(last_conversation_no, last_conversation_no + len(texts))
                ]
                last_conversation_no += len(texts)
                embeds = embed.embed_documents(texts)
                try:
                    index.upsert(
                        vectors=list(zip(ids, embeds, metadatas)),
                        namespace=self.pinecone_namespace,
                    )
                except Exception as e:
                    print("Error upserting into Pinecone: ", e)
                texts = []
                metadatas = []
        # Upsert the remaining texts.
        if len(texts) > 0:
            ids = [
                str(i + 1)
                for i in range(last_conversation_no, last_conversation_no + len(texts))
            ]
            last_conversation_no += len(texts)
            embeds = embed.embed_documents(texts)
            try:
                index.upsert(
                    vectors=list(zip(ids, embeds, metadatas)),
                    namespace=self.pinecone_namespace,
                )
            except Exception as e:
                print("Error upserting into Pinecone: ", e)
        self._set_new_meeting_json(
            meeting_uuid,
            meeting_title,
            last_conversation_no,
            meeting_members,
            meeting_video_file,
            meeting_summary,
        )
        print("Upserted transcript into Pinecone")

    def _extract_id_from_response(self, response) -> list[int]:
        # Matched vector IDs are stored as strings; cast them back to int.
        if response:
            return [int(match["id"]) for match in response["matches"]]
        return []

    def query_pinecone(
        self,
        query: str,
        in_filter: list[str] | None = None,
        complete_db_flag: bool = False,
    ) -> list:
        """Query Pinecone for the given query string; in_filter restricts the
        search to the given meeting_uuids, and if complete_db_flag is True the
        entire namespace is searched (use this when clustering is disabled)."""
        try:
            index = self._get_index()
            embed = self._get_vector_embedder()
            query_filter = (
                None if complete_db_flag else {"meeting_uuid": {"$in": in_filter or []}}
            )
            self.response = index.query(
                vector=embed.embed_documents([query])[0],
                namespace=self.pinecone_namespace,
                top_k=self.pinecone_top_k_results,
                include_metadata=True,
                filter=query_filter,
            )
            return self.response
        except Exception as e:
            print("Error querying Pinecone: ", e)
            return []
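
    # Typical retrieval flow (sketch): call query_pinecone() first, which caches
    # the raw response on self.response, then call query_delta_conversations()
    # to pull the conversation window around every match.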

    def query_delta_conversations(self) -> dict[str, pd.DataFrame]:
        """Fetch the conversation window (+/- pinecone_delta_window) around each
        match in the last query response, grouped per meeting."""
        ids = self._extract_id_from_response(self.response)
        last_conversation_no = self._get_last_conversation_no()
        index = self._get_index()
        conversation = {}
        for id in ids:
            # Clamp the window to the valid ID range [1, last_conversation_no].
            left = max(id - self.pinecone_delta_window, 1)
            right = min(id + self.pinecone_delta_window, last_conversation_no)
            window = [str(i) for i in range(left, right + 1)]
            try:
                print("Contextual Window Conversation IDs: ", window)
                fetch_response = index.fetch(
                    ids=window, namespace=self.pinecone_namespace
                )
                conversation[id] = fetch_response
            except Exception as e:
                print("Error fetching from Pinecone for id:", id, "Error:", e)
                continue
        return self._parse_fetch_conversations(conversation)

    def _parse_fetch_conversations(self, conversation) -> dict[str, pd.DataFrame]:
        """Flatten the fetched windows into one DataFrame per meeting_uuid."""
        data_rows = []
        for primary_hit_id, primary_hit_data in conversation.items():
            for _, vector_data in primary_hit_data["vectors"].items():
                id = vector_data["id"]
                meeting_uuid = vector_data["metadata"]["meeting_uuid"]
                speaker = vector_data["metadata"]["speaker"]
                start_time = vector_data["metadata"]["start_time"]
                text = vector_data["metadata"]["text"]
                data_rows.append(
                    (primary_hit_id, id, meeting_uuid, speaker, start_time, text)
                )
        columns = ["primary_id", "id", "meeting_uuid", "speaker", "start_time", "text"]
        delta_conversation_df = pd.DataFrame(data_rows, columns=columns)
        # IDs arrive as strings; sort numerically so "10" does not precede "2".
        delta_conversation_df["id"] = delta_conversation_df["id"].astype(int)
        delta_conversation_df = delta_conversation_df.sort_values(by=["id"])
        delta_conversation_df = delta_conversation_df.drop_duplicates(subset=["id"])
        # Create a separate DataFrame for each meeting_uuid.
        grouped_dfs = {
            group_name: group.reset_index(drop=True)
            for group_name, group in delta_conversation_df.groupby("meeting_uuid")
        }
        return grouped_dfs


if __name__ == "__main__":
    pinecone = PineconeServerless()
    print(pinecone.describe_index_stats())
    for i in range(1, 3):
        print(i)
        transcript = pd.read_csv(f"./data/transcriptFiles/healthcare_{i}.csv")
        transcript.dropna(inplace=True)
        pinecone.pinecone_upsert(
            transcript,
            meeting_uuid=str(uuid.uuid4()),
            meeting_video_file=False,
            meeting_title=f"Healthcare Meeting {i}",
            meeting_summary=f"Healthcare Meeting Summary Meeting {i}",
        )
        time.sleep(5)
    print(pinecone.describe_index_stats())
    query = "I am one of the directors in Wappingers Central School District."
    response1 = pinecone.query_pinecone(query, complete_db_flag=True)
    print(response1)
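
    # Follow-up sketch: pull the conversation windows around each hit, grouped
    # per meeting (assumes the query above returned matches).
    grouped = pinecone.query_delta_conversations()
    for meeting_uuid, df in grouped.items():
        print(meeting_uuid)
        print(df[["speaker", "start_time", "text"]])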