# resonate / init_one_time_utils / pinecone_sample_dataloader.py
# sartajbhuvaji's picture
# Added files from branch huggingface
# 5f773d1 verified
# Uploads the sample meeting transcripts and their summaries to Pinecone
# Runner: python init_one_time_utils/pinecone_sample_dataloader.py
# Average Run Time: 35-40 min
import json
import time
import pandas as pd
import sys
import os
# Put the directory two levels above this file (the project root) on sys.path
# so that the project's own packages can be imported when this script is run
# directly.
_this_file = os.path.realpath(__file__)
_script_dir = os.path.dirname(_this_file)
project_root = os.path.dirname(_script_dir)
if project_root not in sys.path:
    sys.path.append(project_root)
# Importing the PineconeServerless class from the project's module
from src.pinecone.resonate_pinecone_functions import PineconeServerless
class TranscriptProcessor:
    """
    A class to process and upsert transcripts to Pinecone.

    Attributes:
        uuid_to_filename_dict (dict): A mapping from meeting UUIDs to their respective transcript file names.
        pinecone (PineconeServerless): An instance of the PineconeServerless class for database operations.

    Class Attributes:
        DEFAULT_SUMMARY_FILE (str): Summary CSV read by process_transcripts() when no path is given.
        DEFAULT_TRANSCRIPT_FOLDER (str): Folder searched for transcript CSVs when no folder is given.
        UPSERT_SETTLE_SECONDS (int | float): Pause after each successful upsert call.
        EMBEDDING_THROTTLE_SECONDS (int | float): Pause between consecutive transcripts.
    """

    # Input locations, relative to the current working directory.
    DEFAULT_SUMMARY_FILE = "./data/summaryFiles/abstract_summary_data.csv"
    DEFAULT_TRANSCRIPT_FOLDER = "./data/transcriptFiles/"

    # Delays are class-level so they can be tuned (or zeroed) per instance
    # without editing the method bodies.
    UPSERT_SETTLE_SECONDS = 5
    EMBEDDING_THROTTLE_SECONDS = 20

    def __init__(self):
        """
        Initializes the TranscriptProcessor with a predefined UUID to filename mapping and a PineconeServerless instance.
        """
        # Mapping UUIDs to their respective transcript file names
        self.uuid_to_filename_dict = {
            "52d105f8-1c80-4056-8253-732b9e2bec63": "office_relocation_1.csv",
            "9ed1fefa-db53-41fc-a21b-479b67e30073": "office_relocation_2.csv",
            "e993da88-0e17-4a35-ba9a-c03decca607b": "office_relocation_3.csv",
            "61d453f1-2852-48d9-a25a-b6e04c3c4908": "office_relocation_4.csv",
            "ba94585e-b0df-4633-bef2-a4f94f644c11": "Social_Media_-_Harmed_Teens.csv",
            "906c7694-0e33-4c8e-8f51-0365155fbb81": "Social_Media_-_Ruins_your_life.csv",
            "52d2dfe4-748b-4ecf-84fb-64be6ebcaeef": "ES2014a.Mix-Headset.csv",
            "1be8e439-45b3-4c97-9e4a-5c78c1a15e78": "ES2014b.Mix-Headset_1.csv",
            "a4b7b490-7b28-4744-85e5-d216f40ff52c": "ES2014b.Mix-Headset_2.csv",
            "b3821662-03f1-4349-8781-ba5f64439693": "ES2014c.Mix-Headset.csv",
            "95efa3c5-9770-4160-9f28-35350efb9f73": "Gitlab_Monthly_Release_Kickoff_1.csv",
            "85430eae-d466-4d63-9015-5835bbe71b90": "product_marketing_meeting.csv",
            "55d8afa8-a1bf-413c-a75c-b8c14da88d87": "Gitlab_Monthly_Release_Kickoff_2.csv",
            "15b7549d-4b3f-43b5-9507-85de435f1b4a": "2023-09-26_Architecture_Design_Workflow_New_Diffs_kickoff_call_1.csv",
            "875564dc-9954-41da-9084-ccf04ebffdb0": "2023-09-26_Architecture_Design_Workflow_New_Diffs_kickoff_call_2.csv",
            "72858a28-248d-4bef-af03-c62a3c285fbb": "2023-09-26_Architecture_Design_Workflow_New_Diffs_kickoff_call_3.csv",
            "4cbd0d4e-6cf9-4db4-bf15-f4f4e4d3d8d8": "2023-10-03-New_diffs_Architecture_Workflow.csv",
            "4badb5ba-ca92-4c3c-a7e9-0d49fc7a8137": "2023-10-10_New_diffs_architecture_workflow_weekly_EMEA_AMER_1.csv",
            "9c5aa3e4-b047-4f08-a838-9b665e251e4d": "2023-10-10_New_diffs_architecture_workflow_weekly_EMEA_AMER_2.csv",
            "d7c8e3b8-c6e0-4845-8669-f2f4ed1b8549": "2023-10-17_New_diffs_architecture_blueprint_1.csv",
            "876e67fa-314d-40e4-b942-21ca63e81995": "2023-10-17_New_diffs_architecture_blueprint_2.csv",
        }
        # Initializing a PineconeServerless instance for database operations
        self.pinecone = PineconeServerless()

    def load_json_config(self, json_file_path=".//config/config.json"):
        """
        Loads a JSON configuration file.

        Parameters:
            json_file_path (str): The path to the JSON configuration file.

        Returns:
            The data parsed from the JSON file (a dict for a JSON object).
        """
        with open(json_file_path, "r", encoding="utf-8") as file:
            return json.load(file)

    def pinecone_init_upsert(
        self, df_transcript, meeting_title, meeting_summary, meeting_uuid
    ):
        """
        Performs a best-effort upsert of one transcript to Pinecone.

        Calls self.pinecone.pinecone_upsert() with meeting_video_file=False and,
        when that call returns normally, sleeps for UPSERT_SETTLE_SECONDS.

        Parameters:
            df_transcript (DataFrame): The transcript data as a pandas DataFrame.
            meeting_title (str): The title of the meeting.
            meeting_summary (str): The summary of the meeting.
            meeting_uuid (str): The UUID of the meeting.

        Exceptions:
            Any Exception raised by the upsert call is caught and printed, so a
            single failing transcript does not abort the caller's loop.
        """
        try:
            self.pinecone.pinecone_upsert(
                df_transcript,
                meeting_uuid=meeting_uuid,
                meeting_video_file=False,
                meeting_title=meeting_title,
                meeting_summary=meeting_summary,
            )
        except Exception as e:
            print("Error upserting transcript to Pinecone: ", e)
        else:
            # Wait for a short period to ensure the upsert operation completes
            time.sleep(self.UPSERT_SETTLE_SECONDS)

    @staticmethod
    def _meeting_title(filename):
        """
        Derives the meeting title from a transcript file name.

        Only a trailing ".csv" is removed; any ".csv" appearing earlier in the
        name is kept.
        """
        suffix = ".csv"
        if filename.endswith(suffix):
            return filename[: -len(suffix)]
        return filename

    def process_transcripts(self, summary_file=None, transcript_folder=None):
        """
        Processes and upserts all transcripts to Pinecone based on the UUID to filename mapping and the summary data.

        The summary CSV must provide "uuid" and "text" columns. Rows whose UUID
        is not present in uuid_to_filename_dict are skipped. For every other
        row, the matching transcript CSV is read and handed to
        pinecone_init_upsert(), followed by a pause of
        EMBEDDING_THROTTLE_SECONDS.

        Parameters:
            summary_file (str | None): Path of the summary CSV. Defaults to DEFAULT_SUMMARY_FILE.
            transcript_folder (str | None): Folder holding the transcript CSVs. Defaults to DEFAULT_TRANSCRIPT_FOLDER.

        Exceptions:
            Errors raised while reading the summary or a transcript CSV are not
            caught here and propagate to the caller.
        """
        if summary_file is None:
            summary_file = self.DEFAULT_SUMMARY_FILE
        if transcript_folder is None:
            transcript_folder = self.DEFAULT_TRANSCRIPT_FOLDER

        df_summary = pd.read_csv(summary_file)
        # Creating a dictionary from the summaries DataFrame (uuid -> summary text)
        summary_by_uuid = df_summary.set_index("uuid")["text"].to_dict()

        for meeting_uuid, summary in summary_by_uuid.items():
            filename = self.uuid_to_filename_dict.get(meeting_uuid)
            if filename is None:
                # No transcript file is registered for this summary row.
                continue

            # os.path.join works whether or not the folder ends with a separator.
            df_transcript = pd.read_csv(os.path.join(transcript_folder, filename))
            self.pinecone_init_upsert(
                df_transcript, self._meeting_title(filename), summary, meeting_uuid
            )
            # To prevent OPEN AI embedding limit error
            time.sleep(self.EMBEDDING_THROTTLE_SECONDS)
if __name__ == "__main__":
    # Script entry point: build the processor and upsert every mapped transcript.
    TranscriptProcessor().process_transcripts()