# Uploads data to pinecone
# Runner: python init_one_time_utils/pinecone_sample_dataloader.py
# Average Run Time: 35-40 min

import json
import os
import sys
import time

import pandas as pd

# Put the project's root directory (the parent of this script's directory) on
# the Python path so that the `src` package can be imported below.
project_root = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
if project_root not in sys.path:
    sys.path.append(project_root)

# Project-local import; must come after the sys.path adjustment above.
from src.pinecone.resonate_pinecone_functions import PineconeServerless


class TranscriptProcessor:
    """
    Loads sample meeting transcripts and upserts them to Pinecone.

    Attributes:
        uuid_to_filename_dict (dict): Maps each meeting UUID to the name of
            its transcript CSV file.
        pinecone (PineconeServerless): Client object used for the upsert
            calls.
    """

    def __init__(self):
        """
        Sets up the fixed UUID -> transcript filename table and creates the
        PineconeServerless instance.
        """
        # Fixed table of sample meetings: meeting UUID -> transcript CSV name
        self.uuid_to_filename_dict = {
            "52d105f8-1c80-4056-8253-732b9e2bec63": "office_relocation_1.csv",
            "9ed1fefa-db53-41fc-a21b-479b67e30073": "office_relocation_2.csv",
            "e993da88-0e17-4a35-ba9a-c03decca607b": "office_relocation_3.csv",
            "61d453f1-2852-48d9-a25a-b6e04c3c4908": "office_relocation_4.csv",
            "ba94585e-b0df-4633-bef2-a4f94f644c11": "Social_Media_-_Harmed_Teens.csv",
            "906c7694-0e33-4c8e-8f51-0365155fbb81": "Social_Media_-_Ruins_your_life.csv",
            "52d2dfe4-748b-4ecf-84fb-64be6ebcaeef": "ES2014a.Mix-Headset.csv",
            "1be8e439-45b3-4c97-9e4a-5c78c1a15e78": "ES2014b.Mix-Headset_1.csv",
            "a4b7b490-7b28-4744-85e5-d216f40ff52c": "ES2014b.Mix-Headset_2.csv",
            "b3821662-03f1-4349-8781-ba5f64439693": "ES2014c.Mix-Headset.csv",
            "95efa3c5-9770-4160-9f28-35350efb9f73": "Gitlab_Monthly_Release_Kickoff_1.csv",
            "85430eae-d466-4d63-9015-5835bbe71b90": "product_marketing_meeting.csv",
            "55d8afa8-a1bf-413c-a75c-b8c14da88d87": "Gitlab_Monthly_Release_Kickoff_2.csv",
            "15b7549d-4b3f-43b5-9507-85de435f1b4a": "2023-09-26_Architecture_Design_Workflow_New_Diffs_kickoff_call_1.csv",
            "875564dc-9954-41da-9084-ccf04ebffdb0": "2023-09-26_Architecture_Design_Workflow_New_Diffs_kickoff_call_2.csv",
            "72858a28-248d-4bef-af03-c62a3c285fbb": "2023-09-26_Architecture_Design_Workflow_New_Diffs_kickoff_call_3.csv",
            "4cbd0d4e-6cf9-4db4-bf15-f4f4e4d3d8d8": "2023-10-03-New_diffs_Architecture_Workflow.csv",
            "4badb5ba-ca92-4c3c-a7e9-0d49fc7a8137": "2023-10-10_New_diffs_architecture_workflow_weekly_EMEA_AMER_1.csv",
            "9c5aa3e4-b047-4f08-a838-9b665e251e4d": "2023-10-10_New_diffs_architecture_workflow_weekly_EMEA_AMER_2.csv",
            "d7c8e3b8-c6e0-4845-8669-f2f4ed1b8549": "2023-10-17_New_diffs_architecture_blueprint_1.csv",
            "876e67fa-314d-40e4-b942-21ca63e81995": "2023-10-17_New_diffs_architecture_blueprint_2.csv",
        }

        # Client object through which all upserts are issued
        self.pinecone = PineconeServerless()

    def load_json_config(self, json_file_path=".//config/config.json"):
        """
        Reads a JSON configuration file and returns its parsed content.

        Parameters:
            json_file_path (str): Location of the JSON file; the default is
                relative to the current working directory.

        Returns:
            The object produced by parsing the file with json.load.
        """
        with open(json_file_path, "r", encoding="utf-8") as config_handle:
            return json.load(config_handle)

    def pinecone_init_upsert(
        self, df_transcript, meeting_title, meeting_summary, meeting_uuid
    ):
        """
        Upserts one meeting transcript to Pinecone, then pauses for 5 seconds.

        Parameters:
            df_transcript (DataFrame): The transcript rows for the meeting.
            meeting_title (str): Title to store with the meeting.
            meeting_summary (str): Summary to store with the meeting.
            meeting_uuid (str): UUID identifying the meeting.

        Exceptions:
            Any Exception raised inside the try block is caught and reported
            with print; nothing is re-raised to the caller.
        """
        upsert_kwargs = {
            "meeting_uuid": meeting_uuid,
            "meeting_video_file": False,
            "meeting_title": meeting_title,
            "meeting_summary": meeting_summary,
        }
        try:
            self.pinecone.pinecone_upsert(df_transcript, **upsert_kwargs)
            # Short pause after the call, intended to let the upsert complete
            time.sleep(5)
        except Exception as e:
            print("Error upserting transcript to Pinecone: ", e)

    def process_transcripts(self):
        """
        Upserts every sample transcript that has both a summary row and an
        entry in the UUID -> filename table.

        The summaries are read from a CSV with "uuid" and "text" columns.
        All data paths are relative to the current working directory.
        """
        summary_file = "./data/summaryFiles/abstract_summary_data.csv"
        transcript_folder = "./data/transcriptFiles/"

        # Build a uuid -> summary text lookup from the summaries CSV
        summaries_indexed = pd.read_csv(summary_file).set_index("uuid")
        summary_by_uuid = summaries_indexed["text"].to_dict()

        for meeting_uuid, summary in summary_by_uuid.items():
            filename = self.uuid_to_filename_dict.get(meeting_uuid)
            if filename is None:
                # No transcript file is registered for this summary; skip it
                continue

            df_transcript = pd.read_csv(f"{transcript_folder}{filename}")
            # The meeting title is the file name with ".csv" removed
            meeting_title = filename.replace(".csv", "")

            self.pinecone_init_upsert(
                df_transcript, meeting_title, summary, meeting_uuid
            )
            time.sleep(20)  # To prevent OPEN AI embedding limit error


if __name__ == "__main__":
    TranscriptProcessor().process_transcripts()