Spaces:

sartajbhuvaji
/

resonate

Sleeping

App Files Files Community

resonate / init_one_time_utils /pinecone_sample_dataloader.py

sartajbhuvaji

Added files from branch huggingface

5f773d1 verified over 1 year ago

raw

history blame contribute delete

6.05 kB

	# Uploads data to pinecone
	# Runner: python init_one_time_utils/pinecone_sample_dataloader.py
	# Average Run Time: 35-40 min
	import json
	import time
	import pandas as pd
	import sys
	import os

	# Make the repository root importable so the `src.` package import below
	# resolves when this file is run directly as a script from any location.
	# The root is taken to be the parent of the directory containing this file.
	project_root = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
	if project_root not in sys.path:
	    sys.path.append(project_root)

	# Importing the PineconeServerless class from the project's module
	from src.pinecone.resonate_pinecone_functions import PineconeServerless


	class TranscriptProcessor:
	    """
	    A class to process and upsert the sample meeting transcripts to Pinecone.

	    Attributes:
	        uuid_to_filename_dict (dict): A mapping from meeting UUIDs to their
	            respective transcript file names.
	        pinecone (PineconeServerless): An instance of the PineconeServerless
	            class used for the upsert calls.
	    """

	    def __init__(self):
	        """
	        Initializes the TranscriptProcessor with a predefined UUID to filename
	        mapping and a PineconeServerless instance.
	        """
	        # Mapping UUIDs to their respective transcript file names
	        self.uuid_to_filename_dict = {
	            "52d105f8-1c80-4056-8253-732b9e2bec63": "office_relocation_1.csv",
	            "9ed1fefa-db53-41fc-a21b-479b67e30073": "office_relocation_2.csv",
	            "e993da88-0e17-4a35-ba9a-c03decca607b": "office_relocation_3.csv",
	            "61d453f1-2852-48d9-a25a-b6e04c3c4908": "office_relocation_4.csv",
	            "ba94585e-b0df-4633-bef2-a4f94f644c11": "Social_Media_-_Harmed_Teens.csv",
	            "906c7694-0e33-4c8e-8f51-0365155fbb81": "Social_Media_-_Ruins_your_life.csv",
	            "52d2dfe4-748b-4ecf-84fb-64be6ebcaeef": "ES2014a.Mix-Headset.csv",
	            "1be8e439-45b3-4c97-9e4a-5c78c1a15e78": "ES2014b.Mix-Headset_1.csv",
	            "a4b7b490-7b28-4744-85e5-d216f40ff52c": "ES2014b.Mix-Headset_2.csv",
	            "b3821662-03f1-4349-8781-ba5f64439693": "ES2014c.Mix-Headset.csv",
	            "95efa3c5-9770-4160-9f28-35350efb9f73": "Gitlab_Monthly_Release_Kickoff_1.csv",
	            "85430eae-d466-4d63-9015-5835bbe71b90": "product_marketing_meeting.csv",
	            "55d8afa8-a1bf-413c-a75c-b8c14da88d87": "Gitlab_Monthly_Release_Kickoff_2.csv",
	            "15b7549d-4b3f-43b5-9507-85de435f1b4a": "2023-09-26_Architecture_Design_Workflow_New_Diffs_kickoff_call_1.csv",
	            "875564dc-9954-41da-9084-ccf04ebffdb0": "2023-09-26_Architecture_Design_Workflow_New_Diffs_kickoff_call_2.csv",
	            "72858a28-248d-4bef-af03-c62a3c285fbb": "2023-09-26_Architecture_Design_Workflow_New_Diffs_kickoff_call_3.csv",
	            "4cbd0d4e-6cf9-4db4-bf15-f4f4e4d3d8d8": "2023-10-03-New_diffs_Architecture_Workflow.csv",
	            "4badb5ba-ca92-4c3c-a7e9-0d49fc7a8137": "2023-10-10_New_diffs_architecture_workflow_weekly_EMEA_AMER_1.csv",
	            "9c5aa3e4-b047-4f08-a838-9b665e251e4d": "2023-10-10_New_diffs_architecture_workflow_weekly_EMEA_AMER_2.csv",
	            "d7c8e3b8-c6e0-4845-8669-f2f4ed1b8549": "2023-10-17_New_diffs_architecture_blueprint_1.csv",
	            "876e67fa-314d-40e4-b942-21ca63e81995": "2023-10-17_New_diffs_architecture_blueprint_2.csv",
	        }
	        # Initializing a PineconeServerless instance for database operations
	        self.pinecone = PineconeServerless()

	    @staticmethod
	    def _title_from_filename(filename):
	        """
	        Derives a meeting title by dropping a trailing ".csv" from a file name.

	        Only the final extension is removed; a ".csv" appearing earlier in the
	        name is kept, and names without the extension are returned unchanged.

	        Parameters:
	            filename (str): The transcript file name.

	        Returns:
	            str: The file name without its trailing ".csv".
	        """
	        suffix = ".csv"
	        if filename.endswith(suffix):
	            return filename[: -len(suffix)]
	        return filename

	    def load_json_config(self, json_file_path=".//config/config.json"):
	        """
	        Loads a JSON configuration file.

	        Parameters:
	            json_file_path (str): The path to the JSON configuration file.

	        Returns:
	            The data parsed from the JSON file (a dict for an object document).
	        """
	        with open(json_file_path, "r", encoding="utf-8") as file:
	            return json.load(file)

	    def pinecone_init_upsert(
	        self,
	        df_transcript,
	        meeting_title,
	        meeting_summary,
	        meeting_uuid,
	        settle_seconds=5,
	    ):
	        """
	        Performs an upsert operation to Pinecone with transcript data.

	        Parameters:
	            df_transcript (DataFrame): The transcript data as a pandas DataFrame.
	            meeting_title (str): The title of the meeting.
	            meeting_summary (str): The summary of the meeting.
	            meeting_uuid (str): The UUID of the meeting.
	            settle_seconds (float): Seconds to pause after a successful upsert.
	                Defaults to 5; pass 0 to skip the pause.

	        Exceptions:
	            Any Exception raised by the upsert call is caught and printed, so
	            one failing transcript does not abort the caller's batch.
	        """
	        try:
	            self.pinecone.pinecone_upsert(
	                df_transcript,
	                meeting_uuid=meeting_uuid,
	                meeting_video_file=False,
	                meeting_title=meeting_title,
	                meeting_summary=meeting_summary,
	            )
	        except Exception as e:
	            print("Error upserting transcript to Pinecone: ", e)
	        else:
	            # Per the original author's note, a short wait gives the upsert
	            # time to complete before the next operation.
	            if settle_seconds > 0:
	                time.sleep(settle_seconds)

	    def process_transcripts(
	        self,
	        summary_file="./data/summaryFiles/abstract_summary_data.csv",
	        transcript_folder="./data/transcriptFiles/",
	        throttle_seconds=20,
	        settle_seconds=5,
	    ):
	        """
	        Processes and upserts all transcripts to Pinecone based on the UUID to
	        filename mapping and the summary data.

	        The summary CSV is read for its "uuid" and "text" columns. Each UUID
	        that also appears in ``uuid_to_filename_dict`` has its transcript CSV
	        loaded and upserted; UUIDs without a mapping entry are skipped.

	        Parameters:
	            summary_file (str): Path to the CSV holding one summary per meeting.
	            transcript_folder (str): Directory containing the transcript CSVs.
	            throttle_seconds (float): Pause between consecutive meetings. The
	                original author's note says this avoids the OpenAI embedding
	                limit error. No pause follows the last meeting.
	            settle_seconds (float): Forwarded to ``pinecone_init_upsert``.
	        """
	        df_summary = pd.read_csv(summary_file)
	        # Creating a dictionary of summary text keyed by meeting UUID
	        summaries_by_uuid = df_summary.set_index("uuid")["text"].to_dict()

	        # Keep only the meetings a transcript file name is known for
	        pending = [
	            (meeting_uuid, summary)
	            for meeting_uuid, summary in summaries_by_uuid.items()
	            if meeting_uuid in self.uuid_to_filename_dict
	        ]

	        last_index = len(pending) - 1
	        for index, (meeting_uuid, summary) in enumerate(pending):
	            filename = self.uuid_to_filename_dict[meeting_uuid]
	            df_transcript = pd.read_csv(os.path.join(transcript_folder, filename))

	            self.pinecone_init_upsert(
	                df_transcript,
	                self._title_from_filename(filename),
	                summary,
	                meeting_uuid,
	                settle_seconds=settle_seconds,
	            )

	            # Throttle only when another meeting follows
	            if index < last_index and throttle_seconds > 0:
	                time.sleep(throttle_seconds)


	def main():
	    """Builds a TranscriptProcessor and uploads every mapped sample transcript."""
	    processor = TranscriptProcessor()
	    processor.process_transcripts()


	# Run the upload only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()