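"""Evaluate a prompt template on the shuffled-sentence dataset.

Loads the `{split}.jsonl` file next to this script, fills each sample into
the prompt template, queries the OpenAI API in concurrent batches, and
yields per-sample results so a Gradio UI can stream progress.
"""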
import os
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed

import gradio as gr
import jsonlines
from openai import OpenAI
from dotenv import load_dotenv

from evaluation_utils import evaluate_response


def get_split():
    """Map the SPLIT environment variable to a human-readable label."""
    load_dotenv()
    split = os.getenv("SPLIT")
    if split == "train":
        return "evaluation on development set"
    elif split == "test":
        return "evaluation on test set"
    raise ValueError(f"Unexpected SPLIT value: {split!r}")


def chunk_list(data, chunk_size):
    """Yield successive chunks of at most chunk_size items from data."""
    for i in range(0, len(data), chunk_size):
        yield data[i:i + chunk_size]


def send_request(client, prompt, index):
    """Send one chat completion request and return (index, response_text).

    The caller's index is passed through so that concurrent results can be
    matched back to their prompts regardless of completion order.
    """
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        temperature=0,  # deterministic decoding with a fixed seed for reproducible scores
        seed=42,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
        max_tokens=1024,
    )
    return index, response.choices[0].message.content


def evaluate_prompt(prompt: str, num_samples: int | None = None, split: str | None = None, batch_size: int = 5, progress=gr.Progress()):
    """Instantiate the prompt for each sample, query the model in batches, and yield per-sample results."""
    progress(0, desc="Starting...")
    load_dotenv()

    # Fall back to environment configuration when arguments are omitted.
    if num_samples is None:
        num_samples = int(os.getenv("NUM_SAMPLES"))
    if split is None:
        split = os.getenv("SPLIT")
    assert split in ["train", "test"]

    test_file_path = Path(__file__).parent / f"{split}.jsonl"

    test_data = []
    with jsonlines.open(test_file_path) as reader:
        for item in reader:
            test_data.append(item)

    # Filter out samples whose shuffled text contains apostrophes or "β",
    # then truncate to the requested sample count.
    test_data = [
        item for item in test_data
        if "'" not in item["shuffled_tokenized"] and "β" not in item["shuffled_tokenized"]
    ]
    test_data = test_data[:num_samples]

    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    # Size the result list by the filtered data, which may hold fewer than
    # num_samples items.
    responses = [None] * len(test_data)
    instantiated_prompts = []

    for batch_data in chunk_list(test_data, batch_size):
        batch_prompts = [
            prompt.replace("{% shuffled_sentence %}", test_item["shuffled_tokenized"])
            for test_item in batch_data
        ]
        instantiated_prompts.extend(batch_prompts)

        # Send the batch concurrently; each future is keyed to the global
        # index of its prompt so responses land in the right slot.
        batch_start = len(instantiated_prompts) - len(batch_prompts)
        with ThreadPoolExecutor() as executor:
            futures = {
                executor.submit(send_request, client, item_prompt, i): i
                for i, item_prompt in enumerate(batch_prompts, start=batch_start)
            }
            for future in as_completed(futures):
                # Take the index from the futures map so a failed request is
                # recorded in the correct slot; the unpacked index from
                # future.result() does not exist when the call raises.
                idx = futures[future]
                try:
                    _, response = future.result()
                    responses[idx] = response
                except Exception as e:
                    print(f"Request failed: {e}")
                    responses[idx] = "Error: Request failed"

        progress(len(instantiated_prompts) / len(test_data), desc="Processing batches...")

    # Score each response against the original sentence, streaming results
    # back as they are computed.
    scores = []
    for test_item, instantiated_prompt, response in zip(test_data, instantiated_prompts, responses):
        score = evaluate_response(test_item["original_tokenized"], response)
        scores.append(score)
        yield (test_item["original_sentence"], instantiated_prompt, response, score)
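

# A minimal usage sketch for running the evaluation outside Gradio
# (assumptions: OPENAI_API_KEY, NUM_SAMPLES, and SPLIT are set in .env, a
# matching {split}.jsonl sits next to this script, and evaluate_response
# returns a numeric score). The template below is hypothetical; any prompt
# containing the {% shuffled_sentence %} placeholder will do. A no-op
# callable stands in for gr.Progress(), which is only meaningful inside a
# Gradio event handler.
if __name__ == "__main__":
    demo_prompt = (
        "Rearrange the shuffled words into the original sentence:\n"
        "{% shuffled_sentence %}"
    )
    all_scores = []
    for original, instantiated, response, score in evaluate_prompt(
        demo_prompt, progress=lambda *args, **kwargs: None
    ):
        print(f"{score}\t{original}")
        all_scores.append(score)
    if all_scores:
        print(f"Mean score: {sum(all_scores) / len(all_scores):.3f}")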