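"""Evaluate a prompt template on the shuffled-sentence dataset.

Loads the `{split}.jsonl` file next to this script, fills each sample into
the prompt template, queries the OpenAI API in concurrent batches, and
yields per-sample results so a Gradio UI can stream progress.
"""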
import os
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed

import gradio as gr
import jsonlines
from openai import OpenAI
from dotenv import load_dotenv

from evaluation_utils import evaluate_response


def get_split():
    """Map the SPLIT environment variable to a human-readable label."""
    load_dotenv()
    split = os.getenv("SPLIT")
    if split == "train":
        return "evaluation on development set"
    elif split == "test":
        return "evaluation on test set"
    raise ValueError(f"Unexpected SPLIT value: {split!r}")


def chunk_list(data, chunk_size):
    """Yield successive chunks of at most chunk_size items from data."""
    for i in range(0, len(data), chunk_size):
        yield data[i:i + chunk_size]


def send_request(client, prompt, index):
    """Send one chat completion request and return (index, response_text).

    The caller's index is passed through so that concurrent results can be
    matched back to their prompts regardless of completion order.
    """
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        temperature=0,  # deterministic decoding with a fixed seed for reproducible scores
        seed=42,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
        max_tokens=1024,
    )
    return index, response.choices[0].message.content


def evaluate_prompt(prompt: str, num_samples: int | None = None, split: str | None = None, batch_size: int = 5, progress=gr.Progress()):
    """Instantiate the prompt for each sample, query the model in batches, and yield per-sample results."""
    progress(0, desc="Starting...")
    load_dotenv()

    # Fall back to environment configuration when arguments are omitted.
    if num_samples is None:
        num_samples = int(os.getenv("NUM_SAMPLES"))
    if split is None:
        split = os.getenv("SPLIT")
    assert split in ["train", "test"]

    test_file_path = Path(__file__).parent / f"{split}.jsonl"

    test_data = []
    with jsonlines.open(test_file_path) as reader:
        for item in reader:
            test_data.append(item)

    # Filter out samples whose shuffled text contains apostrophes or "β",
    # then truncate to the requested sample count.
    test_data = [
        item for item in test_data
        if "'" not in item["shuffled_tokenized"] and "β" not in item["shuffled_tokenized"]
    ]
    test_data = test_data[:num_samples]

    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    # Size the result list by the filtered data, which may hold fewer than
    # num_samples items.
    responses = [None] * len(test_data)
    instantiated_prompts = []

    for batch_data in chunk_list(test_data, batch_size):
        batch_prompts = [
            prompt.replace("{% shuffled_sentence %}", test_item["shuffled_tokenized"])
            for test_item in batch_data
        ]
        instantiated_prompts.extend(batch_prompts)

        # Send the batch concurrently; each future is keyed to the global
        # index of its prompt so responses land in the right slot.
        batch_start = len(instantiated_prompts) - len(batch_prompts)
        with ThreadPoolExecutor() as executor:
            futures = {
                executor.submit(send_request, client, item_prompt, i): i
                for i, item_prompt in enumerate(batch_prompts, start=batch_start)
            }
            for future in as_completed(futures):
                # Take the index from the futures map so a failed request is
                # recorded in the correct slot; the unpacked index from
                # future.result() does not exist when the call raises.
                idx = futures[future]
                try:
                    _, response = future.result()
                    responses[idx] = response
                except Exception as e:
                    print(f"Request failed: {e}")
                    responses[idx] = "Error: Request failed"

        progress(len(instantiated_prompts) / len(test_data), desc="Processing batches...")

    # Score each response against the original sentence, streaming results
    # back as they are computed.
    scores = []
    for test_item, instantiated_prompt, response in zip(test_data, instantiated_prompts, responses):
        score = evaluate_response(test_item["original_tokenized"], response)
        scores.append(score)
        yield (test_item["original_sentence"], instantiated_prompt, response, score)
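

# A minimal usage sketch for running the evaluation outside Gradio
# (assumptions: OPENAI_API_KEY, NUM_SAMPLES, and SPLIT are set in .env, a
# matching {split}.jsonl sits next to this script, and evaluate_response
# returns a numeric score). The template below is hypothetical; any prompt
# containing the {% shuffled_sentence %} placeholder will do. A no-op
# callable stands in for gr.Progress(), which is only meaningful inside a
# Gradio event handler.
if __name__ == "__main__":
    demo_prompt = (
        "Rearrange the shuffled words into the original sentence:\n"
        "{% shuffled_sentence %}"
    )
    all_scores = []
    for original, instantiated, response, score in evaluate_prompt(
        demo_prompt, progress=lambda *args, **kwargs: None
    ):
        print(f"{score}\t{original}")
        all_scores.append(score)
    if all_scores:
        print(f"Mean score: {sum(all_scores) / len(all_scores):.3f}")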