jvamvas committed on
Commit
a2e759c
1 Parent(s): 4a63dbd

Initial commit

Files changed (6)
  1. app.py +82 -0
  2. app_utils.py +99 -0
  3. evaluation_utils.py +69 -0
  4. requirements.txt +5 -0
  5. test.jsonl +0 -0
  6. train.jsonl +0 -0
app.py ADDED
@@ -0,0 +1,82 @@
+ import gradio as gr
+
+ from app_utils import evaluate_prompt, get_split
+
+ import logging
+
+ logging.basicConfig(level=logging.INFO)
+
+
+ with gr.Blocks(title=f"Prompting Challenge ({get_split()})") as demo:
+     gr.Markdown(
+         f"""
+ # Prompting Challenge
+ ### ({get_split()})
+ """ + """
+ The goal of this challenge is to prompt GPT-4 to "unscramble" a sentence.
+
+ The input is a sentence with scrambled word order, e.g.: *"are How ? you"*
+
+ GPT-4 should identify the original sentence, e.g.: *"How are you?"*
+
+ Enter your prompt template here. Use `{% shuffled_sentence %}` at the place where you want the shuffled sentence to be inserted.
+ """
+     )
+
+     input_text = gr.Textbox(
+         lines=10,
+         label="Input Text",
+         value="Unscramble the following sentence: {% shuffled_sentence %}"
+     )
+     submit_button = gr.Button("Submit")
+     results_output = gr.HTML(label="Results")
+
+     def update_results(prompt):
+         result_tuples = list(evaluate_prompt(prompt))
+         if result_tuples:
+             total_score = sum(item_score for _, _, _, item_score in result_tuples)
+             score = total_score / len(result_tuples)
+         else:
+             score = 0
+         html_output = "<dl style='font-family: Arial, sans-serif;'>"
+         html_output += f"<h2 style='color: #333; margin-top: 20px; margin-bottom: 20px;'>Accuracy: {100 * score:.1f}%</h2>"
+         newline = '\n'
+         for index, (original, prompt, response, item_score) in enumerate(result_tuples, 1):
+             background_color = "#fff4ea" if item_score < 0.5 else "#e4ffe4" if item_score > 0.9 else "whitesmoke"
+             html_output += f"""
+             <div style='background-color: {background_color}; padding: 10px; margin-bottom: 20px;'>
+                 <h3 style='color: #333; margin-top: 0;'>Test item #{index}</h3>
+                 <dt style='padding: 5px;'>
+                     <span style='font-weight: 600;'>Original Sentence:</span>
+                 </dt>
+                 <dd style='margin-left: 20px; padding: 5px;'>{original.replace(newline, "<br>")}</dd>
+
+                 <dt style='padding: 5px;'>
+                     <span style='font-weight: 600;'>Prompt:</span>
+                 </dt>
+                 <dd style='margin-left: 20px; padding: 5px;'>{prompt.replace(newline, "<br>")}</dd>
+
+                 <dt style='padding: 5px;'>
+                     <span style='font-weight: 600;'>Response by GPT-4:</span>
+                 </dt>
+                 <dd style='margin-left: 20px; padding: 5px; font-style: italic;'>{response.replace(newline, "<br>")}</dd>
+                 <dt style='padding: 5px;'>
+                     <span style='font-weight: 600;'>Score:</span>
+                 </dt>
+                 <dd style='margin-left: 20px; padding: 5px;'>
+                     <span style='color: #333;'>{100 * item_score:.1f}%</span>
+                 </dd>
+             </div>
+             """
+         html_output += "</dl>"
+         return html_output
+
+     submit_button.click(
+         fn=update_results,
+         inputs=[input_text],
+         outputs=[results_output]
+     )
+
+ if __name__ == "__main__":
+     demo.launch()
+     # demo.launch(share=True)
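
Note on the template mechanics: the prompt entered in the textbox is instantiated in app_utils.py by a plain string replacement, so the placeholder must appear verbatim. A minimal sketch (the variable names are illustrative, not part of the app):

    template = "Unscramble the following sentence: {% shuffled_sentence %}"
    shuffled = "are How ? you"
    prompt = template.replace("{% shuffled_sentence %}", shuffled)
    # -> "Unscramble the following sentence: are How ? you"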
app_utils.py ADDED
@@ -0,0 +1,99 @@
+ import os
+ from pathlib import Path
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+
+ import gradio as gr
+ import jsonlines
+ from openai import OpenAI
+ from dotenv import load_dotenv
+ from evaluation_utils import evaluate_response
+
+
+ def get_split():
+     load_dotenv()
+     split = os.getenv("SPLIT")
+     if split == "train":
+         return "evaluation on development set"
+     elif split == "test":
+         return "evaluation on test set"
+
+
+ # Utility function to chunk a list into batches
+ def chunk_list(data, chunk_size):
+     for i in range(0, len(data), chunk_size):
+         yield data[i:i + chunk_size]
+
+
+ # Function to send an individual request to the OpenAI API
+ def send_request(client, prompt, index):
+     response = client.chat.completions.create(
+         model="gpt-4o-mini",
+         temperature=0,
+         seed=42,
+         messages=[
+             {"role": "system", "content": "You are a helpful assistant."},
+             {"role": "user", "content": prompt},
+         ],
+         max_tokens=1024,
+     )
+     return index, response.choices[0].message.content
+
+
+ def evaluate_prompt(prompt: str, num_samples: int = None, split: str = None, batch_size: int = 5, progress=gr.Progress()):
+     progress(0, desc="Starting...")
+     load_dotenv()
+
+     if num_samples is None:
+         num_samples = int(os.getenv("NUM_SAMPLES"))
+
+     if split is None:
+         split = os.getenv("SPLIT")
+     assert split in ["train", "test"]
+
+     # Define the path to the {split}.jsonl data file
+     test_file_path = Path(__file__).parent / f"{split}.jsonl"
+
+     # Load the data from the jsonl file
+     test_data = []
+     with jsonlines.open(test_file_path) as reader:
+         for item in reader:
+             test_data.append(item)
+
+     # Skip items whose shuffled sentence contains an apostrophe
+     test_data = [item for item in test_data if "'" not in item["shuffled_tokenized"] and "’" not in item["shuffled_tokenized"]]
+
+     # Limit to first num_samples items for faster evaluation
+     test_data = test_data[:num_samples]
+
+     client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
+
+     responses = [None] * len(test_data)  # Pre-allocate a list to store responses in order
+     instantiated_prompts = []
+
+     # Create and process batches
+     for batch_data in chunk_list(test_data, batch_size):
+         # Prepare the prompts for this batch
+         batch_prompts = [
+             prompt.replace("{% shuffled_sentence %}", test_item["shuffled_tokenized"])
+             for test_item in batch_data
+         ]
+         instantiated_prompts.extend(batch_prompts)
+
+         # Send requests in parallel using ThreadPoolExecutor
+         with ThreadPoolExecutor() as executor:
+             futures = {
+                 executor.submit(send_request, client, item_prompt, i): i
+                 for i, item_prompt in enumerate(batch_prompts, start=len(instantiated_prompts) - len(batch_prompts))
+             }
+
+             for future in as_completed(futures):
+                 index = futures[future]
+                 try:
+                     index, response = future.result()
+                     responses[index] = response  # Store the response at the correct index
+                 except Exception as e:
+                     print(f"Request failed: {e}")
+                     responses[index] = "Error: Request failed"
+
+         # Update progress after each batch
+         progress(len(instantiated_prompts) / len(test_data), desc="Processing batches...")
+
+     # Evaluate responses
+     scores = []
+     for test_item, instantiated_prompt, response in zip(test_data, instantiated_prompts, responses):
+         score = evaluate_response(test_item["original_tokenized"], response)
+         scores.append(score)
+         yield (test_item["original_sentence"], instantiated_prompt, response, score)
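
evaluate_prompt is a generator that yields (original_sentence, instantiated_prompt, response, score) tuples. It expects OPENAI_API_KEY (and, when the defaults are used, SPLIT and NUM_SAMPLES) in the environment or a .env file. A minimal usage sketch outside of Gradio, assuming those variables are configured; the no-op progress callback is an assumption used here only to bypass gr.Progress:

    from app_utils import evaluate_prompt

    template = "Unscramble the following sentence: {% shuffled_sentence %}"
    # Evaluate the template on the first three items of train.jsonl
    for original, prompt, response, score in evaluate_prompt(
        template, num_samples=3, split="train", progress=lambda *args, **kwargs: None
    ):
        print(f"{score:.2f}  {original!r} -> {response!r}")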
evaluation_utils.py ADDED
@@ -0,0 +1,69 @@
+ from fast_sentence_tokenize import tokenize_text
+
+
+ def evaluate_response(original_tokenized: str, response: str) -> float:
+     """
+     - Tokenize the response string
+     - Create a list of response tokens
+     - Assign every original token a rank:
+       - Only look at the last mention of a token in the response
+       - Rank the tokens by how early they appear in the response (last mention only)
+     - Calculate ranking accuracy (a normalized Kendall's tau)
+
+     Returns a value between 0 and 1
+     """
+     original_tokenized = original_tokenized.strip().lower()
+     response = response.strip().lower()
+
+     # Tokenize the response string
+     response_tokens = tokenize_text(response)
+
+     # Create a list of original tokens
+     original_tokens = original_tokenized.split()
+
+     # If any original token is missing from the response, return 0 immediately
+     for token in original_tokens:
+         if token not in response_tokens:
+             return 0
+
+     # Create ranks for original tokens
+     original_token_ranks = {}
+     for i, token in enumerate(original_tokens):
+         original_token_ranks[token] = i
+
+     # Create ranks for response tokens (index of the last occurrence of each token in the response)
+     response_token_ranks = {}
+     for token in original_tokens:
+         response_token_ranks[token] = len(response_tokens) - 1 - response_tokens[::-1].index(token)
+
+     # Normalize the response token ranks
+     sorted_ranks = sorted(set(response_token_ranks.values()))
+     rank_mapping = {old_rank: new_rank for new_rank, old_rank in enumerate(sorted_ranks)}
+     for token, rank in response_token_ranks.items():
+         response_token_ranks[token] = rank_mapping[rank]
+
+     # Calculate Kendall's tau
+     n = len(original_tokens)
+     concordant_pairs = 0
+     discordant_pairs = 0
+
+     for i in range(n):
+         for j in range(i + 1, n):
+             original_diff = original_token_ranks[original_tokens[i]] - original_token_ranks[original_tokens[j]]
+             response_diff = response_token_ranks[original_tokens[i]] - response_token_ranks[original_tokens[j]]
+
+             if original_diff * response_diff > 0:
+                 concordant_pairs += 1
+             elif original_diff * response_diff < 0:
+                 discordant_pairs += 1
+
+     total_pairs = n * (n - 1) // 2
+     kendall_tau = (concordant_pairs - discordant_pairs) / total_pairs
+
+     # Normalize Kendall's tau to be between 0 and 1
+     normalized_kendall_tau = (kendall_tau + 1) / 2
+
+     return normalized_kendall_tau
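
To get a feel for the metric: identical word order scores 1.0, a partially reordered response lands in between, and any missing original token drops the score to 0. A small sketch; the exact intermediate values depend on how fast_sentence_tokenize splits the response:

    from evaluation_utils import evaluate_response

    original = "how are you ?"
    print(evaluate_response(original, "how are you ?"))  # expected 1.0: identical order
    print(evaluate_response(original, "you are how ?"))  # roughly 0.5: order partly reversed
    print(evaluate_response(original, "how are"))        # 0: "you" is missing from the response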
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ spacy
+ jsonlines
+ openai
+ python-dotenv
+ fast-sentence-tokenize
test.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
train.jsonl ADDED
The diff for this file is too large to render. See raw diff
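
The two .jsonl files are too large to render here, but app_utils.py reads three fields from each record: original_sentence, original_tokenized and shuffled_tokenized. An illustrative, made-up record with that shape (values are invented, not taken from the dataset):

    {"original_sentence": "How are you?", "original_tokenized": "How are you ?", "shuffled_tokenized": "are How ? you"}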