jvamvas committed on
Commit
a2e759c
1 Parent(s): 4a63dbd

Initial commit

Files changed (6)
  1. app.py +82 -0
  2. app_utils.py +99 -0
  3. evaluation_utils.py +69 -0
  4. requirements.txt +5 -0
  5. test.jsonl +0 -0
  6. train.jsonl +0 -0
app.py ADDED
@@ -0,0 +1,82 @@
+ import gradio as gr
+
+ from app_utils import evaluate_prompt, get_split
+
+ import logging
+
+ logging.basicConfig(level=logging.INFO)
+
+
+ with gr.Blocks(title=f"Prompting Challenge ({get_split()})") as demo:
+     gr.Markdown(
+         f"""
+ # Prompting Challenge
+ ### ({get_split()})
+ """ + """
+ The goal of this challenge is to prompt GPT-4 to "unscramble" a sentence.
+
+ The input is a sentence with scrambled word order, e.g.: *"are How ? you"*
+
+ GPT-4 should identify the original sentence, e.g.: *"How are you?"*
+
+ Enter your prompt template here. Use `{% shuffled_sentence %}` at the place where you want the shuffled sentence to be inserted.
+ """
+     )
+
+     input_text = gr.Textbox(
+         lines=10,
+         label="Input Text",
+         value="Unscramble the following sentence: {% shuffled_sentence %}"
+     )
+     submit_button = gr.Button("Submit")
+     results_output = gr.HTML(label="Results")
+
+     def update_results(prompt):
+         result_tuples = list(evaluate_prompt(prompt))
+         if result_tuples:
+             total_score = sum(item_score for _, _, _, item_score in result_tuples)
+             score = total_score / len(result_tuples)
+         else:
+             score = 0
+         html_output = "<dl style='font-family: Arial, sans-serif;'>"
+         html_output += f"<h2 style='color: #333; margin-top: 20px; margin-bottom: 20px;'>Accuracy: {100 * score:.1f}%</h2>"
+         newline = '\n'
+         for index, (original, prompt, response, item_score) in enumerate(result_tuples, 1):
+             background_color = "#fff4ea" if item_score < 0.5 else "#e4ffe4" if item_score > 0.9 else "whitesmoke"
+             html_output += f"""
+             <div style='background-color: {background_color}; padding: 10px; margin-bottom: 20px;'>
+                 <h3 style='color: #333; margin-top: 0;'>Test item #{index}</h3>
+                 <dt style='padding: 5px;'>
+                     <span style='font-weight: 600;'>Original Sentence:</span>
+                 </dt>
+                 <dd style='margin-left: 20px; padding: 5px;'>{original.replace(newline, "<br>")}</dd>
+
+                 <dt style='padding: 5px;'>
+                     <span style='font-weight: 600;'>Prompt:</span>
+                 </dt>
+                 <dd style='margin-left: 20px; padding: 5px;'>{prompt.replace(newline, "<br>")}</dd>
+
+                 <dt style='padding: 5px;'>
+                     <span style='font-weight: 600;'>Response by GPT-4:</span>
+                 </dt>
+                 <dd style='margin-left: 20px; padding: 5px; font-style: italic;'>{response.replace(newline, "<br>")}</dd>
+                 <dt style='padding: 5px;'>
+                     <span style='font-weight: 600;'>Score:</span>
+                 </dt>
+                 <dd style='margin-left: 20px; padding: 5px;'>
+                     <span style='color: #333;'>{100 * item_score:.1f}%</span>
+                 </dd>
+             </div>
+             """
+         html_output += "</dl>"
+         return html_output
+
+     submit_button.click(
+         fn=update_results,
+         inputs=[input_text],
+         outputs=[results_output]
+     )
+
+ if __name__ == "__main__":
+     demo.launch()
+     # demo.launch(share=True)
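
Note on the template mechanics: the prompt entered in the textbox is instantiated in app_utils.py by a plain string replacement, so the placeholder must appear verbatim. A minimal sketch (the variable names are illustrative, not part of the app):

    template = "Unscramble the following sentence: {% shuffled_sentence %}"
    shuffled = "are How ? you"
    prompt = template.replace("{% shuffled_sentence %}", shuffled)
    # -> "Unscramble the following sentence: are How ? you"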
app_utils.py ADDED
@@ -0,0 +1,99 @@
+ import os
+ from pathlib import Path
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+
+ import gradio as gr
+ import jsonlines
+ from openai import OpenAI
+ from dotenv import load_dotenv
+ from evaluation_utils import evaluate_response
+
+
+ def get_split():
+     load_dotenv()
+     split = os.getenv("SPLIT")
+     if split == "train":
+         return "evaluation on development set"
+     elif split == "test":
+         return "evaluation on test set"
+
+
+ # Utility function to chunk a list into batches
+ def chunk_list(data, chunk_size):
+     for i in range(0, len(data), chunk_size):
+         yield data[i:i + chunk_size]
+
+
+ # Function to send an individual request to the OpenAI API
+ def send_request(client, prompt, index):
+     response = client.chat.completions.create(
+         model="gpt-4o-mini",
+         temperature=0,
+         seed=42,
+         messages=[
+             {"role": "system", "content": "You are a helpful assistant."},
+             {"role": "user", "content": prompt},
+         ],
+         max_tokens=1024,
+     )
+     return index, response.choices[0].message.content
+
+
+ def evaluate_prompt(prompt: str, num_samples: int = None, split: str = None, batch_size: int = 5, progress=gr.Progress()):
+     progress(0, desc="Starting...")
+     load_dotenv()
+
+     if num_samples is None:
+         num_samples = int(os.getenv("NUM_SAMPLES"))
+
+     if split is None:
+         split = os.getenv("SPLIT")
+     assert split in ["train", "test"]
+
+     # Define the path to the {split}.jsonl data file
+     test_file_path = Path(__file__).parent / f"{split}.jsonl"
+
+     # Load the data from the jsonl file
+     test_data = []
+     with jsonlines.open(test_file_path) as reader:
+         for item in reader:
+             test_data.append(item)
+
+     # Skip items whose shuffled sentence contains an apostrophe
+     test_data = [item for item in test_data if "'" not in item["shuffled_tokenized"] and "’" not in item["shuffled_tokenized"]]
+
+     # Limit to first num_samples items for faster evaluation
+     test_data = test_data[:num_samples]
+
+     client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
+
+     responses = [None] * len(test_data)  # Pre-allocate a list to store responses in order
+     instantiated_prompts = []
+
+     # Create and process batches
+     for batch_data in chunk_list(test_data, batch_size):
+         # Prepare the prompts for this batch
+         batch_prompts = [
+             prompt.replace("{% shuffled_sentence %}", test_item["shuffled_tokenized"])
+             for test_item in batch_data
+         ]
+         instantiated_prompts.extend(batch_prompts)
+
+         # Send requests in parallel using ThreadPoolExecutor
+         with ThreadPoolExecutor() as executor:
+             futures = {
+                 executor.submit(send_request, client, item_prompt, i): i
+                 for i, item_prompt in enumerate(batch_prompts, start=len(instantiated_prompts) - len(batch_prompts))
+             }
+
+             for future in as_completed(futures):
+                 index = futures[future]
+                 try:
+                     index, response = future.result()
+                     responses[index] = response  # Store the response at the correct index
+                 except Exception as e:
+                     print(f"Request failed: {e}")
+                     responses[index] = "Error: Request failed"
+
+         # Update progress after each batch
+         progress(len(instantiated_prompts) / len(test_data), desc="Processing batches...")
+
+     # Evaluate responses
+     scores = []
+     for test_item, instantiated_prompt, response in zip(test_data, instantiated_prompts, responses):
+         score = evaluate_response(test_item["original_tokenized"], response)
+         scores.append(score)
+         yield (test_item["original_sentence"], instantiated_prompt, response, score)
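
evaluate_prompt is a generator that yields (original_sentence, instantiated_prompt, response, score) tuples. It expects OPENAI_API_KEY (and, when the defaults are used, SPLIT and NUM_SAMPLES) in the environment or a .env file. A minimal usage sketch outside of Gradio, assuming those variables are configured; the no-op progress callback is an assumption used here only to bypass gr.Progress:

    from app_utils import evaluate_prompt

    template = "Unscramble the following sentence: {% shuffled_sentence %}"
    # Evaluate the template on the first three items of train.jsonl
    for original, prompt, response, score in evaluate_prompt(
        template, num_samples=3, split="train", progress=lambda *args, **kwargs: None
    ):
        print(f"{score:.2f}  {original!r} -> {response!r}")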
evaluation_utils.py ADDED
@@ -0,0 +1,69 @@
+ from fast_sentence_tokenize import tokenize_text
+
+
+ def evaluate_response(original_tokenized: str, response: str) -> float:
+     """
+     - Tokenize the response string
+     - Create a list of response tokens
+     - Assign every original token a rank:
+       - Only look at the last mention of a token in the response
+       - Rank the tokens by how early they appear in the response (last mention only)
+     - Calculate ranking accuracy (a normalized Kendall's tau)
+
+     Returns a value between 0 and 1
+     """
+     original_tokenized = original_tokenized.strip().lower()
+     response = response.strip().lower()
+
+     # Tokenize the response string
+     response_tokens = tokenize_text(response)
+
+     # Create a list of original tokens
+     original_tokens = original_tokenized.split()
+
+     # If any original token is missing from the response, return 0 immediately
+     for token in original_tokens:
+         if token not in response_tokens:
+             return 0
+
+     # Create ranks for original tokens
+     original_token_ranks = {}
+     for i, token in enumerate(original_tokens):
+         original_token_ranks[token] = i
+
+     # Create ranks for response tokens (index of the last occurrence of each token in the response)
+     response_token_ranks = {}
+     for token in original_tokens:
+         response_token_ranks[token] = len(response_tokens) - 1 - response_tokens[::-1].index(token)
+
+     # Normalize the response token ranks
+     sorted_ranks = sorted(set(response_token_ranks.values()))
+     rank_mapping = {old_rank: new_rank for new_rank, old_rank in enumerate(sorted_ranks)}
+     for token, rank in response_token_ranks.items():
+         response_token_ranks[token] = rank_mapping[rank]
+
+     # Calculate Kendall's tau
+     n = len(original_tokens)
+     concordant_pairs = 0
+     discordant_pairs = 0
+
+     for i in range(n):
+         for j in range(i + 1, n):
+             original_diff = original_token_ranks[original_tokens[i]] - original_token_ranks[original_tokens[j]]
+             response_diff = response_token_ranks[original_tokens[i]] - response_token_ranks[original_tokens[j]]
+
+             if original_diff * response_diff > 0:
+                 concordant_pairs += 1
+             elif original_diff * response_diff < 0:
+                 discordant_pairs += 1
+
+     total_pairs = n * (n - 1) // 2
+     kendall_tau = (concordant_pairs - discordant_pairs) / total_pairs
+
+     # Normalize Kendall's tau to be between 0 and 1
+     normalized_kendall_tau = (kendall_tau + 1) / 2
+
+     return normalized_kendall_tau
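
To get a feel for the metric: identical word order scores 1.0, a partially reordered response lands in between, and any missing original token drops the score to 0. A small sketch; the exact intermediate values depend on how fast_sentence_tokenize splits the response:

    from evaluation_utils import evaluate_response

    original = "how are you ?"
    print(evaluate_response(original, "how are you ?"))  # expected 1.0: identical order
    print(evaluate_response(original, "you are how ?"))  # roughly 0.5: order partly reversed
    print(evaluate_response(original, "how are"))        # 0: "you" is missing from the response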
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ spacy
+ jsonlines
+ openai
+ python-dotenv
+ fast-sentence-tokenize
test.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
train.jsonl ADDED
The diff for this file is too large to render. See raw diff
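
The two .jsonl files are too large to render here, but app_utils.py reads three fields from each record: original_sentence, original_tokenized and shuffled_tokenized. An illustrative, made-up record with that shape (values are invented, not taken from the dataset):

    {"original_sentence": "How are you?", "original_tokenized": "How are you ?", "shuffled_tokenized": "are How ? you"}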