This notebook tests various OpenAI models, prompts, and numbers of few-shot examples to compare how they perform on the same task.

In [1]:
# Install/upgrade the notebook's dependencies into the environment of the running kernel.
# %pip (rather than !pip) guarantees the packages land in the interpreter this notebook uses.
# NOTE(review): versions are unpinned; the response handling further down indexes API responses
# like dicts, which presumably requires a pre-1.0 openai client — confirm and pin versions.
%pip install -qU wandb openai datasets

In [2]:
import os
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment before they are read below
load_dotenv()

import openai

# Read OPENAI_API_KEY from the environment (populated from the .env file above) and pass it to openai.
# os.getenv returns None when the variable is missing, so a missing key is not reported at this point.
openai.api_key = os.getenv("OPENAI_API_KEY")

# Project-local helpers: the OpenAIChatCompletions class from openai_chat_completion.py and the
# compare_completion_and_prediction function from util.py
from openai_chat_completion import OpenAIChatCompletions
from util import compare_completion_and_prediction

Models:
- gpt-3.5-turbo
- gpt-4

Prompts:
- gpt4-system-message.txt
- gpt4-system-message2.txt

Few-shot examples:
> 0, 1, 2, 3, 5

wandb setup:
- entity: kaleidoscope-data
- project: cookies_llm_experimental_eval
- tags: gpt-3.5-turbo, gpt-4, gpt4-system-message, few-shot

In [3]:
from wandb.integration.openai import autolog

# Enable W&B autologging of OpenAI calls; the settings identify where the run is recorded.
autolog(dict(
    project="cookies_llm_experimental_eval",
    entity="kaleidoscope-data",
    group="cookies",
    job_type="eval",
))

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: [32m[41mERROR[0m API key must be 40 characters long, yours was 48
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/cmagganas/.netrc


In [4]:
# Empty accumulator that the prediction loop fills; it starts with the base columns only.
import pandas as pd

predictions_df = pd.DataFrame(
    columns=['model', 'system_message', 'n_shot', 'prompt', 'completion', 'prediction']
)

# Experiment grid: every model is run with every system message and every n_shot value.
models_to_test = ["gpt-4", "gpt-3.5-turbo"]

# File names are arbitrary: both files hold the same prompt, but system message 2 contains "####".
sys_mes_to_test = [
    "../prompts/gpt4-system-message.txt",
    "../prompts/gpt4-system-message2.txt",
]

# None is stored as n_shot 0 by the prediction loop.
n_shots_to_test = [None, 1, 2, 3, 5]

In [6]:
# Resume switch: set rerun to True before re-running the prediction cell below so that the
# previously saved predictions are reloaded and combinations already present in them are skipped
rerun = False
if rerun:
    # Replace the in-memory frame with the CSV checkpoint that the prediction loop writes
    predictions_df = pd.read_csv('../data/cookies_llm_eval_predictions.csv')

In [178]:
# Get predictions for every combination of model, system message, and n_shot value.
# Each finished combination is appended to predictions_df and checkpointed to CSV in the data
# folder, so combinations that already have rows (e.g. after reloading the CSV) are skipped.

for model in models_to_test:
    for system_message in sys_mes_to_test:
        # Integer label stored in the 'system_message' column: 1 for the base prompt file, 2 otherwise
        sys_mes_var = 1 if system_message == "../prompts/gpt4-system-message.txt" else 2
        # One OpenAIChatCompletions instance per (model, system message), reused for all n_shot values
        chat = OpenAIChatCompletions(model=model, system_message=system_message)

        for n_shot in n_shots_to_test:
            # None (no few-shot examples) is recorded as 0
            n_shot_var = 0 if n_shot is None else n_shot

            # Skip if predictions for this model, system_message, and n_shot value already exist
            already_predicted = (
                (predictions_df['model'] == model)
                & (predictions_df['system_message'] == sys_mes_var)
                & (predictions_df['n_shot'] == n_shot_var)
            ).any()
            if already_predicted:
                continue

            prompts, completions, predictions = chat.predict_jsonl(n_shot=n_shot)

            # One row per sample: run metadata, prompt, expected completion, raw response
            df_to_append = pd.DataFrame({'model': model, 'system_message': sys_mes_var, 'n_shot': n_shot_var, 'prompt': prompts, 'completion': completions, 'prediction': predictions})

            # Expand each raw response into one column per top-level key
            df_right = df_to_append['prediction'].apply(pd.Series)
            # The response carries its own 'model' field (the concrete model that answered), which would
            # clash with the requested model name; rename it by name instead of by column position so
            # extra or reordered response keys cannot mislabel columns.
            df_right = df_right.rename(columns={'model': 'openai_model'})
            # The text of the first choice is the prediction; the 'choices' column itself is kept
            df_right['prediction'] = df_right['choices'].apply(lambda x: x[0]['message']['content'])

            df_to_append = pd.concat([df_to_append[['model', 'system_message', 'n_shot', 'prompt', 'completion']], df_right], axis=1)

            # Append, drop repeated (model, system_message, n_shot, prompt) rows, and checkpoint to disk
            predictions_df = pd.concat([predictions_df, df_to_append], ignore_index=True)
            predictions_df = predictions_df[~predictions_df.duplicated(subset=['model', 'system_message', 'n_shot', 'prompt'])]
            predictions_df.to_csv('../data/cookies_llm_eval_predictions.csv', index=False)

In [179]:
# Keep only the first row for each (model, system_message, n_shot, prompt) combination
predictions_df = predictions_df.drop_duplicates(
    subset=['model', 'system_message', 'n_shot', 'prompt']
)

In [180]:
predictions_df.shape

(400, 12)

In [143]:
# import numpy as np

# ids = predictions_df['id'].isna()
# # apply pd.Series to predictions column for rows where id is not null and change system_message {0,1} to {1,2}
# new_df_right = predictions_df.loc[ids, 'prediction'].apply(pd.Series)
# new_df_right['prediction'] = new_df_right['choices'].apply(lambda x: x[0]['message']['content']).drop(columns=['choices'])
# new_df_left = predictions_df.loc[ids, ['model', 'system_message', 'n_shot', 'prompt', 'completion']].replace({0:1, 1:2})
# new_df = pd.concat([new_df_left, new_df_right], axis=1)

# predictions_df.columns = ['model', 'system_message', 'n_shot', 'prompt', 'completion', 'id', 'object', 'created', 'openai_model', 'choices', 'usage', 'prediction']
# new_df.columns = ['model', 'system_message', 'n_shot', 'prompt', 'completion', 'id', 'object', 'created', 'openai_model', 'choices', 'usage', 'prediction']
# predictions_df.loc[ids] = new_df

In [155]:
# for col in ['model','system_message','n_shot']:
#     print(predictions_df[col].value_counts())

In [84]:
# import numpy as np

# # create a copy of predictions_df to manipulate
# new_predictions_df = predictions_df

# # replace names with 1 or 2
# def replace_sys_mes_name(x):
#     if x == "../prompts/gpt4-system-message.txt":
#         return "1"
#     elif x == "../prompts/gpt4-system-message2.txt":
#         return "2"
#     else:
#         return x
# new_predictions_df['system_message'] = new_predictions_df['system_message'].apply(lambda x: replace_sys_mes_name(x))
# # replace None with 0
# new_predictions_df['n_shot'] = new_predictions_df['n_shot'].apply(lambda x: 0 if x == None or np.nan else x)

# # break up prediction column into sub columns by each of json keys
# new_predictions_df = pd.concat([new_predictions_df, new_predictions_df['prediction'].apply(pd.Series)], axis=1)

In [168]:
# predictions_df.drop(columns=['num_correct'], inplace=True)

In [181]:
predictions_df

Unnamed: 0,model,system_message,n_shot,prompt,completion,id,object,created,openai_model,choices,usage,prediction
0,gpt-4,1,0,"co-2MFE5QVF,Chill Medicated - Watermelon - Syr...","Chill Medicated,Edible,Beverage,nan,nan",chatcmpl-7VlTkjAqXNRWfltMPpr5v37uBJIsg,chat.completion,1.687805e+09,gpt-4-0314,"[<OpenAIObject at 0x7fcf7fde94e0> JSON: {\n ""...","{\n ""prompt_tokens"": 54,\n ""completion_token...",Hello! It looks like you mentioned a product: ...
1,gpt-4,1,0,"bl-111630024545,Feelz - Space Cowboy 3.5g,nan,...","Feelz,Flower,Bud,Space Cowboy,3.5",chatcmpl-7VlTtGF3RGsngfKB1BXufxoTixX2v,chat.completion,1.687805e+09,gpt-4-0314,"[<OpenAIObject at 0x7fcf7f49d2b0> JSON: {\n ""...","{\n ""prompt_tokens"": 51,\n ""completion_token...",Hello! It seems like you are referring to a pr...
2,gpt-4,1,0,"fl-8voAjt83sD,Champelli | Xclusivo 3.5g | Eigh...","Champelli,Flower,Bud,Xclusivo,3.5",chatcmpl-7VlU80b0m00VaiGymtj9dbqOggTgR,chat.completion,1.687805e+09,gpt-4-0314,"[<OpenAIObject at 0x7fcf7e306890> JSON: {\n ""...","{\n ""prompt_tokens"": 71,\n ""completion_token...",Hello! It seems like you're interested in the ...
3,gpt-4,1,0,"bl-073133213364,CAM - Mellowz #7 7g,nan,FLOWER...","CAM,Flower,Bud,Mellowz #7,7",chatcmpl-7VlUHqbsG2kpFHDxAWfsryh6pHmC9,chat.completion,1.687805e+09,gpt-4-0314,"[<OpenAIObject at 0x7fcf7e33d940> JSON: {\n ""...","{\n ""prompt_tokens"": 49,\n ""completion_token...",It seems like you are looking for information ...
4,gpt-4,1,0,"fl-fwJQL2AWnS,Backpack Boyz | Bubblegum Gelato...","Backpack Boyz,Edible,CBD Tincture/Caps/etc,nan...",chatcmpl-7VlUYvcad2wahIMHavhDEkYrgvjpw,chat.completion,1.687805e+09,gpt-4-0314,"[<OpenAIObject at 0x7fcf7e306980> JSON: {\n ""...","{\n ""prompt_tokens"": 59,\n ""completion_token...",Hello! It seems like you are looking for infor...
...,...,...,...,...,...,...,...,...,...,...,...,...
395,gpt-3.5-turbo,2,1,"co-76GP441T,Minntz - Emerald Cut - Indoor - Jo...","Minntz,Preroll,Joint,Emerald Cut,1",chatcmpl-7VrjRMvs2l8EJd4PVecpSRPCvV9Hk,chat.completion,1.687829e+09,gpt-3.5-turbo-0301,"[{'index': 0, 'message': {'role': 'assistant',...","{'prompt_tokens': 125, 'completion_tokens': 23...","Minntz,Joint,Indoor,Emerald Cut,1g,co-76GP441T."
396,gpt-3.5-turbo,2,1,"co-5RAWYHYQ,The Growers Circle - Double Down -...","The Growers Circle,Flower,Bud,Double Down,3.5",chatcmpl-7VrjT3wfVoLtq3G6xksfVtLz4FloJ,chat.completion,1.687829e+09,gpt-3.5-turbo-0301,"[{'index': 0, 'message': {'role': 'assistant',...","{'prompt_tokens': 123, 'completion_tokens': 22...","The Growers Circle,Double Down,Indoor,3.5g,5RA..."
397,gpt-3.5-turbo,2,1,"md-1195389,Blue Dream Roll Your Own Sugar Shak...","Pacific Stone,Flower,Bud,nan,14",chatcmpl-7VrjVafi1eGBXYfgmGBN0H3b0FzYO,chat.completion,1.687829e+09,gpt-3.5-turbo-0301,"[{'index': 0, 'message': {'role': 'assistant',...","{'prompt_tokens': 119, 'completion_tokens': 20...","Pacific Stone,Sugar Shake,Blue Dream,Roll Your..."
398,gpt-3.5-turbo,2,1,"co-847ZXF37,The Grower Circle - Zoo Dawg x Cos...","The Growers Circle,Preroll,Joint,Zoo Dawg x Co...",chatcmpl-7VrjWQpcRxJTdr3f4BUd7totDZpdF,chat.completion,1.687829e+09,gpt-3.5-turbo-0301,"[{'index': 0, 'message': {'role': 'assistant',...","{'prompt_tokens': 133, 'completion_tokens': 32...","Multi Joint,Zoo Dawg x Cosa Nostra,The Grower ..."


In [182]:
from util import compare_completion_and_prediction

def get_num_correct(completion, prediction):
    """Return the 'num_correct' score of ``prediction`` against ``completion``.

    Delegates to ``compare_completion_and_prediction`` and reads the
    ``'num_correct'`` entry of its result.

    Args:
        completion: The expected completion for a sample.
        prediction: The model's predicted text for the same sample.

    Returns:
        The ``'num_correct'`` value reported by the comparison, or 0 when the
        comparison raises an ordinary exception (for example because the
        prediction is not in the expected format).

    Only ``Exception`` subclasses are turned into a score of 0;
    ``KeyboardInterrupt`` and ``SystemExit`` propagate so the run can still be
    interrupted.
    """
    try:
        return compare_completion_and_prediction(completion, prediction)['num_correct']
    except Exception:
        # A prediction that cannot be compared counts as zero correct fields
        return 0
        
# Apply get_num_correct function to predictions_df dataframe
predictions_df['num_correct'] = predictions_df.apply(lambda row: get_num_correct(row['completion'], row['prediction']), axis=1)
predictions_df['num_correct'].sum() # out of 1000 possible correct predictions (20 samples * 5 cols per sample) * (2 system messages * 2 models * 5 n_shot values)

669

In [187]:
# Share of correct fields for each (model, system_message, n_shot) configuration, worst first
(
    predictions_df
    .groupby(['model', 'system_message', 'n_shot'])['num_correct']
    .sum()
    .sort_values()
    / 100  # out of 100 possible correct predictions (20 samples * 5 cols per sample)
)

model          system_message  n_shot
gpt-3.5-turbo  1               0         0.00
                               1         0.00
               2               0         0.00
gpt-4          1               0         0.00
                               1         0.00
               2               0         0.00
gpt-3.5-turbo  1               2         0.24
               2               1         0.24
                               2         0.27
                               3         0.36
               1               3         0.40
                               5         0.44
gpt-4          2               2         0.45
               1               2         0.45
               2               1         0.47
gpt-3.5-turbo  2               5         0.56
gpt-4          1               3         0.62
               2               3         0.67
                               5         0.73
               1               5         0.79
Name: num_correct, dtype: float64

In [184]:
# Save the processed predictions, including the num_correct column.
# `new_predictions_df` is only assigned in a commented-out cell, so on a fresh kernel that name
# does not exist; the frame built and scored by the live cells is `predictions_df`.
predictions_df.to_csv('../data/cookies_llm_eval_proc_preds.csv', index=False)

In [76]:
# Turn off the W&B OpenAI autologging that was enabled with autolog(...) earlier in the notebook
autolog.disable()

0,1
usage/completion_tokens,▆▆▁▁▁▁▁▁▁▁█▄▁▁▁▁▁▁▁▃▁▁▁▆▂▆▃▅▄▅▆▄▃▁▁▁▁▁▁▁
usage/elapsed_time,▄▆▁▁▁▁▂▁▂▁█▃▁▁▁▂▁▁▂▁▁▁▁▄▂▄▂▃▃▄▅▂▁▁▁▁▂▁▁▁
usage/prompt_tokens,▁▁▂▂▄▄▆▅██▁▁▃▃▄▅▅██▁▁▃▃▁▁▁▁▁▁▂▁▂▁▄▄▆▆██▁
usage/total_tokens,▄▄▂▂▃▃▅▅█▇▆▃▂▂▄▅▅▇▇▂▁▃▂▄▂▄▂▄▃▄▄▃▂▄▃▅▆██▁

0,1
usage/completion_tokens,62.0
usage/elapsed_time,2.40086
usage/prompt_tokens,54.0
usage/total_tokens,116.0
