cmagganas committed on
Commit
23722a9
1 Parent(s): 03e5095

Delete app/cookies_openai_model_eval.ipynb

Browse files
Files changed (1) hide show
  1. app/cookies_openai_model_eval.ipynb +0 -797
app/cookies_openai_model_eval.ipynb DELETED
@@ -1,797 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "attachments": {},
5
- "cell_type": "markdown",
6
- "metadata": {},
7
- "source": [
8
- "This notebook tests various OpenAI models, system prompts, and numbers of few-shot examples to see how each combination performs on the same task."
9
- ]
10
- },
11
- {
12
- "cell_type": "code",
13
- "execution_count": 1,
14
- "metadata": {},
15
- "outputs": [],
16
- "source": [
17
- "!pip install wandb --upgrade openai datasets -qU"
18
- ]
19
- },
20
- {
21
- "cell_type": "code",
22
- "execution_count": 2,
23
- "metadata": {},
24
- "outputs": [],
25
- "source": [
26
- "import os\n",
27
- "from dotenv import load_dotenv\n",
28
- "load_dotenv()\n",
29
- "\n",
30
- "import openai\n",
31
- "\n",
32
- "# read OPENAI_API_KEY (loaded from the .env file by load_dotenv) and pass it to the openai client\n",
33
- "openai.api_key = os.getenv(\"OPENAI_API_KEY\")\n",
34
- "\n",
35
- "# import OpenAIChatCompletions class from openai_chat_completion.py file and compare_completion_and_prediction function from util.py file\n",
36
- "from openai_chat_completion import OpenAIChatCompletions\n",
37
- "from util import compare_completion_and_prediction"
38
- ]
39
- },
40
- {
41
- "attachments": {},
42
- "cell_type": "markdown",
43
- "metadata": {},
44
- "source": [
45
- "Models:\n",
46
- "- gpt-3.5-turbo\n",
47
- "- gpt-4\n",
48
- "\n",
49
- "Prompts:\n",
50
- "- gpt4-system-message.txt\n",
51
- "\n",
52
- "Few-shot examples:\n",
53
- "> 0, 1, 2, 3, 5"
54
- ]
55
- },
56
- {
57
- "attachments": {},
58
- "cell_type": "markdown",
59
- "metadata": {},
60
- "source": [
61
- "wandb setup:\n",
62
- "- entity: kaleidoscope-data\n",
63
- "- project: cookies_llm_experimental_eval\n",
64
- "- tags: gpt-3.5-turbo, gpt-4, gpt4-system-message, few-shot"
65
- ]
66
- },
67
- {
68
- "cell_type": "code",
69
- "execution_count": 3,
70
- "metadata": {},
71
- "outputs": [
72
- {
73
- "name": "stderr",
74
- "output_type": "stream",
75
- "text": [
76
- "Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.\n",
77
- "\u001b[34m\u001b[1mwandb\u001b[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)\n",
78
- "\u001b[34m\u001b[1mwandb\u001b[0m: You can find your API key in your browser here: https://wandb.ai/authorize\n",
79
- "\u001b[34m\u001b[1mwandb\u001b[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[32m\u001b[41mERROR\u001b[0m API key must be 40 characters long, yours was 48\n",
80
- "\u001b[34m\u001b[1mwandb\u001b[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)\n",
81
- "\u001b[34m\u001b[1mwandb\u001b[0m: You can find your API key in your browser here: https://wandb.ai/authorize\n",
82
- "\u001b[34m\u001b[1mwandb\u001b[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:\u001b[34m\u001b[1mwandb\u001b[0m: Appending key for api.wandb.ai to your netrc file: /home/cmagganas/.netrc\n"
83
- ]
84
- },
85
- {
86
- "data": {
87
- "text/html": [
88
- "Tracking run with wandb version 0.15.4"
89
- ],
90
- "text/plain": [
91
- "<IPython.core.display.HTML object>"
92
- ]
93
- },
94
- "metadata": {},
95
- "output_type": "display_data"
96
- },
97
- {
98
- "data": {
99
- "text/html": [
100
- "Run data is saved locally in <code>/home/cmagganas/kaleidoscope/llm_data_cleaner/app/wandb/run-20230626_114056-rbtf91s6</code>"
101
- ],
102
- "text/plain": [
103
- "<IPython.core.display.HTML object>"
104
- ]
105
- },
106
- "metadata": {},
107
- "output_type": "display_data"
108
- },
109
- {
110
- "data": {
111
- "text/html": [
112
- "Syncing run <strong><a href='https://wandb.ai/kaleidoscope-data/cookies_llm_experimental_eval/runs/rbtf91s6' target=\"_blank\">rose-puddle-7</a></strong> to <a href='https://wandb.ai/kaleidoscope-data/cookies_llm_experimental_eval' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/run' target=\"_blank\">docs</a>)<br/>"
113
- ],
114
- "text/plain": [
115
- "<IPython.core.display.HTML object>"
116
- ]
117
- },
118
- "metadata": {},
119
- "output_type": "display_data"
120
- },
121
- {
122
- "data": {
123
- "text/html": [
124
- " View project at <a href='https://wandb.ai/kaleidoscope-data/cookies_llm_experimental_eval' target=\"_blank\">https://wandb.ai/kaleidoscope-data/cookies_llm_experimental_eval</a>"
125
- ],
126
- "text/plain": [
127
- "<IPython.core.display.HTML object>"
128
- ]
129
- },
130
- "metadata": {},
131
- "output_type": "display_data"
132
- },
133
- {
134
- "data": {
135
- "text/html": [
136
- " View run at <a href='https://wandb.ai/kaleidoscope-data/cookies_llm_experimental_eval/runs/rbtf91s6' target=\"_blank\">https://wandb.ai/kaleidoscope-data/cookies_llm_experimental_eval/runs/rbtf91s6</a>"
137
- ],
138
- "text/plain": [
139
- "<IPython.core.display.HTML object>"
140
- ]
141
- },
142
- "metadata": {},
143
- "output_type": "display_data"
144
- }
145
- ],
146
- "source": [
147
- "from wandb.integration.openai import autolog\n",
148
- "\n",
149
- "autolog({\"project\":\"cookies_llm_experimental_eval\",\n",
150
- " \"entity\": \"kaleidoscope-data\",\n",
151
- " \"group\": \"cookies\",\n",
152
- " \"job_type\": \"eval\"})"
153
- ]
154
- },
155
- {
156
- "cell_type": "code",
157
- "execution_count": 4,
158
- "metadata": {},
159
- "outputs": [],
160
- "source": [
161
- "# create an empty dataframe to store predictions\n",
162
- "import pandas as pd\n",
163
- "predictions_df = pd.DataFrame(columns=['model', 'system_message', 'n_shot', 'prompt', 'completion', 'prediction'])\n",
164
- "\n",
165
- "models_to_test = [\"gpt-4\", \"gpt-3.5-turbo\"]\n",
166
- "sys_mes_to_test = [\"../prompts/gpt4-system-message.txt\", \"../prompts/gpt4-system-message2.txt\"] # names are arbitrary, same prompts but with \"####\" in system message 2\n",
167
- "n_shots_to_test = [None, 1, 2, 3, 5]"
168
- ]
169
- },
170
- {
171
- "cell_type": "code",
172
- "execution_count": 6,
173
- "metadata": {},
174
- "outputs": [],
175
- "source": [
176
- "# set rerun to True to reload previously saved predictions, so the loop below skips combinations that are already done\n",
177
- "rerun = False\n",
178
- "if rerun:\n",
179
- " predictions_df = pd.read_csv('../data/cookies_llm_eval_predictions.csv')"
180
- ]
181
- },
182
- {
183
- "cell_type": "code",
184
- "execution_count": 178,
185
- "metadata": {},
186
- "outputs": [],
187
- "source": [
188
- "# get predictions for all combinations of models, prompts, and n_shot values\n",
189
- "# save predictions to dataframe and then to csv in data folder after each iteration\n",
190
- "\n",
191
- "# loop through models_to_test\n",
192
- "for model in models_to_test:\n",
193
- " # loop through sys_mes_to_test\n",
194
- " for system_message in sys_mes_to_test:\n",
195
- " # instantiate OpenAIChatCompletions class\n",
196
- " chat = OpenAIChatCompletions(model=model, system_message=system_message)\n",
197
- " # loop through n_shots_to_test\n",
198
- " for n_shot in n_shots_to_test:\n",
199
- " sys_mes_var = 1 if system_message == \"../prompts/gpt4-system-message.txt\" else 2\n",
200
- " n_shot_var = 0 if n_shot == None else n_shot\n",
201
- " # check if predictions for this model, system_message, and n_shot value have already been made\n",
202
- " if predictions_df[(predictions_df['model'] == model) & (predictions_df['system_message'] == sys_mes_var) & (predictions_df['n_shot'] == n_shot_var)].shape[0] == 0:\n",
203
- " prompts, completions, predictions = chat.predict_jsonl(n_shot=n_shot)\n",
204
- " else:\n",
205
- " # skip if predictions for this model, system_message, and n_shot value have already been made\n",
206
- " continue\n",
207
- " # build a dataframe for this run and expand the raw API response into columns\n",
208
- " df_to_append = pd.DataFrame({'model': model, 'system_message': sys_mes_var, 'n_shot': n_shot_var, 'prompt': prompts, 'completion': completions, 'prediction': predictions})\n",
209
- " df_right = df_to_append['prediction'].apply(pd.Series)\n",
210
- " df_right['prediction'] = df_right['choices'].apply(lambda x: x[0]['message']['content']).drop(columns=['choices'])\n",
211
- " df_to_append = pd.concat([df_to_append[['model', 'system_message', 'n_shot', 'prompt', 'completion']], df_right], axis=1)\n",
212
- " df_to_append.columns = ['model', 'system_message', 'n_shot', 'prompt', 'completion', 'id', 'object', 'created', 'openai_model', 'choices', 'usage', 'prediction']\n",
213
- " # save predictions to dataframe\n",
214
- " predictions_df = pd.concat([predictions_df, df_to_append], ignore_index=True)\n",
215
- " # delete duplicates from dataframe\n",
216
- " predictions_df = predictions_df[~predictions_df.duplicated(subset=['model', 'system_message', 'n_shot', 'prompt'])]\n",
217
- " predictions_df.to_csv('../data/cookies_llm_eval_predictions.csv', index=False)"
218
- ]
219
- },
220
- {
221
- "cell_type": "code",
222
- "execution_count": 179,
223
- "metadata": {},
224
- "outputs": [],
225
- "source": [
226
- "predictions_df = predictions_df[~predictions_df.duplicated(subset=['model', 'system_message', 'n_shot', 'prompt'])]"
227
- ]
228
- },
229
- {
230
- "cell_type": "code",
231
- "execution_count": 180,
232
- "metadata": {},
233
- "outputs": [
234
- {
235
- "data": {
236
- "text/plain": [
237
- "(400, 12)"
238
- ]
239
- },
240
- "execution_count": 180,
241
- "metadata": {},
242
- "output_type": "execute_result"
243
- }
244
- ],
245
- "source": [
246
- "predictions_df.shape"
247
- ]
248
- },
249
- {
250
- "cell_type": "code",
251
- "execution_count": 143,
252
- "metadata": {},
253
- "outputs": [],
254
- "source": [
255
- "# import numpy as np\n",
256
- "\n",
257
- "# ids = predictions_df['id'].isna()\n",
258
- "# # apply pd.Series to predictions column for rows where id is not null and change system_message {0,1} to {1,2}\n",
259
- "# new_df_right = predictions_df.loc[ids, 'prediction'].apply(pd.Series)\n",
260
- "# new_df_right['prediction'] = new_df_right['choices'].apply(lambda x: x[0]['message']['content']).drop(columns=['choices'])\n",
261
- "# new_df_left = predictions_df.loc[ids, ['model', 'system_message', 'n_shot', 'prompt', 'completion']].replace({0:1, 1:2})\n",
262
- "# new_df = pd.concat([new_df_left, new_df_right], axis=1)\n",
263
- "\n",
264
- "# predictions_df.columns = ['model', 'system_message', 'n_shot', 'prompt', 'completion', 'id', 'object', 'created', 'openai_model', 'choices', 'usage', 'prediction']\n",
265
- "# new_df.columns = ['model', 'system_message', 'n_shot', 'prompt', 'completion', 'id', 'object', 'created', 'openai_model', 'choices', 'usage', 'prediction']\n",
266
- "# predictions_df.loc[ids] = new_df"
267
- ]
268
- },
269
- {
270
- "cell_type": "code",
271
- "execution_count": 155,
272
- "metadata": {},
273
- "outputs": [],
274
- "source": [
275
- "# for col in ['model','system_message','n_shot']:\n",
276
- "# print(predictions_df[col].value_counts())"
277
- ]
278
- },
279
- {
280
- "cell_type": "code",
281
- "execution_count": 84,
282
- "metadata": {},
283
- "outputs": [],
284
- "source": [
285
- "# import numpy as np\n",
286
- "\n",
287
- "# # create a copy of predictions_df to manipulate\n",
288
- "# new_predictions_df = predictions_df\n",
289
- "\n",
290
- "# # replace names with 1 or 2\n",
291
- "# def replace_sys_mes_name(x):\n",
292
- "# if x == \"../prompts/gpt4-system-message.txt\":\n",
293
- "# return \"1\"\n",
294
- "# elif x == \"../prompts/gpt4-system-message2.txt\":\n",
295
- "# return \"2\"\n",
296
- "# else:\n",
297
- "# return x\n",
298
- "# new_predictions_df['system_message'] = new_predictions_df['system_message'].apply(lambda x: replace_sys_mes_name(x))\n",
299
- "# # replace None with 0\n",
300
- "# new_predictions_df['n_shot'] = new_predictions_df['n_shot'].apply(lambda x: 0 if x == None or np.nan else x)\n",
301
- "\n",
302
- "# # break up prediction column into sub columns by each of json keys\n",
303
- "# new_predictions_df = pd.concat([new_predictions_df, new_predictions_df['prediction'].apply(pd.Series)], axis=1)"
304
- ]
305
- },
306
- {
307
- "cell_type": "code",
308
- "execution_count": 168,
309
- "metadata": {},
310
- "outputs": [],
311
- "source": [
312
- "# predictions_df.drop(columns=['num_correct'], inplace=True)"
313
- ]
314
- },
315
- {
316
- "cell_type": "code",
317
- "execution_count": 181,
318
- "metadata": {},
319
- "outputs": [
320
- {
321
- "data": {
322
- "text/html": [
323
- "<div>\n",
324
- "<style scoped>\n",
325
- " .dataframe tbody tr th:only-of-type {\n",
326
- " vertical-align: middle;\n",
327
- " }\n",
328
- "\n",
329
- " .dataframe tbody tr th {\n",
330
- " vertical-align: top;\n",
331
- " }\n",
332
- "\n",
333
- " .dataframe thead th {\n",
334
- " text-align: right;\n",
335
- " }\n",
336
- "</style>\n",
337
- "<table border=\"1\" class=\"dataframe\">\n",
338
- " <thead>\n",
339
- " <tr style=\"text-align: right;\">\n",
340
- " <th></th>\n",
341
- " <th>model</th>\n",
342
- " <th>system_message</th>\n",
343
- " <th>n_shot</th>\n",
344
- " <th>prompt</th>\n",
345
- " <th>completion</th>\n",
346
- " <th>id</th>\n",
347
- " <th>object</th>\n",
348
- " <th>created</th>\n",
349
- " <th>openai_model</th>\n",
350
- " <th>choices</th>\n",
351
- " <th>usage</th>\n",
352
- " <th>prediction</th>\n",
353
- " </tr>\n",
354
- " </thead>\n",
355
- " <tbody>\n",
356
- " <tr>\n",
357
- " <th>0</th>\n",
358
- " <td>gpt-4</td>\n",
359
- " <td>1</td>\n",
360
- " <td>0</td>\n",
361
- " <td>co-2MFE5QVF,Chill Medicated - Watermelon - Syr...</td>\n",
362
- " <td>Chill Medicated,Edible,Beverage,nan,nan</td>\n",
363
- " <td>chatcmpl-7VlTkjAqXNRWfltMPpr5v37uBJIsg</td>\n",
364
- " <td>chat.completion</td>\n",
365
- " <td>1.687805e+09</td>\n",
366
- " <td>gpt-4-0314</td>\n",
367
- " <td>[&lt;OpenAIObject at 0x7fcf7fde94e0&gt; JSON: {\\n \"...</td>\n",
368
- " <td>{\\n \"prompt_tokens\": 54,\\n \"completion_token...</td>\n",
369
- " <td>Hello! It looks like you mentioned a product: ...</td>\n",
370
- " </tr>\n",
371
- " <tr>\n",
372
- " <th>1</th>\n",
373
- " <td>gpt-4</td>\n",
374
- " <td>1</td>\n",
375
- " <td>0</td>\n",
376
- " <td>bl-111630024545,Feelz - Space Cowboy 3.5g,nan,...</td>\n",
377
- " <td>Feelz,Flower,Bud,Space Cowboy,3.5</td>\n",
378
- " <td>chatcmpl-7VlTtGF3RGsngfKB1BXufxoTixX2v</td>\n",
379
- " <td>chat.completion</td>\n",
380
- " <td>1.687805e+09</td>\n",
381
- " <td>gpt-4-0314</td>\n",
382
- " <td>[&lt;OpenAIObject at 0x7fcf7f49d2b0&gt; JSON: {\\n \"...</td>\n",
383
- " <td>{\\n \"prompt_tokens\": 51,\\n \"completion_token...</td>\n",
384
- " <td>Hello! It seems like you are referring to a pr...</td>\n",
385
- " </tr>\n",
386
- " <tr>\n",
387
- " <th>2</th>\n",
388
- " <td>gpt-4</td>\n",
389
- " <td>1</td>\n",
390
- " <td>0</td>\n",
391
- " <td>fl-8voAjt83sD,Champelli | Xclusivo 3.5g | Eigh...</td>\n",
392
- " <td>Champelli,Flower,Bud,Xclusivo,3.5</td>\n",
393
- " <td>chatcmpl-7VlU80b0m00VaiGymtj9dbqOggTgR</td>\n",
394
- " <td>chat.completion</td>\n",
395
- " <td>1.687805e+09</td>\n",
396
- " <td>gpt-4-0314</td>\n",
397
- " <td>[&lt;OpenAIObject at 0x7fcf7e306890&gt; JSON: {\\n \"...</td>\n",
398
- " <td>{\\n \"prompt_tokens\": 71,\\n \"completion_token...</td>\n",
399
- " <td>Hello! It seems like you're interested in the ...</td>\n",
400
- " </tr>\n",
401
- " <tr>\n",
402
- " <th>3</th>\n",
403
- " <td>gpt-4</td>\n",
404
- " <td>1</td>\n",
405
- " <td>0</td>\n",
406
- " <td>bl-073133213364,CAM - Mellowz #7 7g,nan,FLOWER...</td>\n",
407
- " <td>CAM,Flower,Bud,Mellowz #7,7</td>\n",
408
- " <td>chatcmpl-7VlUHqbsG2kpFHDxAWfsryh6pHmC9</td>\n",
409
- " <td>chat.completion</td>\n",
410
- " <td>1.687805e+09</td>\n",
411
- " <td>gpt-4-0314</td>\n",
412
- " <td>[&lt;OpenAIObject at 0x7fcf7e33d940&gt; JSON: {\\n \"...</td>\n",
413
- " <td>{\\n \"prompt_tokens\": 49,\\n \"completion_token...</td>\n",
414
- " <td>It seems like you are looking for information ...</td>\n",
415
- " </tr>\n",
416
- " <tr>\n",
417
- " <th>4</th>\n",
418
- " <td>gpt-4</td>\n",
419
- " <td>1</td>\n",
420
- " <td>0</td>\n",
421
- " <td>fl-fwJQL2AWnS,Backpack Boyz | Bubblegum Gelato...</td>\n",
422
- " <td>Backpack Boyz,Edible,CBD Tincture/Caps/etc,nan...</td>\n",
423
- " <td>chatcmpl-7VlUYvcad2wahIMHavhDEkYrgvjpw</td>\n",
424
- " <td>chat.completion</td>\n",
425
- " <td>1.687805e+09</td>\n",
426
- " <td>gpt-4-0314</td>\n",
427
- " <td>[&lt;OpenAIObject at 0x7fcf7e306980&gt; JSON: {\\n \"...</td>\n",
428
- " <td>{\\n \"prompt_tokens\": 59,\\n \"completion_token...</td>\n",
429
- " <td>Hello! It seems like you are looking for infor...</td>\n",
430
- " </tr>\n",
431
- " <tr>\n",
432
- " <th>...</th>\n",
433
- " <td>...</td>\n",
434
- " <td>...</td>\n",
435
- " <td>...</td>\n",
436
- " <td>...</td>\n",
437
- " <td>...</td>\n",
438
- " <td>...</td>\n",
439
- " <td>...</td>\n",
440
- " <td>...</td>\n",
441
- " <td>...</td>\n",
442
- " <td>...</td>\n",
443
- " <td>...</td>\n",
444
- " <td>...</td>\n",
445
- " </tr>\n",
446
- " <tr>\n",
447
- " <th>395</th>\n",
448
- " <td>gpt-3.5-turbo</td>\n",
449
- " <td>2</td>\n",
450
- " <td>1</td>\n",
451
- " <td>co-76GP441T,Minntz - Emerald Cut - Indoor - Jo...</td>\n",
452
- " <td>Minntz,Preroll,Joint,Emerald Cut,1</td>\n",
453
- " <td>chatcmpl-7VrjRMvs2l8EJd4PVecpSRPCvV9Hk</td>\n",
454
- " <td>chat.completion</td>\n",
455
- " <td>1.687829e+09</td>\n",
456
- " <td>gpt-3.5-turbo-0301</td>\n",
457
- " <td>[{'index': 0, 'message': {'role': 'assistant',...</td>\n",
458
- " <td>{'prompt_tokens': 125, 'completion_tokens': 23...</td>\n",
459
- " <td>Minntz,Joint,Indoor,Emerald Cut,1g,co-76GP441T.</td>\n",
460
- " </tr>\n",
461
- " <tr>\n",
462
- " <th>396</th>\n",
463
- " <td>gpt-3.5-turbo</td>\n",
464
- " <td>2</td>\n",
465
- " <td>1</td>\n",
466
- " <td>co-5RAWYHYQ,The Growers Circle - Double Down -...</td>\n",
467
- " <td>The Growers Circle,Flower,Bud,Double Down,3.5</td>\n",
468
- " <td>chatcmpl-7VrjT3wfVoLtq3G6xksfVtLz4FloJ</td>\n",
469
- " <td>chat.completion</td>\n",
470
- " <td>1.687829e+09</td>\n",
471
- " <td>gpt-3.5-turbo-0301</td>\n",
472
- " <td>[{'index': 0, 'message': {'role': 'assistant',...</td>\n",
473
- " <td>{'prompt_tokens': 123, 'completion_tokens': 22...</td>\n",
474
- " <td>The Growers Circle,Double Down,Indoor,3.5g,5RA...</td>\n",
475
- " </tr>\n",
476
- " <tr>\n",
477
- " <th>397</th>\n",
478
- " <td>gpt-3.5-turbo</td>\n",
479
- " <td>2</td>\n",
480
- " <td>1</td>\n",
481
- " <td>md-1195389,Blue Dream Roll Your Own Sugar Shak...</td>\n",
482
- " <td>Pacific Stone,Flower,Bud,nan,14</td>\n",
483
- " <td>chatcmpl-7VrjVafi1eGBXYfgmGBN0H3b0FzYO</td>\n",
484
- " <td>chat.completion</td>\n",
485
- " <td>1.687829e+09</td>\n",
486
- " <td>gpt-3.5-turbo-0301</td>\n",
487
- " <td>[{'index': 0, 'message': {'role': 'assistant',...</td>\n",
488
- " <td>{'prompt_tokens': 119, 'completion_tokens': 20...</td>\n",
489
- " <td>Pacific Stone,Sugar Shake,Blue Dream,Roll Your...</td>\n",
490
- " </tr>\n",
491
- " <tr>\n",
492
- " <th>398</th>\n",
493
- " <td>gpt-3.5-turbo</td>\n",
494
- " <td>2</td>\n",
495
- " <td>1</td>\n",
496
- " <td>co-847ZXF37,The Grower Circle - Zoo Dawg x Cos...</td>\n",
497
- " <td>The Growers Circle,Preroll,Joint,Zoo Dawg x Co...</td>\n",
498
- " <td>chatcmpl-7VrjWQpcRxJTdr3f4BUd7totDZpdF</td>\n",
499
- " <td>chat.completion</td>\n",
500
- " <td>1.687829e+09</td>\n",
501
- " <td>gpt-3.5-turbo-0301</td>\n",
502
- " <td>[{'index': 0, 'message': {'role': 'assistant',...</td>\n",
503
- " <td>{'prompt_tokens': 133, 'completion_tokens': 32...</td>\n",
504
- " <td>Multi Joint,Zoo Dawg x Cosa Nostra,The Grower ...</td>\n",
505
- " </tr>\n",
506
- " <tr>\n",
507
- " <th>399</th>\n",
508
- " <td>gpt-3.5-turbo</td>\n",
509
- " <td>2</td>\n",
510
- " <td>1</td>\n",
511
- " <td>co-8EMW15ZM,Flight Bites - S'mores - Gummy - 1...</td>\n",
512
- " <td>Flight Bites,Edible,Gummies,nan,nan</td>\n",
513
- " <td>chatcmpl-7VrjXiUHiyUyH7udPXIjANVmAUrra</td>\n",
514
- " <td>chat.completion</td>\n",
515
- " <td>1.687829e+09</td>\n",
516
- " <td>gpt-3.5-turbo-0301</td>\n",
517
- " <td>[{'index': 0, 'message': {'role': 'assistant',...</td>\n",
518
- " <td>{'prompt_tokens': 129, 'completion_tokens': 21...</td>\n",
519
- " <td>Flight Bites,Gummy,S'mores,10 count,100mg CO₂ ...</td>\n",
520
- " </tr>\n",
521
- " </tbody>\n",
522
- "</table>\n",
523
- "<p>400 rows × 12 columns</p>\n",
524
- "</div>"
525
- ],
526
- "text/plain": [
527
- " model system_message n_shot \\\n",
528
- "0 gpt-4 1 0 \n",
529
- "1 gpt-4 1 0 \n",
530
- "2 gpt-4 1 0 \n",
531
- "3 gpt-4 1 0 \n",
532
- "4 gpt-4 1 0 \n",
533
- ".. ... ... ... \n",
534
- "395 gpt-3.5-turbo 2 1 \n",
535
- "396 gpt-3.5-turbo 2 1 \n",
536
- "397 gpt-3.5-turbo 2 1 \n",
537
- "398 gpt-3.5-turbo 2 1 \n",
538
- "399 gpt-3.5-turbo 2 1 \n",
539
- "\n",
540
- " prompt \\\n",
541
- "0 co-2MFE5QVF,Chill Medicated - Watermelon - Syr... \n",
542
- "1 bl-111630024545,Feelz - Space Cowboy 3.5g,nan,... \n",
543
- "2 fl-8voAjt83sD,Champelli | Xclusivo 3.5g | Eigh... \n",
544
- "3 bl-073133213364,CAM - Mellowz #7 7g,nan,FLOWER... \n",
545
- "4 fl-fwJQL2AWnS,Backpack Boyz | Bubblegum Gelato... \n",
546
- ".. ... \n",
547
- "395 co-76GP441T,Minntz - Emerald Cut - Indoor - Jo... \n",
548
- "396 co-5RAWYHYQ,The Growers Circle - Double Down -... \n",
549
- "397 md-1195389,Blue Dream Roll Your Own Sugar Shak... \n",
550
- "398 co-847ZXF37,The Grower Circle - Zoo Dawg x Cos... \n",
551
- "399 co-8EMW15ZM,Flight Bites - S'mores - Gummy - 1... \n",
552
- "\n",
553
- " completion \\\n",
554
- "0 Chill Medicated,Edible,Beverage,nan,nan \n",
555
- "1 Feelz,Flower,Bud,Space Cowboy,3.5 \n",
556
- "2 Champelli,Flower,Bud,Xclusivo,3.5 \n",
557
- "3 CAM,Flower,Bud,Mellowz #7,7 \n",
558
- "4 Backpack Boyz,Edible,CBD Tincture/Caps/etc,nan... \n",
559
- ".. ... \n",
560
- "395 Minntz,Preroll,Joint,Emerald Cut,1 \n",
561
- "396 The Growers Circle,Flower,Bud,Double Down,3.5 \n",
562
- "397 Pacific Stone,Flower,Bud,nan,14 \n",
563
- "398 The Growers Circle,Preroll,Joint,Zoo Dawg x Co... \n",
564
- "399 Flight Bites,Edible,Gummies,nan,nan \n",
565
- "\n",
566
- " id object created \\\n",
567
- "0 chatcmpl-7VlTkjAqXNRWfltMPpr5v37uBJIsg chat.completion 1.687805e+09 \n",
568
- "1 chatcmpl-7VlTtGF3RGsngfKB1BXufxoTixX2v chat.completion 1.687805e+09 \n",
569
- "2 chatcmpl-7VlU80b0m00VaiGymtj9dbqOggTgR chat.completion 1.687805e+09 \n",
570
- "3 chatcmpl-7VlUHqbsG2kpFHDxAWfsryh6pHmC9 chat.completion 1.687805e+09 \n",
571
- "4 chatcmpl-7VlUYvcad2wahIMHavhDEkYrgvjpw chat.completion 1.687805e+09 \n",
572
- ".. ... ... ... \n",
573
- "395 chatcmpl-7VrjRMvs2l8EJd4PVecpSRPCvV9Hk chat.completion 1.687829e+09 \n",
574
- "396 chatcmpl-7VrjT3wfVoLtq3G6xksfVtLz4FloJ chat.completion 1.687829e+09 \n",
575
- "397 chatcmpl-7VrjVafi1eGBXYfgmGBN0H3b0FzYO chat.completion 1.687829e+09 \n",
576
- "398 chatcmpl-7VrjWQpcRxJTdr3f4BUd7totDZpdF chat.completion 1.687829e+09 \n",
577
- "399 chatcmpl-7VrjXiUHiyUyH7udPXIjANVmAUrra chat.completion 1.687829e+09 \n",
578
- "\n",
579
- " openai_model choices \\\n",
580
- "0 gpt-4-0314 [<OpenAIObject at 0x7fcf7fde94e0> JSON: {\\n \"... \n",
581
- "1 gpt-4-0314 [<OpenAIObject at 0x7fcf7f49d2b0> JSON: {\\n \"... \n",
582
- "2 gpt-4-0314 [<OpenAIObject at 0x7fcf7e306890> JSON: {\\n \"... \n",
583
- "3 gpt-4-0314 [<OpenAIObject at 0x7fcf7e33d940> JSON: {\\n \"... \n",
584
- "4 gpt-4-0314 [<OpenAIObject at 0x7fcf7e306980> JSON: {\\n \"... \n",
585
- ".. ... ... \n",
586
- "395 gpt-3.5-turbo-0301 [{'index': 0, 'message': {'role': 'assistant',... \n",
587
- "396 gpt-3.5-turbo-0301 [{'index': 0, 'message': {'role': 'assistant',... \n",
588
- "397 gpt-3.5-turbo-0301 [{'index': 0, 'message': {'role': 'assistant',... \n",
589
- "398 gpt-3.5-turbo-0301 [{'index': 0, 'message': {'role': 'assistant',... \n",
590
- "399 gpt-3.5-turbo-0301 [{'index': 0, 'message': {'role': 'assistant',... \n",
591
- "\n",
592
- " usage \\\n",
593
- "0 {\\n \"prompt_tokens\": 54,\\n \"completion_token... \n",
594
- "1 {\\n \"prompt_tokens\": 51,\\n \"completion_token... \n",
595
- "2 {\\n \"prompt_tokens\": 71,\\n \"completion_token... \n",
596
- "3 {\\n \"prompt_tokens\": 49,\\n \"completion_token... \n",
597
- "4 {\\n \"prompt_tokens\": 59,\\n \"completion_token... \n",
598
- ".. ... \n",
599
- "395 {'prompt_tokens': 125, 'completion_tokens': 23... \n",
600
- "396 {'prompt_tokens': 123, 'completion_tokens': 22... \n",
601
- "397 {'prompt_tokens': 119, 'completion_tokens': 20... \n",
602
- "398 {'prompt_tokens': 133, 'completion_tokens': 32... \n",
603
- "399 {'prompt_tokens': 129, 'completion_tokens': 21... \n",
604
- "\n",
605
- " prediction \n",
606
- "0 Hello! It looks like you mentioned a product: ... \n",
607
- "1 Hello! It seems like you are referring to a pr... \n",
608
- "2 Hello! It seems like you're interested in the ... \n",
609
- "3 It seems like you are looking for information ... \n",
610
- "4 Hello! It seems like you are looking for infor... \n",
611
- ".. ... \n",
612
- "395 Minntz,Joint,Indoor,Emerald Cut,1g,co-76GP441T. \n",
613
- "396 The Growers Circle,Double Down,Indoor,3.5g,5RA... \n",
614
- "397 Pacific Stone,Sugar Shake,Blue Dream,Roll Your... \n",
615
- "398 Multi Joint,Zoo Dawg x Cosa Nostra,The Grower ... \n",
616
- "399 Flight Bites,Gummy,S'mores,10 count,100mg CO₂ ... \n",
617
- "\n",
618
- "[400 rows x 12 columns]"
619
- ]
620
- },
621
- "execution_count": 181,
622
- "metadata": {},
623
- "output_type": "execute_result"
624
- }
625
- ],
626
- "source": [
627
- "predictions_df"
628
- ]
629
- },
630
- {
631
- "cell_type": "code",
632
- "execution_count": 182,
633
- "metadata": {},
634
- "outputs": [
635
- {
636
- "data": {
637
- "text/plain": [
638
- "669"
639
- ]
640
- },
641
- "execution_count": 182,
642
- "metadata": {},
643
- "output_type": "execute_result"
644
- }
645
- ],
646
- "source": [
647
- "from util import compare_completion_and_prediction\n",
648
- "\n",
649
- "# Function that uses compare_completion_and_prediction to return num_correct, or zero if it raises an error\n",
650
- "def get_num_correct(completion, prediction):\n",
651
- " try:\n",
652
- " return compare_completion_and_prediction(completion, prediction)['num_correct']\n",
653
- " except:\n",
654
- " return 0 # this will be the case when format is incorrect\n",
655
- " \n",
656
- "# Apply get_num_correct function to predictions_df dataframe\n",
657
- "predictions_df['num_correct'] = predictions_df.apply(lambda row: get_num_correct(row['completion'], row['prediction']), axis=1)\n",
658
- "predictions_df['num_correct'].sum() # out of 2000 possible correct predictions: (20 samples * 5 cols per sample) * (2 system messages * 2 models * 5 n_shot values)"
659
- ]
660
- },
661
- {
662
- "cell_type": "code",
663
- "execution_count": 187,
664
- "metadata": {},
665
- "outputs": [
666
- {
667
- "data": {
668
- "text/plain": [
669
- "model system_message n_shot\n",
670
- "gpt-3.5-turbo 1 0 0.00\n",
671
- " 1 0.00\n",
672
- " 2 0 0.00\n",
673
- "gpt-4 1 0 0.00\n",
674
- " 1 0.00\n",
675
- " 2 0 0.00\n",
676
- "gpt-3.5-turbo 1 2 0.24\n",
677
- " 2 1 0.24\n",
678
- " 2 0.27\n",
679
- " 3 0.36\n",
680
- " 1 3 0.40\n",
681
- " 5 0.44\n",
682
- "gpt-4 2 2 0.45\n",
683
- " 1 2 0.45\n",
684
- " 2 1 0.47\n",
685
- "gpt-3.5-turbo 2 5 0.56\n",
686
- "gpt-4 1 3 0.62\n",
687
- " 2 3 0.67\n",
688
- " 5 0.73\n",
689
- " 1 5 0.79\n",
690
- "Name: num_correct, dtype: float64"
691
- ]
692
- },
693
- "execution_count": 187,
694
- "metadata": {},
695
- "output_type": "execute_result"
696
- }
697
- ],
698
- "source": [
699
- "predictions_df.groupby(['model', 'system_message', 'n_shot'])['num_correct'].sum().sort_values() / 100 # out of 100 possible correct predictions (20 samples * 5 cols per sample)"
700
- ]
701
- },
702
- {
703
- "cell_type": "code",
704
- "execution_count": 184,
705
- "metadata": {},
706
- "outputs": [],
707
- "source": [
708
- "new_predictions_df.to_csv('../data/cookies_llm_eval_proc_preds.csv', index=False)"
709
- ]
710
- },
711
- {
712
- "cell_type": "code",
713
- "execution_count": 76,
714
- "metadata": {},
715
- "outputs": [
716
- {
717
- "data": {
718
- "text/html": [
719
- "Waiting for W&B process to finish... <strong style=\"color:green\">(success).</strong>"
720
- ],
721
- "text/plain": [
722
- "<IPython.core.display.HTML object>"
723
- ]
724
- },
725
- "metadata": {},
726
- "output_type": "display_data"
727
- },
728
- {
729
- "data": {
730
- "text/html": [
731
- "<style>\n",
732
- " table.wandb td:nth-child(1) { padding: 0 10px; text-align: left ; width: auto;} td:nth-child(2) {text-align: left ; width: 100%}\n",
733
- " .wandb-row { display: flex; flex-direction: row; flex-wrap: wrap; justify-content: flex-start; width: 100% }\n",
734
- " .wandb-col { display: flex; flex-direction: column; flex-basis: 100%; flex: 1; padding: 10px; }\n",
735
- " </style>\n",
736
- "<div class=\"wandb-row\"><div class=\"wandb-col\"><h3>Run history:</h3><br/><table class=\"wandb\"><tr><td>usage/completion_tokens</td><td>▆▆▁▁▁▁▁▁▁▁█▄▁▁▁▁▁▁▁▃▁▁▁▆▂▆▃▅▄▅▆▄▃▁▁▁▁▁▁▁</td></tr><tr><td>usage/elapsed_time</td><td>▄▆▁▁▁▁▂▁▂▁█▃▁▁▁▂▁▁▂▁▁▁▁▄▂▄▂▃▃▄▅▂▁▁▁▁▂▁▁▁</td></tr><tr><td>usage/prompt_tokens</td><td>▁▁▂▂▄▄▆▅██▁▁▃▃▄▅▅██▁▁▃▃▁▁▁▁▁▁▂▁▂▁▄▄▆▆██▁</td></tr><tr><td>usage/total_tokens</td><td>▄▄▂▂▃▃▅▅█▇▆▃▂▂▄▅▅▇▇▂▁▃▂▄▂▄▂▄▃▄▄▃▂▄▃▅▆██▁</td></tr></table><br/></div><div class=\"wandb-col\"><h3>Run summary:</h3><br/><table class=\"wandb\"><tr><td>usage/completion_tokens</td><td>62</td></tr><tr><td>usage/elapsed_time</td><td>2.40086</td></tr><tr><td>usage/prompt_tokens</td><td>54</td></tr><tr><td>usage/total_tokens</td><td>116</td></tr></table><br/></div></div>"
737
- ],
738
- "text/plain": [
739
- "<IPython.core.display.HTML object>"
740
- ]
741
- },
742
- "metadata": {},
743
- "output_type": "display_data"
744
- },
745
- {
746
- "data": {
747
- "text/html": [
748
- " View run <strong style=\"color:#cdcd00\">rose-puddle-7</strong> at: <a href='https://wandb.ai/kaleidoscope-data/cookies_llm_experimental_eval/runs/rbtf91s6' target=\"_blank\">https://wandb.ai/kaleidoscope-data/cookies_llm_experimental_eval/runs/rbtf91s6</a><br/>Synced 6 W&B file(s), 422 media file(s), 422 artifact file(s) and 0 other file(s)"
749
- ],
750
- "text/plain": [
751
- "<IPython.core.display.HTML object>"
752
- ]
753
- },
754
- "metadata": {},
755
- "output_type": "display_data"
756
- },
757
- {
758
- "data": {
759
- "text/html": [
760
- "Find logs at: <code>./wandb/run-20230626_114056-rbtf91s6/logs</code>"
761
- ],
762
- "text/plain": [
763
- "<IPython.core.display.HTML object>"
764
- ]
765
- },
766
- "metadata": {},
767
- "output_type": "display_data"
768
- }
769
- ],
770
- "source": [
771
- "autolog.disable()"
772
- ]
773
- }
774
- ],
775
- "metadata": {
776
- "kernelspec": {
777
- "display_name": "kd-llm-dc",
778
- "language": "python",
779
- "name": "python3"
780
- },
781
- "language_info": {
782
- "codemirror_mode": {
783
- "name": "ipython",
784
- "version": 3
785
- },
786
- "file_extension": ".py",
787
- "mimetype": "text/x-python",
788
- "name": "python",
789
- "nbconvert_exporter": "python",
790
- "pygments_lexer": "ipython3",
791
- "version": "3.10.11"
792
- },
793
- "orig_nbformat": 4
794
- },
795
- "nbformat": 4,
796
- "nbformat_minor": 2
797
- }