cmagganas committed on
Commit
b78e021
1 Parent(s): 23722a9

Delete app/data_prep.ipynb

Browse files
Files changed (1) hide show
  1. app/data_prep.ipynb +0 -283
app/data_prep.ipynb DELETED
@@ -1,283 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "attachments": {},
5
- "cell_type": "markdown",
6
- "metadata": {},
7
- "source": [
8
- "## This notebook shows how to load CSV data and convert it into JSONL format for the LLM data cleaner.\n",
9
- "\n",
10
- "First, we load the data."
11
- ]
12
- },
13
- {
14
- "cell_type": "code",
15
- "execution_count": null,
16
- "metadata": {},
17
- "outputs": [
18
- {
19
- "data": {
20
- "text/html": [
21
- "<div>\n",
22
- "<style scoped>\n",
23
- " .dataframe tbody tr th:only-of-type {\n",
24
- " vertical-align: middle;\n",
25
- " }\n",
26
- "\n",
27
- " .dataframe tbody tr th {\n",
28
- " vertical-align: top;\n",
29
- " }\n",
30
- "\n",
31
- " .dataframe thead th {\n",
32
- " text-align: right;\n",
33
- " }\n",
34
- "</style>\n",
35
- "<table border=\"1\" class=\"dataframe\">\n",
36
- " <thead>\n",
37
- " <tr style=\"text-align: right;\">\n",
38
- " <th></th>\n",
39
- " <th>sku</th>\n",
40
- " <th>product_name (pos)</th>\n",
41
- " <th>brand (pos)</th>\n",
42
- " <th>product_category (pos)</th>\n",
43
- " <th>strain_name (pos)</th>\n",
44
- " <th>product_weight_grams (pos)</th>\n",
45
- " <th>brand (manual review)</th>\n",
46
- " <th>product_category (manual review)</th>\n",
47
- " <th>sub_product_category (manual review)</th>\n",
48
- " <th>strain_name (manual review)</th>\n",
49
- " <th>product_weight_grams (manual review)</th>\n",
50
- " </tr>\n",
51
- " </thead>\n",
52
- " <tbody>\n",
53
- " <tr>\n",
54
- " <th>0</th>\n",
55
- " <td>bl-842922110296</td>\n",
56
- " <td>STIIIZY - Birthday Cake Pod 1g</td>\n",
57
- " <td>NaN</td>\n",
58
- " <td>VAPE PENS 1G</td>\n",
59
- " <td>NaN</td>\n",
60
- " <td>1.0</td>\n",
61
- " <td>STIIIZY</td>\n",
62
- " <td>Vape</td>\n",
63
- " <td>Vape</td>\n",
64
- " <td>Birthday Cake</td>\n",
65
- " <td>1</td>\n",
66
- " </tr>\n",
67
- " <tr>\n",
68
- " <th>1</th>\n",
69
- " <td>co-6ARLLX12</td>\n",
70
- " <td>SMASH Hits - Hippie Slayer - Indoor - 1g</td>\n",
71
- " <td>SMASH Hits</td>\n",
72
- " <td>NaN</td>\n",
73
- " <td>Hippie Slayer</td>\n",
74
- " <td>NaN</td>\n",
75
- " <td>SMASH Hits</td>\n",
76
- " <td>Preroll</td>\n",
77
- " <td>Joint</td>\n",
78
- " <td>Hippie Slayer</td>\n",
79
- " <td>1</td>\n",
80
- " </tr>\n",
81
- " <tr>\n",
82
- " <th>2</th>\n",
83
- " <td>bl-090035986141</td>\n",
84
- " <td>Eighth Brothers - Black Jack 1g Preroll</td>\n",
85
- " <td>NaN</td>\n",
86
- " <td>PREROLLS</td>\n",
87
- " <td>NaN</td>\n",
88
- " <td>NaN</td>\n",
89
- " <td>Eighth Brothers</td>\n",
90
- " <td>Preroll</td>\n",
91
- " <td>Joint</td>\n",
92
- " <td>Black Jack</td>\n",
93
- " <td>1</td>\n",
94
- " </tr>\n",
95
- " <tr>\n",
96
- " <th>3</th>\n",
97
- " <td>bl-850002822274</td>\n",
98
- " <td>GRIZZLY PEAK - Indica Bone 0.5g 7PK Prerolls</td>\n",
99
- " <td>NaN</td>\n",
100
- " <td>PREROLL PACKS</td>\n",
101
- " <td>NaN</td>\n",
102
- " <td>NaN</td>\n",
103
- " <td>GRIZZLY PEAK</td>\n",
104
- " <td>Preroll</td>\n",
105
- " <td>Joint</td>\n",
106
- " <td>NaN</td>\n",
107
- " <td>3.5</td>\n",
108
- " </tr>\n",
109
- " <tr>\n",
110
- " <th>4</th>\n",
111
- " <td>co-76GP441T</td>\n",
112
- " <td>Minntz - Emerald Cut - Indoor - Joint - 1g</td>\n",
113
- " <td>Minntz</td>\n",
114
- " <td>NaN</td>\n",
115
- " <td>Emerald Cut</td>\n",
116
- " <td>NaN</td>\n",
117
- " <td>Minntz</td>\n",
118
- " <td>Preroll</td>\n",
119
- " <td>Joint</td>\n",
120
- " <td>Emerald Cut</td>\n",
121
- " <td>1</td>\n",
122
- " </tr>\n",
123
- " </tbody>\n",
124
- "</table>\n",
125
- "</div>"
126
- ],
127
- "text/plain": [
128
- " sku product_name (pos) brand (pos) \\\n",
129
- "0 bl-842922110296 STIIIZY - Birthday Cake Pod 1g NaN \n",
130
- "1 co-6ARLLX12 SMASH Hits - Hippie Slayer - Indoor - 1g SMASH Hits \n",
131
- "2 bl-090035986141 Eighth Brothers - Black Jack 1g Preroll NaN \n",
132
- "3 bl-850002822274 GRIZZLY PEAK - Indica Bone 0.5g 7PK Prerolls NaN \n",
133
- "4 co-76GP441T Minntz - Emerald Cut - Indoor - Joint - 1g Minntz \n",
134
- "\n",
135
- " product_category (pos) strain_name (pos) product_weight_grams (pos) \\\n",
136
- "0 VAPE PENS 1G NaN 1.0 \n",
137
- "1 NaN Hippie Slayer NaN \n",
138
- "2 PREROLLS NaN NaN \n",
139
- "3 PREROLL PACKS NaN NaN \n",
140
- "4 NaN Emerald Cut NaN \n",
141
- "\n",
142
- " brand (manual review) product_category (manual review) \\\n",
143
- "0 STIIIZY Vape \n",
144
- "1 SMASH Hits Preroll \n",
145
- "2 Eighth Brothers Preroll \n",
146
- "3 GRIZZLY PEAK Preroll \n",
147
- "4 Minntz Preroll \n",
148
- "\n",
149
- " sub_product_category (manual review) strain_name (manual review) \\\n",
150
- "0 Vape Birthday Cake \n",
151
- "1 Joint Hippie Slayer \n",
152
- "2 Joint Black Jack \n",
153
- "3 Joint NaN \n",
154
- "4 Joint Emerald Cut \n",
155
- "\n",
156
- " product_weight_grams (manual review) \n",
157
- "0 1 \n",
158
- "1 1 \n",
159
- "2 1 \n",
160
- "3 3.5 \n",
161
- "4 1 "
162
- ]
163
- },
164
- "metadata": {},
165
- "output_type": "display_data"
166
- }
167
- ],
168
- "source": [
169
- "import warnings\n",
170
- "warnings.filterwarnings('ignore')\n",
171
- "\n",
172
- "import numpy as np\n",
173
- "import pandas as pd\n",
174
- "\n",
175
- "# Load the comma-delimited file into a pandas DataFrame\n",
176
- "cookies = pd.read_csv('../data/Cookies-AI-Gold-Standard - Cookies-AI-Gold-Standard.csv', sep=',')\n",
177
- "\n",
178
- "cookies.head()"
179
- ]
180
- },
181
- {
182
- "attachments": {},
183
- "cell_type": "markdown",
184
- "metadata": {},
185
- "source": [
186
- "### Data Preparation\n",
187
- "We transform the dataset into a pandas DataFrame with two columns: `prompt` and `completion`.\n",
188
- "\n",
189
- "The prompt contains the \"dirty\" columns, and completion contains the \"cleaned\" columns."
190
- ]
191
- },
192
- {
193
- "cell_type": "code",
194
- "execution_count": null,
195
- "metadata": {},
196
- "outputs": [],
197
- "source": [
198
- "from datasets import Dataset, DatasetDict\n",
199
- "from sklearn.model_selection import train_test_split\n",
200
- "\n",
201
- "# split the dataset into train and test sets (80/20); no validation set is created\n",
202
- "cookies_train, cookies_test = train_test_split(cookies, test_size=0.20, random_state=42)\n",
203
- "\n",
204
- "# list of input and output columns\n",
205
- "input_columns = ['sku','product_name (pos)','brand (pos)','product_category (pos)','strain_name (pos)','product_weight_grams (pos)']\n",
206
- "output_columns = ['brand (manual review)','product_category (manual review)','sub_product_category (manual review)','strain_name (manual review)','product_weight_grams (manual review)']\n",
207
- "\n",
208
- "# function to convert a pandas dataframe row to a comma-joined string (values are not quoted or escaped)\n",
209
- "def row_to_csv(row):\n",
210
- " csv_string = ','.join(str(value) for value in row.values)\n",
211
- " return csv_string\n",
212
- "\n",
213
- "# create dataframe with prompt and completion columns\n",
214
- "\n",
215
- "# apply row_to_csv function to each row of the training dataframe\n",
216
- "input_rows = cookies_train[input_columns ].apply(row_to_csv, axis=1)\n",
217
- "output_rows = cookies_train[output_columns].apply(row_to_csv, axis=1)\n",
218
- "\n",
219
- "# create dataframe with prompt and completion columns for training dataset\n",
220
- "prompt_df = pd.DataFrame(\n",
221
- " zip(input_rows,\n",
222
- " output_rows)\n",
223
- " , columns = ['prompt','completion'])\n",
224
- "\n",
225
- "# save dataframe to jsonl file for training\n",
226
- "prompt_df.to_json(\"../data/cookies_train.jsonl\", orient='records', lines=True)\n",
227
- "\n",
228
- "# apply row_to_csv function to each row of the test dataframe\n",
229
- "input_test_rows = cookies_test[input_columns ].apply(row_to_csv, axis=1)\n",
230
- "output_test_rows = cookies_test[output_columns].apply(row_to_csv, axis=1)\n",
231
- "\n",
232
- "# create dataframe with prompt and completion columns for test dataset\n",
233
- "test_df = pd.DataFrame(\n",
234
- " zip(input_test_rows,\n",
235
- " output_test_rows)\n",
236
- " , columns = ['prompt','completion'])\n",
237
- "test_df.head()\n",
238
- "\n",
239
- "# save dataframe to jsonl file for test\n",
240
- "test_df.to_json(\"../data/cookies_test.jsonl\", orient='records', lines=True)"
241
- ]
242
- },
243
- {
244
- "cell_type": "code",
245
- "execution_count": null,
246
- "metadata": {},
247
- "outputs": [],
248
- "source": [
249
- "import pandas as pd\n",
250
- "\n",
251
- "# sample n rows from a jsonl file (fixed random_state, so the same rows are returned on every call)\n",
252
- "def sample_jsonl(path_or_buf='../data/cookies_train.jsonl',n_samples=5): \n",
253
- " jsonObj = pd.read_json(path_or_buf=path_or_buf, lines=True)\n",
254
- " return jsonObj.sample(n_samples, random_state=42)"
255
- ]
256
- },
257
- {
258
- "cell_type": "code",
259
- "execution_count": null,
260
- "metadata": {},
261
- "outputs": [],
262
- "source": [
263
- "# append sampled prompt/completion pairs to messages as alternating user/assistant turns (mutates and returns messages)\n",
264
- "def add_samples(messages, n_samples=None):\n",
265
- " if n_samples is None:\n",
266
- " return messages\n",
267
- " samples = sample_jsonl(n_samples=n_samples)\n",
268
- " for i in range(n_samples):\n",
269
- " messages.append({\"role\": \"user\", \"content\": samples.iloc[i]['prompt']})\n",
270
- " messages.append({\"role\": \"assistant\", \"content\": samples.iloc[i]['completion']})\n",
271
- " return messages"
272
- ]
273
- }
274
- ],
275
- "metadata": {
276
- "language_info": {
277
- "name": "python"
278
- },
279
- "orig_nbformat": 4
280
- },
281
- "nbformat": 4,
282
- "nbformat_minor": 2
283
- }