Delete app/data_prep.ipynb
app/data_prep.ipynb  +0 -283
app/data_prep.ipynb
DELETED
@@ -1,283 +0,0 @@
-{
-"cells": [
-{
-"attachments": {},
-"cell_type": "markdown",
-"metadata": {},
-"source": [
-"## This notebook shows how to load CSV data and convert it into JSONL format for the LLM data cleaner.\n",
-"\n",
-"First, we load the data."
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"outputs": [
-{
-"data": {
-"text/html": [
-"<div>\n",
-"<style scoped>\n",
-" .dataframe tbody tr th:only-of-type {\n",
-" vertical-align: middle;\n",
-" }\n",
-"\n",
-" .dataframe tbody tr th {\n",
-" vertical-align: top;\n",
-" }\n",
-"\n",
-" .dataframe thead th {\n",
-" text-align: right;\n",
-" }\n",
-"</style>\n",
-"<table border=\"1\" class=\"dataframe\">\n",
-" <thead>\n",
-" <tr style=\"text-align: right;\">\n",
-" <th></th>\n",
-" <th>sku</th>\n",
-" <th>product_name (pos)</th>\n",
-" <th>brand (pos)</th>\n",
-" <th>product_category (pos)</th>\n",
-" <th>strain_name (pos)</th>\n",
-" <th>product_weight_grams (pos)</th>\n",
-" <th>brand (manual review)</th>\n",
-" <th>product_category (manual review)</th>\n",
-" <th>sub_product_category (manual review)</th>\n",
-" <th>strain_name (manual review)</th>\n",
-" <th>product_weight_grams (manual review)</th>\n",
-" </tr>\n",
-" </thead>\n",
-" <tbody>\n",
-" <tr>\n",
-" <th>0</th>\n",
-" <td>bl-842922110296</td>\n",
-" <td>STIIIZY - Birthday Cake Pod 1g</td>\n",
-" <td>NaN</td>\n",
-" <td>VAPE PENS 1G</td>\n",
-" <td>NaN</td>\n",
-" <td>1.0</td>\n",
-" <td>STIIIZY</td>\n",
-" <td>Vape</td>\n",
-" <td>Vape</td>\n",
-" <td>Birthday Cake</td>\n",
-" <td>1</td>\n",
-" </tr>\n",
-" <tr>\n",
-" <th>1</th>\n",
-" <td>co-6ARLLX12</td>\n",
-" <td>SMASH Hits - Hippie Slayer - Indoor - 1g</td>\n",
-" <td>SMASH Hits</td>\n",
-" <td>NaN</td>\n",
-" <td>Hippie Slayer</td>\n",
-" <td>NaN</td>\n",
-" <td>SMASH Hits</td>\n",
-" <td>Preroll</td>\n",
-" <td>Joint</td>\n",
-" <td>Hippie Slayer</td>\n",
-" <td>1</td>\n",
-" </tr>\n",
-" <tr>\n",
-" <th>2</th>\n",
-" <td>bl-090035986141</td>\n",
-" <td>Eighth Brothers - Black Jack 1g Preroll</td>\n",
-" <td>NaN</td>\n",
-" <td>PREROLLS</td>\n",
-" <td>NaN</td>\n",
-" <td>NaN</td>\n",
-" <td>Eighth Brothers</td>\n",
-" <td>Preroll</td>\n",
-" <td>Joint</td>\n",
-" <td>Black Jack</td>\n",
-" <td>1</td>\n",
-" </tr>\n",
-" <tr>\n",
-" <th>3</th>\n",
-" <td>bl-850002822274</td>\n",
-" <td>GRIZZLY PEAK - Indica Bone 0.5g 7PK Prerolls</td>\n",
-" <td>NaN</td>\n",
-" <td>PREROLL PACKS</td>\n",
-" <td>NaN</td>\n",
-" <td>NaN</td>\n",
-" <td>GRIZZLY PEAK</td>\n",
-" <td>Preroll</td>\n",
-" <td>Joint</td>\n",
-" <td>NaN</td>\n",
-" <td>3.5</td>\n",
-" </tr>\n",
-" <tr>\n",
-" <th>4</th>\n",
-" <td>co-76GP441T</td>\n",
-" <td>Minntz - Emerald Cut - Indoor - Joint - 1g</td>\n",
-" <td>Minntz</td>\n",
-" <td>NaN</td>\n",
-" <td>Emerald Cut</td>\n",
-" <td>NaN</td>\n",
-" <td>Minntz</td>\n",
-" <td>Preroll</td>\n",
-" <td>Joint</td>\n",
-" <td>Emerald Cut</td>\n",
-" <td>1</td>\n",
-" </tr>\n",
-" </tbody>\n",
-"</table>\n",
-"</div>"
-],
-"text/plain": [
-" sku product_name (pos) brand (pos) \\\n",
-"0 bl-842922110296 STIIIZY - Birthday Cake Pod 1g NaN \n",
-"1 co-6ARLLX12 SMASH Hits - Hippie Slayer - Indoor - 1g SMASH Hits \n",
-"2 bl-090035986141 Eighth Brothers - Black Jack 1g Preroll NaN \n",
-"3 bl-850002822274 GRIZZLY PEAK - Indica Bone 0.5g 7PK Prerolls NaN \n",
-"4 co-76GP441T Minntz - Emerald Cut - Indoor - Joint - 1g Minntz \n",
-"\n",
-" product_category (pos) strain_name (pos) product_weight_grams (pos) \\\n",
-"0 VAPE PENS 1G NaN 1.0 \n",
-"1 NaN Hippie Slayer NaN \n",
-"2 PREROLLS NaN NaN \n",
-"3 PREROLL PACKS NaN NaN \n",
-"4 NaN Emerald Cut NaN \n",
-"\n",
-" brand (manual review) product_category (manual review) \\\n",
-"0 STIIIZY Vape \n",
-"1 SMASH Hits Preroll \n",
-"2 Eighth Brothers Preroll \n",
-"3 GRIZZLY PEAK Preroll \n",
-"4 Minntz Preroll \n",
-"\n",
-" sub_product_category (manual review) strain_name (manual review) \\\n",
-"0 Vape Birthday Cake \n",
-"1 Joint Hippie Slayer \n",
-"2 Joint Black Jack \n",
-"3 Joint NaN \n",
-"4 Joint Emerald Cut \n",
-"\n",
-" product_weight_grams (manual review) \n",
-"0 1 \n",
-"1 1 \n",
-"2 1 \n",
-"3 3.5 \n",
-"4 1 "
-]
-},
-"metadata": {},
-"output_type": "display_data"
-}
-],
-"source": [
-"import warnings\n",
-"warnings.filterwarnings('ignore')\n",
-"\n",
-"import numpy as np\n",
-"import pandas as pd\n",
-"\n",
-"# Load the comma-delimited CSV file into a pandas dataframe\n",
-"cookies = pd.read_csv('../data/Cookies-AI-Gold-Standard - Cookies-AI-Gold-Standard.csv', sep=',')\n",
-"\n",
-"cookies.head()"
-]
-},
-{
-"attachments": {},
-"cell_type": "markdown",
-"metadata": {},
-"source": [
-"### Data Preparation\n",
-"We transform the dataset into a pandas dataframe with a prompt column and a completion column.\n",
-"\n",
-"The prompt contains the \"dirty\" columns, and the completion contains the \"cleaned\" columns."
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"outputs": [],
-"source": [
-"from datasets import Dataset, DatasetDict\n",
-"from sklearn.model_selection import train_test_split\n",
-"\n",
-"# split the dataset into train and test datasets 80/20\n",
-"cookies_train, cookies_test = train_test_split(cookies, test_size=0.20, random_state=42)\n",
-"\n",
-"# list of input and output columns\n",
-"input_columns = ['sku','product_name (pos)','brand (pos)','product_category (pos)','strain_name (pos)','product_weight_grams (pos)']\n",
-"output_columns = ['brand (manual review)','product_category (manual review)','sub_product_category (manual review)','strain_name (manual review)','product_weight_grams (manual review)']\n",
-"\n",
-"# function to convert a pandas dataframe row to a csv string\n",
-"def row_to_csv(row):\n",
-" csv_string = ','.join(str(value) for value in row.values)\n",
-" return csv_string\n",
-"\n",
-"# create dataframe with prompt and completion columns\n",
-"\n",
-"# apply row_to_csv function to each row of the training dataframe\n",
-"input_rows = cookies_train[input_columns ].apply(row_to_csv, axis=1)\n",
-"output_rows = cookies_train[output_columns].apply(row_to_csv, axis=1)\n",
-"\n",
-"# create dataframe with prompt and completion columns for training dataset\n",
-"prompt_df = pd.DataFrame(\n",
-" zip(input_rows,\n",
-" output_rows)\n",
-" , columns = ['prompt','completion'])\n",
-"\n",
-"# save dataframe to jsonl file for training\n",
-"prompt_df.to_json(\"../data/cookies_train.jsonl\", orient='records', lines=True)\n",
-"\n",
-"# apply row_to_csv function to each row of the test dataframe\n",
-"input_test_rows = cookies_test[input_columns ].apply(row_to_csv, axis=1)\n",
-"output_test_rows = cookies_test[output_columns].apply(row_to_csv, axis=1)\n",
-"\n",
-"# create dataframe with prompt and completion columns for test dataset\n",
-"test_df = pd.DataFrame(\n",
-" zip(input_test_rows,\n",
-" output_test_rows)\n",
-" , columns = ['prompt','completion'])\n",
-"test_df.head()\n",
-"\n",
-"# save dataframe to jsonl file for test\n",
-"test_df.to_json(\"../data/cookies_test.jsonl\", orient='records', lines=True)"
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"outputs": [],
-"source": [
-"import pandas as pd\n",
-"\n",
-"# write a function that samples n rows from a jsonl file\n",
-"def sample_jsonl(path_or_buf='../data/cookies_train.jsonl', n_samples=5):\n",
-" jsonObj = pd.read_json(path_or_buf=path_or_buf, lines=True)\n",
-" return jsonObj.sample(n_samples, random_state=42)"
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"outputs": [],
-"source": [
-"# write a function that adds prompt and completion samples to messages\n",
-"def add_samples(messages, n_samples=None):\n",
-" if n_samples is None:\n",
-" return messages\n",
-" samples = sample_jsonl(n_samples=n_samples)\n",
-" for i in range(n_samples):\n",
-" messages.append({\"role\": \"user\", \"content\": samples.iloc[i]['prompt']})\n",
-" messages.append({\"role\": \"assistant\", \"content\": samples.iloc[i]['completion']})\n",
-" return messages"
-]
-}
-],
-"metadata": {
-"language_info": {
-"name": "python"
-},
-"orig_nbformat": 4
-},
-"nbformat": 4,
-"nbformat_minor": 2
-}
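
For context, below is a minimal sketch of how the deleted notebook's few-shot helpers could be wired into a chat prompt. The sample_jsonl and add_samples definitions are copied from the notebook and it assumes ../data/cookies_train.jsonl has been generated as above; the system prompt wording and the choice of three examples are illustrative assumptions, and the "dirty" row is the first row of the dataframe shown above (NaN fields become the string 'nan' via row_to_csv).

import pandas as pd

# Few-shot helpers as defined in the deleted notebook: sample rows from the
# training JSONL and append them to a chat message list as prompt/completion pairs.
def sample_jsonl(path_or_buf='../data/cookies_train.jsonl', n_samples=5):
    jsonObj = pd.read_json(path_or_buf=path_or_buf, lines=True)
    return jsonObj.sample(n_samples, random_state=42)

def add_samples(messages, n_samples=None):
    if n_samples is None:
        return messages
    samples = sample_jsonl(n_samples=n_samples)
    for i in range(n_samples):
        messages.append({"role": "user", "content": samples.iloc[i]['prompt']})
        messages.append({"role": "assistant", "content": samples.iloc[i]['completion']})
    return messages

# Illustrative assembly of a chat prompt (system prompt text is an assumption,
# not taken from the notebook).
messages = [{
    "role": "system",
    "content": "Clean the product row. Return brand, product category, "
               "sub-category, strain name and weight in grams as CSV.",
}]
messages = add_samples(messages, n_samples=3)  # prepend three labelled examples

# New "dirty" row to clean, in the CSV layout produced by row_to_csv
# (first row of the dataframe above; NaN fields become the string 'nan').
messages.append({
    "role": "user",
    "content": "bl-842922110296,STIIIZY - Birthday Cake Pod 1g,nan,VAPE PENS 1G,nan,1.0",
})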