Upload finetune-llama3-using-qlora-embed.ipynb
finetune-llama3-using-qlora-embed.ipynb CHANGED
@@ -900,19 +900,6 @@
     "torch.cuda.empty_cache()"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "b7522c18-31dc-4ed2-a205-0ac080c4b59b",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# from transformers import TextStreamer\n",
-    "# text_streamer = TextStreamer(tokenizer, skip_prompt = True)\n",
-    "# _ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 128,\n",
-    "#                    use_cache = True, temperature = 1.5, min_p = 0.1)"
-   ]
-  },
   {
    "cell_type": "markdown",
    "id": "841a48fe",
@@ -927,7 +914,7 @@
    "tags": []
   },
   "source": [
-    "
+    "##### Prepare the dataset \n",
    "\n",
    "We will use 10K rows from the `ultrachat_200k` database."
   ]
@@ -1343,10 +1330,67 @@
     "task_input = format_task_input(task_type, task_context)\n",
     "\n",
     "messages = [{\"role\": \"user\", \"content\": task_input},]\n",
-    "inputs = tokenizer.apply_chat_template(messages, tokenize=False,
+    "inputs = tokenizer.apply_chat_template(messages, tokenize=False, \n",
+    "                                        add_generation_prompt=True)\n",
     "generate_response(inputs)\n"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "1962fbcc-5d11-4cc3-9abe-0d78d8d780e8",
+   "metadata": {},
+   "source": [
+    "#### Test Sample-3 Streaming"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "a9183096-5882-4595-b872-911a51557703",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2024-09-21T10:54:14.949160Z",
+     "iopub.status.busy": "2024-09-21T10:54:14.948971Z",
+     "iopub.status.idle": "2024-09-21T10:54:19.811260Z",
+     "shell.execute_reply": "2024-09-21T10:54:19.810881Z",
+     "shell.execute_reply.started": "2024-09-21T10:54:14.949147Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<|tasktype|>\n",
+      "extractive question answering\n",
+      "<|taskinput|>\n",
+      "{{context}}\n",
+      "\n",
+      "Q: How would you decide whether to keep the same set of chat tokens when training further models? \n",
+      "\n",
+      "Context:\n",
+      "When setting the template for a model that’s already been trained for chat, you should ensure that the template exactly matches the message formatting that the model saw during training, or else you will probably experience performance degradation. This is true even if you’re training the model further - you will probably get the best performance if you keep the chat tokens constant. \n",
+      "\n",
+      "What is the reason behind the statement that the model will probably experience performance degradation?\n",
+      "<|taskoutput|>\n",
+      "Tokenization\n",
+      "<|eot_id|>\n"
+     ]
+    }
+   ],
+   "source": [
+    "from transformers import TextStreamer\n",
+    "text_streamer = TextStreamer(tokenizer, skip_prompt = True)\n",
+    "\n",
+    "prompt = tokenizer(inputs, return_tensors=\"pt\").to('cuda')\n",
+    "\n",
+    "_ = model.generate(input_ids = prompt['input_ids'], \n",
+    "                   streamer = text_streamer, \n",
+    "                   max_new_tokens = 128,\n",
+    "                   use_cache = True, temperature = 1.5, min_p = 0.1, \n",
+    "                   eos_token_id=terminators)\n"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,