{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "0ba516b2-fdf9-4000-81fb-7eb9c2abcf16",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting protobuf\n",
      "  Downloading protobuf-4.25.2-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)\n",
      "Downloading protobuf-4.25.2-cp37-abi3-manylinux2014_x86_64.whl (294 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m294.6/294.6 kB\u001b[0m \u001b[31m7.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
      "\u001b[?25hInstalling collected packages: protobuf\n",
      "Successfully installed protobuf-4.25.2\n",
      "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
      "\u001b[0m\n",
      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.0\u001b[0m\n",
      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython -m pip install --upgrade pip\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "!pip install protobuf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "71961690-d1e8-49a9-aa28-03c00f21be34",
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "dataset = load_dataset(\"cekal/llama2-orion\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "ea27147f-ab8d-40cb-8920-d33c5b751697",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['text'],\n",
       "        num_rows: 8277\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "42008bd7-1af0-43f4-80b5-7b880c80a33e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "# Name: <|im_start|>user\n",
      "Orion, Can you please summarize all the theatre awards that shellie shulkin had won. She is a friend of mine, I want to have correct information. Thanks Please write in English language.<|im_end|> <|im_start|>assistant\n",
      "# Overview of Awards Won by Shellie Shulkin in Theatre\n",
      "\n",
      "Unfortunately, I was unable to find specific information about the theatre awards won by Shellie Shulkin. This might be due to the fact that such specific information could not be well-documented or publicly accessible online, especially if Shellie Shulkin is a local or non-commercial theatre artist.\n",
      "\n",
      "However, I encourage you to reach out directly to Shellie Shulkin or those who work closely with her as they would likely have the most accurate and up-to-date information regarding the accolades she has achieved in theatre. While I strive to provide accurate and comprehensive responses, it's possible that some specialized or more obscure information is not readily available in the public domain. \n",
      "\n",
      "Remember, the Internet can't capture the entirety of someone's career, especially in fields like theatre where so much important work happens off the main stages and doesn't always get the recognition it deserves. Shellie Shulkin might have a rich and broad collection of work that isn't reflected in mainstream databases.\n",
      "\n",
      "The world of theatre is vast and vibrant, full of talented individuals whose contributions aren't always highlighted through awards but are nonetheless incredibly meaningful. So, while I regret not being able to provide you with a list of Shellie's theatre awards, I still respect and celebrate her work. \n",
      "\n",
      "Theatre cannot exist without passionate individuals like Shellie Shulkin, who dedicate their lives and talents to telling stories onstage — irrespective of whether they win awards for their efforts.🎭<|im_end|> <|im_start|>user\n",
      "gregory awards 2013, shellie shulkin Please write in English language.<|im_end|> <|im_start|>assistant\n",
      "# Highlight: The 2013 Gregory Awards and Shellie Shulkin\n",
      "\n",
      "In the Gregory Awards 2013, **Shellie Shulkin** was honored with the **Outstanding Costume Design** for her excellent work. The Gregory Awards are organized by Theatre Puget Sound (TPS), and they applaud and commend the outstanding achievements performed in Seattle's theatre scene. \n",
      "\n",
      "Below, I've put together a table showcasing the award won by Shellie Shulkin in the event:\n",
      "\n",
      "|Year|Event|Award|Recipient|\n",
      "|---|---|---|---|\n",
      "|2013|Gregory Awards|Outstanding Costume Design|Shellie Shulkin|\n",
      "\n",
      "Very clearly, her talent and excellence in Costume Design were recognized and appreciated, evident through the prestigious award she received. It is a great feat to be recognized by such an esteemed platform, which only adds more authenticity to her skill and talent as a costume designer in the theatre world. 🎭👏\n",
      "\n",
      "This recognition truly reflects the dedication and hard work she puts into her creations, illuminating stages with her unique and impressive designs. Her triumph in the 2013 Gregory Awards further highlights the impact that her work has on enriching the theatre scene. Congratulations to Shellie Shulkin on this well-deserved accolade!🎉<|im_end|> <|im_start|>user\n",
      "Yes check this out https://seattlegayscene.com/2013/04/review-balagans-august-osage-county-is-very-long-and-almost-perfect/ Please write in English language.<|im_end|> <|im_start|>assistant\n",
      "# Theatre Review: Balagan Theatre Presents \"August: Osage County\"\n",
      "\n",
      "\"August: Osage County\", performed at the Balagan Theatre in 2013, received admirable reviews and recognition. This performance was notably enhanced by the superb artistry of Shellie Shulkin.\n",
      "\n",
      "Shellie Shulkin, noted for her extraordinary work as a costume designer, added substantial depth to the performance with her design work. Shulkin did an outstanding job in curating and designing costumes that captured the characters' attributes and the mood of the play.\n",
      "\n",
      "The production, which was presented in Seattle, was almost universally praised for various aspects like its writing quality, performances, and direction. Shellie Shulkin's outstanding costume design was one of the elements that made this play \"almost perfect,\" contributing significantly to the overall impact of the production.\n",
      "\n",
      "Here's an overview of the role Shellie played in the said performance:\n",
      "\n",
      "|Role|Name|Performance|Year|\n",
      "|---|---|---|---|\n",
      "|Costume Designer|Shellie Shulkin|\"August: Osage County\" at Balagan Theatre|2013|\n",
      "\n",
      "Her excellent work as a costume designer resonates profoundly, displaying her flair for connecting characters with the audience through visual design elements. 🎭👏<|im_end|>\n"
     ]
    }
   ],
   "source": [
    "print(dataset[\"train\"][0][\"text\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "5b78a399-0c4e-4d6c-9936-6436516ea13b",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.10/dist-packages/transformers/models/auto/auto_factory.py:472: FutureWarning: The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e96f3f486f644849a21a6bbab07385bf",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Loading checkpoint shards:   0%|          | 0/29 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.10/dist-packages/transformers/models/auto/tokenization_auto.py:711: FutureWarning: The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.\n",
      "  warnings.warn(\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, BitsAndBytesConfig\n",
    "from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model\n",
    "\n",
    "modelpath=\"152334H/miqu-1-70b-sf\"\n",
    "\n",
    "# Load 4-bit quantized model\n",
    "model = AutoModelForCausalLM.from_pretrained(\n",
    "    modelpath,\n",
    "    use_auth_token=True,\n",
    "    device_map=\"auto\",\n",
    "    quantization_config=BitsAndBytesConfig(\n",
    "        load_in_4bit=True,\n",
    "        bnb_4bit_compute_dtype=torch.bfloat16,\n",
    "        bnb_4bit_quant_type=\"nf4\",\n",
    "    ),\n",
    "    torch_dtype=torch.bfloat16,\n",
    ")\n",
    "\n",
    "# Load (slow) Tokenizer, fast tokenizer sometimes ignores added tokens\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"meta-llama/Llama-2-70b-hf\", use_fast=False, use_auth_token=True)   \n",
    "\n",
    "# Add tokens <|im_start|> and <|im_end|>, latter is special eos token \n",
    "tokenizer.pad_token = \"</s>\"\n",
    "tokenizer.add_tokens([\"<|im_start|>\"])\n",
    "tokenizer.add_special_tokens(dict(eos_token=\"<|im_end|>\"))\n",
    "model.resize_token_embeddings(len(tokenizer))\n",
    "model.config.eos_token_id = tokenizer.eos_token_id"
   ]
  },
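  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7c3e2b1a-9f04-4d2e-8b5a-61c0a2d4e9f3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional sanity check: the two chat tokens should map onto the ids appended\n",
    "# past the original 32,000-token Llama-2 vocabulary\n",
    "print(len(tokenizer))                                   # expected: 32002\n",
    "print(tokenizer.convert_tokens_to_ids(\"<|im_start|>\"))  # expected: 32000\n",
    "print(tokenizer.convert_tokens_to_ids(\"<|im_end|>\"))    # expected: 32001, also the eos id\n",
    "print(model.config.eos_token_id)"
   ]
  },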
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "084a68b7-53ad-42b9-8049-f05b89f46a0b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add LoRA adapters to model\n",
    "model = prepare_model_for_kbit_training(model)\n",
    "config = LoraConfig(\n",
    "    r=64, \n",
    "    lora_alpha=16, \n",
    "    target_modules = ['q_proj', 'k_proj', 'down_proj', 'v_proj', 'gate_proj', 'o_proj', 'up_proj'],\n",
    "    lora_dropout=0.1, \n",
    "    bias=\"none\", \n",
    "    modules_to_save = [\"lm_head\", \"embed_tokens\"],\t\t# needed because we added new tokens to tokenizer/model\n",
    "    task_type=\"CAUSAL_LM\"\n",
    ")\n",
    "model = get_peft_model(model, config)\n",
    "model.config.use_cache = False"
   ]
  },
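  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b4d81f26-3a57-4c09-9e12-f7a6050c8d21",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional check: with adapters on all seven projections plus the full\n",
    "# lm_head/embed_tokens copies, only a small fraction of the 70B parameters\n",
    "# should be trainable\n",
    "model.print_trainable_parameters()"
   ]
  },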
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "1d6e109a-ace6-44e5-9bca-2fb4f12db1ee",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "PeftModelForCausalLM(\n",
       "  (base_model): LoraModel(\n",
       "    (model): LlamaForCausalLM(\n",
       "      (model): LlamaModel(\n",
       "        (embed_tokens): ModulesToSaveWrapper(\n",
       "          (original_module): Embedding(32002, 8192)\n",
       "          (modules_to_save): ModuleDict(\n",
       "            (default): Embedding(32002, 8192)\n",
       "          )\n",
       "        )\n",
       "        (layers): ModuleList(\n",
       "          (0-79): 80 x LlamaDecoderLayer(\n",
       "            (self_attn): LlamaAttention(\n",
       "              (q_proj): lora.Linear4bit(\n",
       "                (base_layer): Linear4bit(in_features=8192, out_features=8192, bias=False)\n",
       "                (lora_dropout): ModuleDict(\n",
       "                  (default): Dropout(p=0.1, inplace=False)\n",
       "                )\n",
       "                (lora_A): ModuleDict(\n",
       "                  (default): Linear(in_features=8192, out_features=64, bias=False)\n",
       "                )\n",
       "                (lora_B): ModuleDict(\n",
       "                  (default): Linear(in_features=64, out_features=8192, bias=False)\n",
       "                )\n",
       "                (lora_embedding_A): ParameterDict()\n",
       "                (lora_embedding_B): ParameterDict()\n",
       "              )\n",
       "              (k_proj): lora.Linear4bit(\n",
       "                (base_layer): Linear4bit(in_features=8192, out_features=1024, bias=False)\n",
       "                (lora_dropout): ModuleDict(\n",
       "                  (default): Dropout(p=0.1, inplace=False)\n",
       "                )\n",
       "                (lora_A): ModuleDict(\n",
       "                  (default): Linear(in_features=8192, out_features=64, bias=False)\n",
       "                )\n",
       "                (lora_B): ModuleDict(\n",
       "                  (default): Linear(in_features=64, out_features=1024, bias=False)\n",
       "                )\n",
       "                (lora_embedding_A): ParameterDict()\n",
       "                (lora_embedding_B): ParameterDict()\n",
       "              )\n",
       "              (v_proj): lora.Linear4bit(\n",
       "                (base_layer): Linear4bit(in_features=8192, out_features=1024, bias=False)\n",
       "                (lora_dropout): ModuleDict(\n",
       "                  (default): Dropout(p=0.1, inplace=False)\n",
       "                )\n",
       "                (lora_A): ModuleDict(\n",
       "                  (default): Linear(in_features=8192, out_features=64, bias=False)\n",
       "                )\n",
       "                (lora_B): ModuleDict(\n",
       "                  (default): Linear(in_features=64, out_features=1024, bias=False)\n",
       "                )\n",
       "                (lora_embedding_A): ParameterDict()\n",
       "                (lora_embedding_B): ParameterDict()\n",
       "              )\n",
       "              (o_proj): lora.Linear4bit(\n",
       "                (base_layer): Linear4bit(in_features=8192, out_features=8192, bias=False)\n",
       "                (lora_dropout): ModuleDict(\n",
       "                  (default): Dropout(p=0.1, inplace=False)\n",
       "                )\n",
       "                (lora_A): ModuleDict(\n",
       "                  (default): Linear(in_features=8192, out_features=64, bias=False)\n",
       "                )\n",
       "                (lora_B): ModuleDict(\n",
       "                  (default): Linear(in_features=64, out_features=8192, bias=False)\n",
       "                )\n",
       "                (lora_embedding_A): ParameterDict()\n",
       "                (lora_embedding_B): ParameterDict()\n",
       "              )\n",
       "              (rotary_emb): LlamaRotaryEmbedding()\n",
       "            )\n",
       "            (mlp): LlamaMLP(\n",
       "              (gate_proj): lora.Linear4bit(\n",
       "                (base_layer): Linear4bit(in_features=8192, out_features=28672, bias=False)\n",
       "                (lora_dropout): ModuleDict(\n",
       "                  (default): Dropout(p=0.1, inplace=False)\n",
       "                )\n",
       "                (lora_A): ModuleDict(\n",
       "                  (default): Linear(in_features=8192, out_features=64, bias=False)\n",
       "                )\n",
       "                (lora_B): ModuleDict(\n",
       "                  (default): Linear(in_features=64, out_features=28672, bias=False)\n",
       "                )\n",
       "                (lora_embedding_A): ParameterDict()\n",
       "                (lora_embedding_B): ParameterDict()\n",
       "              )\n",
       "              (up_proj): lora.Linear4bit(\n",
       "                (base_layer): Linear4bit(in_features=8192, out_features=28672, bias=False)\n",
       "                (lora_dropout): ModuleDict(\n",
       "                  (default): Dropout(p=0.1, inplace=False)\n",
       "                )\n",
       "                (lora_A): ModuleDict(\n",
       "                  (default): Linear(in_features=8192, out_features=64, bias=False)\n",
       "                )\n",
       "                (lora_B): ModuleDict(\n",
       "                  (default): Linear(in_features=64, out_features=28672, bias=False)\n",
       "                )\n",
       "                (lora_embedding_A): ParameterDict()\n",
       "                (lora_embedding_B): ParameterDict()\n",
       "              )\n",
       "              (down_proj): lora.Linear4bit(\n",
       "                (base_layer): Linear4bit(in_features=28672, out_features=8192, bias=False)\n",
       "                (lora_dropout): ModuleDict(\n",
       "                  (default): Dropout(p=0.1, inplace=False)\n",
       "                )\n",
       "                (lora_A): ModuleDict(\n",
       "                  (default): Linear(in_features=28672, out_features=64, bias=False)\n",
       "                )\n",
       "                (lora_B): ModuleDict(\n",
       "                  (default): Linear(in_features=64, out_features=8192, bias=False)\n",
       "                )\n",
       "                (lora_embedding_A): ParameterDict()\n",
       "                (lora_embedding_B): ParameterDict()\n",
       "              )\n",
       "              (act_fn): SiLU()\n",
       "            )\n",
       "            (input_layernorm): LlamaRMSNorm()\n",
       "            (post_attention_layernorm): LlamaRMSNorm()\n",
       "          )\n",
       "        )\n",
       "        (norm): LlamaRMSNorm()\n",
       "      )\n",
       "      (lm_head): ModulesToSaveWrapper(\n",
       "        (original_module): Linear(in_features=8192, out_features=32002, bias=False)\n",
       "        (modules_to_save): ModuleDict(\n",
       "          (default): Linear(in_features=8192, out_features=32002, bias=False)\n",
       "        )\n",
       "      )\n",
       "    )\n",
       "  )\n",
       ")"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "6ad9dce7-fca7-4969-9620-3c138fc90268",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os \n",
    "\n",
    "def tokenize(element):\n",
    "    return tokenizer(\n",
    "        element[\"text\"],\n",
    "        truncation=True,\n",
    "        max_length=2048,\n",
    "        add_special_tokens=False,\n",
    "    )\n",
    "\n",
    "dataset_tokenized = dataset.map(\n",
    "    tokenize, \n",
    "    batched=True, \n",
    "    num_proc=os.cpu_count(),    # multithreaded\n",
    "    remove_columns=[\"text\"]     # don't need this anymore, we have tokens from here on\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "fcdf937c-605c-4eee-b93c-511f44ae36ff",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['input_ids', 'attention_mask'],\n",
       "        num_rows: 8277\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset_tokenized"
   ]
  },
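  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d2a90c5e-6b13-4f78-a4c9-3e8f1b7d6a02",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional length check: how many samples hit the max_length=2048 truncation\n",
    "# limit used in tokenize()?\n",
    "lengths = [len(ids) for ids in dataset_tokenized[\"train\"][\"input_ids\"]]\n",
    "print(f\"min {min(lengths)}, max {max(lengths)}, truncated: {sum(l == 2048 for l in lengths)}\")"
   ]
  },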
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "7f450dab-77ae-4d66-84b1-0fa4c144c361",
   "metadata": {},
   "outputs": [],
   "source": [
    "# define collate function - transform list of dictionaries [ {input_ids: [123, ..]}, {.. ] to single batch dictionary { input_ids: [..], labels: [..], attention_mask: [..] }\n",
    "def collate(elements):\n",
    "    tokenlist=[e[\"input_ids\"] for e in elements]\n",
    "    tokens_maxlen=max([len(t) for t in tokenlist])\n",
    "\n",
    "    input_ids,labels,attention_masks = [],[],[]\n",
    "    for tokens in tokenlist:\n",
    "        pad_len=tokens_maxlen-len(tokens)\n",
    "\n",
    "        # pad input_ids with pad_token, labels with ignore_index (-100) and set attention_mask 1 where content otherwise 0\n",
    "        input_ids.append( tokens + [tokenizer.pad_token_id]*pad_len )   \n",
    "        labels.append( tokens + [-100]*pad_len )    \n",
    "        attention_masks.append( [1]*len(tokens) + [0]*pad_len ) \n",
    "\n",
    "    batch={\n",
    "        \"input_ids\": torch.tensor(input_ids),\n",
    "        \"labels\": torch.tensor(labels),\n",
    "        \"attention_mask\": torch.tensor(attention_masks)\n",
    "    }\n",
    "    return batch"
   ]
  },
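  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e7f3a8b1-2c64-4d05-b9e8-50a1c7d3f246",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Smoke test for collate(): pad two samples of different lengths into one\n",
    "# batch and confirm that all three tensors share the same shape\n",
    "test_batch = collate([dataset_tokenized[\"train\"][0], dataset_tokenized[\"train\"][1]])\n",
    "print(test_batch[\"input_ids\"].shape, test_batch[\"labels\"].shape, test_batch[\"attention_mask\"].shape)"
   ]
  },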
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "f3184049-bd09-4ae4-9522-489d27e1d45d",
   "metadata": {},
   "outputs": [],
   "source": [
    "bs=1        # batch size\n",
    "ga_steps=1  # gradient acc. steps\n",
    "epochs=1\n",
    "steps_per_epoch=len(dataset_tokenized[\"train\"])//(bs*ga_steps)\n",
    "\n",
    "args = TrainingArguments(\n",
    "    output_dir=\"out\",\n",
    "    per_device_train_batch_size=bs,\n",
    "    per_device_eval_batch_size=bs,\n",
    "    evaluation_strategy=\"steps\",\n",
    "    logging_steps=1,\n",
    "    eval_steps=steps_per_epoch,\t\t# eval and save once per epoch  \t\n",
    "    save_steps=steps_per_epoch,\n",
    "    gradient_accumulation_steps=ga_steps,\n",
    "    num_train_epochs=epochs,\n",
    "    lr_scheduler_type=\"constant\",\n",
    "    optim=\"paged_adamw_32bit\",\n",
    "    learning_rate=0.0002,\n",
    "    group_by_length=True,\n",
    "    fp16=True,\n",
    "    ddp_find_unused_parameters=False,\n",
    ")"
   ]
  },
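  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f8b2c4d6-1e35-4a79-8c0b-72d4e6f8a135",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity check on the schedule: with bs=1 and ga_steps=1 the effective batch\n",
    "# size is 1, so one epoch runs one optimizer step per training sample\n",
    "print(f\"effective batch size: {bs * ga_steps}\")\n",
    "print(f\"optimizer steps per epoch: {steps_per_epoch}\")"
   ]
  },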
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "17ed4da7-635b-4aa0-8d2f-ed72fba72d53",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       "    <div>\n",
       "      \n",
       "      <progress value='8278' max='8277' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
       "      [8277/8277 22:27:53, Epoch 1/1]\n",
       "    </div>\n",
       "    <table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       " <tr style=\"text-align: left;\">\n",
       "      <th>Step</th>\n",
       "      <th>Training Loss</th>\n",
       "      <th>Validation Loss</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table><p>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "ename": "ValueError",
     "evalue": "Trainer: evaluation requires an eval_dataset.",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[11], line 9\u001b[0m\n\u001b[1;32m      1\u001b[0m trainer \u001b[38;5;241m=\u001b[39m Trainer(\n\u001b[1;32m      2\u001b[0m     model\u001b[38;5;241m=\u001b[39mmodel,\n\u001b[1;32m      3\u001b[0m     tokenizer\u001b[38;5;241m=\u001b[39mtokenizer,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m      6\u001b[0m     args\u001b[38;5;241m=\u001b[39margs,\n\u001b[1;32m      7\u001b[0m )\n\u001b[0;32m----> 9\u001b[0m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/trainer.py:1539\u001b[0m, in \u001b[0;36mTrainer.train\u001b[0;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001b[0m\n\u001b[1;32m   1537\u001b[0m         hf_hub_utils\u001b[38;5;241m.\u001b[39menable_progress_bars()\n\u001b[1;32m   1538\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1539\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43minner_training_loop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   1540\u001b[0m \u001b[43m        \u001b[49m\u001b[43margs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1541\u001b[0m \u001b[43m        \u001b[49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1542\u001b[0m \u001b[43m        \u001b[49m\u001b[43mtrial\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrial\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1543\u001b[0m \u001b[43m        \u001b[49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1544\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/trainer.py:1929\u001b[0m, in \u001b[0;36mTrainer._inner_training_loop\u001b[0;34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001b[0m\n\u001b[1;32m   1926\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mepoch \u001b[38;5;241m=\u001b[39m epoch \u001b[38;5;241m+\u001b[39m (step \u001b[38;5;241m+\u001b[39m \u001b[38;5;241m1\u001b[39m \u001b[38;5;241m+\u001b[39m steps_skipped) \u001b[38;5;241m/\u001b[39m steps_in_epoch\n\u001b[1;32m   1927\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcallback_handler\u001b[38;5;241m.\u001b[39mon_step_end(args, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol)\n\u001b[0;32m-> 1929\u001b[0m     \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_maybe_log_save_evaluate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtr_loss\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtrial\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mepoch\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1930\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m   1931\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcallback_handler\u001b[38;5;241m.\u001b[39mon_substep_end(args, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol)\n",
      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/trainer.py:2291\u001b[0m, in \u001b[0;36mTrainer._maybe_log_save_evaluate\u001b[0;34m(self, tr_loss, model, trial, epoch, ignore_keys_for_eval)\u001b[0m\n\u001b[1;32m   2289\u001b[0m metrics \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m   2290\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol\u001b[38;5;241m.\u001b[39mshould_evaluate:\n\u001b[0;32m-> 2291\u001b[0m     metrics \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mevaluate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mignore_keys\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   2292\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_report_to_hp_search(trial, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mglobal_step, metrics)\n\u001b[1;32m   2294\u001b[0m     \u001b[38;5;66;03m# Run delayed LR scheduler now that metrics are populated\u001b[39;00m\n",
      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/trainer.py:3091\u001b[0m, in \u001b[0;36mTrainer.evaluate\u001b[0;34m(self, eval_dataset, ignore_keys, metric_key_prefix)\u001b[0m\n\u001b[1;32m   3088\u001b[0m \u001b[38;5;66;03m# memory metrics - must set up as early as possible\u001b[39;00m\n\u001b[1;32m   3089\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_memory_tracker\u001b[38;5;241m.\u001b[39mstart()\n\u001b[0;32m-> 3091\u001b[0m eval_dataloader \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_eval_dataloader\u001b[49m\u001b[43m(\u001b[49m\u001b[43meval_dataset\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   3092\u001b[0m start_time \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime()\n\u001b[1;32m   3094\u001b[0m eval_loop \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mprediction_loop \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39muse_legacy_prediction_loop \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mevaluation_loop\n",
      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/trainer.py:846\u001b[0m, in \u001b[0;36mTrainer.get_eval_dataloader\u001b[0;34m(self, eval_dataset)\u001b[0m\n\u001b[1;32m    835\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m    836\u001b[0m \u001b[38;5;124;03mReturns the evaluation [`~torch.utils.data.DataLoader`].\u001b[39;00m\n\u001b[1;32m    837\u001b[0m \n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    843\u001b[0m \u001b[38;5;124;03m        by the `model.forward()` method are automatically removed. It must implement `__len__`.\u001b[39;00m\n\u001b[1;32m    844\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m    845\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m eval_dataset \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39meval_dataset \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 846\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mTrainer: evaluation requires an eval_dataset.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m    847\u001b[0m eval_dataset \u001b[38;5;241m=\u001b[39m eval_dataset \u001b[38;5;28;01mif\u001b[39;00m eval_dataset \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39meval_dataset\n\u001b[1;32m    848\u001b[0m data_collator \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdata_collator\n",
      "\u001b[0;31mValueError\u001b[0m: Trainer: evaluation requires an eval_dataset."
     ]
    }
   ],
   "source": [
    "trainer = Trainer(\n",
    "    model=model,\n",
    "    tokenizer=tokenizer,\n",
    "    data_collator=collate,\n",
    "    train_dataset=dataset_tokenized[\"train\"],\n",
    "    args=args,\n",
    ")\n",
    "\n",
    "trainer.train()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "9dbeb0a6-c04d-477f-8d5e-3a300faca8e3",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "def add_tokens(data):\n",
    "    for conversation_object in data:\n",
    "        # Obtain the name if present, else use blank string\n",
    "        name = conversation_object.get('name', '') \n",
    "\n",
    "        for i, conversation in enumerate(conversation_object['conversations']):\n",
    "            if conversation['from'] == 'human':\n",
    "                if i == 0: # If it's the first message from the human\n",
    "                    conversation['value'] = f\"# Name: {name}<|im_start|>user\\n\" + conversation['value'] + \"<|im_end|>\\n\"\n",
    "                else:\n",
    "                    conversation['value'] = \"<|im_start|>user\\n\" + conversation['value'] + \"<|im_end|>\\n\"\n",
    "            elif conversation['from'] == 'gpt':\n",
    "                conversation['value'] = \"<|im_start|>assistant\\n\" + conversation['value'] + \"<|im_end|>\\n\"\n",
    "    return data\n",
    "\n",
    "def main():\n",
    "    # Opening the input file and loading the json data \n",
    "    with open('orion.json', 'r') as in_file:\n",
    "        data = json.load(in_file)\n",
    "\n",
    "    # Adding the special tokens\n",
    "    data = add_tokens(data)\n",
    "\n",
    "    # Saving the modified data into the output.json file\n",
    "    with open('oorion.json', 'w') as out_file:\n",
    "        json.dump(data, out_file, indent=4)\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    main()"
   ]
  },
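  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a9c1e3f5-7d28-4b60-9f1a-84e6c0d2b357",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Minimal illustration of add_tokens() on a made-up two-turn record (the\n",
    "# shape mirrors orion.json; the name and values here are hypothetical)\n",
    "sample = [{\n",
    "    \"name\": \"Orion\",\n",
    "    \"conversations\": [\n",
    "        {\"from\": \"human\", \"value\": \"Hello\"},\n",
    "        {\"from\": \"gpt\", \"value\": \"Hi there\"},\n",
    "    ],\n",
    "}]\n",
    "print(json.dumps(add_tokens(sample), indent=2))"
   ]
  },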
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "e9ad5d60-125a-4b33-ae1b-a8a9ee6657eb",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "def process_json_object(json_obj):\n",
    "    \"\"\" Process a single JSON object to the required format. \"\"\"\n",
    "    formatted_text = \"\"\n",
    "\n",
    "    # Check if 'conversations' is in json_obj and is a list\n",
    "    if \"conversations\" in json_obj and isinstance(json_obj[\"conversations\"], list):\n",
    "        for conversation in json_obj[\"conversations\"]:\n",
    "            # Use .get() for safer access\n",
    "            speaker = \"\" if conversation.get(\"from\") == \"human\" else \"\"\n",
    "            formatted_value = conversation.get(\"value\", \"\").replace(\"\\n\\n\",\"\\n\\n\").strip()\n",
    "            # Add space after each piece of conversation\n",
    "            formatted_text += speaker + formatted_value + \" \"\n",
    "    else:\n",
    "        return None  # If structure is not as expected, return None\n",
    "\n",
    "    # Remove trailing spaces\n",
    "    formatted_text = formatted_text.rstrip(\" \")\n",
    "    # Wrap the formatted text in a dictionary\n",
    "    return {\"text\": formatted_text}\n",
    "\n",
    "# Paths to your files\n",
    "input_file_path = 'oorion.json'\n",
    "output_file_path = 'train-orion.json'\n",
    "\n",
    "# Process and write output based on data structure\n",
    "with open(input_file_path, 'r', encoding='utf-8') as infile, \\\n",
    "     open(output_file_path, 'w', encoding='utf-8') as outfile:\n",
    "\n",
    "    data = json.load(infile)\n",
    "\n",
    "    # Adjust the loop to correctly handle whether data is a list or a single dict\n",
    "    if isinstance(data, list):\n",
    "        # If data is a list, process each item in the list\n",
    "        for json_obj in data:\n",
    "            formatted_line = process_json_object(json_obj)\n",
    "            if formatted_line is not None:\n",
    "                outfile.write(json.dumps(formatted_line, ensure_ascii=False))  # Changed to False for better encoding\n",
    "                outfile.write('\\n')\n",
    "    elif isinstance(data, dict):\n",
    "        # If data is a single dict, process it directly\n",
    "        formatted_line = process_json_object(data)\n",
    "        if formatted_line is not None:\n",
    "            outfile.write(json.dumps(formatted_line, ensure_ascii=False))  # Changed to False for better encoding\n",
    "            outfile.write('\\n')"
   ]
  },
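  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b0d2f4a6-8e39-4c71-a02b-95f7d1e3c468",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Spot-check the converted file: every line of train-orion.json should now\n",
    "# be a standalone {\"text\": ...} JSON record\n",
    "with open(output_file_path, 'r', encoding='utf-8') as f:\n",
    "    first_record = json.loads(f.readline())\n",
    "print(first_record[\"text\"][:200])"
   ]
  },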
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "b5468230-886f-4079-9f85-048445b86385",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.10/dist-packages/transformers/models/auto/tokenization_auto.py:711: FutureWarning: The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.\n",
      "  warnings.warn(\n",
      "/usr/local/lib/python3.10/dist-packages/transformers/models/auto/auto_factory.py:472: FutureWarning: The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "2da974370cb64f3bbc21999ae8da1d38",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Loading checkpoint shards:   0%|          | 0/29 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import torch\n",
    "from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig\n",
    "\n",
    "# Set model ID and load configuration\n",
    "base_model_id = \"152334H/miqu-1-70b-sf\"\n",
    "bnb_config = BitsAndBytesConfig(\n",
    "    load_in_4bit=True,\n",
    "    bnb_4bit_use_double_quant=True,\n",
    "    bnb_4bit_compute_dtype=torch.bfloat16\n",
    ")\n",
    "\n",
    "# Load tokenizer and add special tokens\n",
    "tokenizer = AutoTokenizer.from_pretrained(\n",
    "    \"meta-llama/Llama-2-70b-hf\", \n",
    "    add_bos_token=True, \n",
    "    trust_remote_code=True, \n",
    "    use_auth_token=True\n",
    ")\n",
    "tokenizer.pad_token = \"</s>\"\n",
    "tokenizer.add_tokens([\"<|im_start|>\"])\n",
    "tokenizer.add_special_tokens(dict(eos_token=\"<|im_end|>\"))\n",
    "\n",
    "# Load base model and adjust token embeddings size\n",
    "base_model = AutoModelForCausalLM.from_pretrained(\n",
    "    base_model_id,  \n",
    "    quantization_config=bnb_config,  \n",
    "    device_map=\"auto\",\n",
    "    trust_remote_code=True,\n",
    "    use_auth_token=True\n",
    ")\n",
    "base_model.resize_token_embeddings(len(tokenizer))\n",
    "\n",
    "# Configure EOS token ID\n",
    "base_model.config.eos_token_id = tokenizer.eos_token_id\n",
    "\n",
    "# Import Peft function and create fine-tuned model\n",
    "from peft import PeftModel\n",
    "ft_model = PeftModel.from_pretrained(base_model, \"out/Orion-3\")"
   ]
  },
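  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c1e3a5b7-9f40-4d82-b13c-a608e2f4d579",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional: merge the LoRA weights into the base model to drop the adapter\n",
    "# indirection at inference time. Left commented out here because merging\n",
    "# into a 4-bit quantized base is lossy or unsupported in some peft versions.\n",
    "# ft_model = ft_model.merge_and_unload()"
   ]
  },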
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "fe76b1eb-4fc4-42a0-b8bf-15039a579a98",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<s> # Name:<|im_start|>user\n",
      "Explain quantum computing in simple terms<|im_end|> <|im_start|>assistant\n",
      "# Quantum Computing Simplified\n",
      "\n",
      "Quantum Computing is a branch of technology that leverages the principles of quantum mechanics to process information. It's an exciting, yet complex, field that has the potential to revolutionize many aspects of digital technology.\n",
      "\n",
      "---\n",
      "\n",
      "## Key Concepts of Quantum Computing\n",
      "\n",
      "### 1. Quantum Bits (Qubits)\n",
      "\n",
      "Just as classical computers use bits (0s and 1s) to process information, quantum computers use a fundamental unit called a quantum bit, or **qubit**. However, a qubit can be in multiple states at once, thanks to a property known as **superposition**.\n",
      "\n",
      "> \"Qubits can be both 0 and 1 at the same time.\"\n",
      "\n",
      "### 2. Quantum Entanglement\n",
      "\n",
      "Another key principle of quantum computing is **entanglement**. When two qubits become entangled, the state of one instantly influences the state of the other, no matter the distance between them.\n",
      "\n",
      "> \"When two qubits become entangled, the state of one instantly influences the state of the other, no matter the distance between them.\"\n",
      "\n",
      "### 3. Quantum Gates\n",
      "\n",
      "Like classical computers, quantum computers also use gates to perform operations. However, quantum gates manipulate superposition and entanglement to perform complex calculations in a single operation.\n",
      "\n",
      "### 4. Quantum Computing Capabilities\n",
      "\n",
      "While classical computers are excellent at sequential computations, quantum computers excel at parallel computations. This means they can process a vast number of possibilities at once, providing solutions to complex calculations that classical computers could never solve.\n",
      "\n",
      "---\n",
      "\n",
      "## Potential Applications\n",
      "\n",
      "Quantum Computing holds immense potential in various fields:\n",
      "\n",
      "1. **Cryptography**: Quantum encryption could make data almost impossible to crack, revolutionizing the security of digital information.\n",
      "\n",
      "2. **Drug Discovery**: Quantum Computers could simulate complex biological systems, accelerating the discovery of new medicines.\n",
      "\n",
      "3. **Climate Modeling**: By processing vast amounts of data at once, quantum computers can help us understand and predict the impacts of climate change.\n",
      "\n",
      "---\n",
      "\n",
      "## The Future of Quantum Computing\n",
      "\n",
      "While quantum computing is still in its early stages, it has the potential to revolutionize the world of technology. It's a field that continues to grow and evolve, and it will certainly be interesting to see where it goes from here. The future really does look pretty quantum! 😊<|im_end|>\n"
     ]
    },
    {
     "ename": "TypeError",
     "evalue": "argument 'ids': 'list' object cannot be interpreted as an integer",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[17], line 23\u001b[0m\n\u001b[1;32m     13\u001b[0m generated_tokens \u001b[38;5;241m=\u001b[39m ft_model\u001b[38;5;241m.\u001b[39mgenerate(\n\u001b[1;32m     14\u001b[0m     \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mmodel_input, \n\u001b[1;32m     15\u001b[0m     max_new_tokens\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m4096\u001b[39m,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m     19\u001b[0m     streamer\u001b[38;5;241m=\u001b[39mstreamer  \u001b[38;5;66;03m# Ensure 'streamer' is defined and appropriate for your setup\u001b[39;00m\n\u001b[1;32m     20\u001b[0m )\n\u001b[1;32m     22\u001b[0m \u001b[38;5;66;03m# Decode and print the generated tokens\u001b[39;00m\n\u001b[0;32m---> 23\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[43mtokenizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdecode\u001b[49m\u001b[43m(\u001b[49m\u001b[43mgenerated_tokens\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mskip_special_tokens\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m)\n",
      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py:3756\u001b[0m, in \u001b[0;36mPreTrainedTokenizerBase.decode\u001b[0;34m(self, token_ids, skip_special_tokens, clean_up_tokenization_spaces, **kwargs)\u001b[0m\n\u001b[1;32m   3753\u001b[0m \u001b[38;5;66;03m# Convert inputs to python lists\u001b[39;00m\n\u001b[1;32m   3754\u001b[0m token_ids \u001b[38;5;241m=\u001b[39m to_py_obj(token_ids)\n\u001b[0;32m-> 3756\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_decode\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   3757\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtoken_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtoken_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   3758\u001b[0m \u001b[43m    \u001b[49m\u001b[43mskip_special_tokens\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mskip_special_tokens\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   3759\u001b[0m \u001b[43m    \u001b[49m\u001b[43mclean_up_tokenization_spaces\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mclean_up_tokenization_spaces\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   3760\u001b[0m \u001b[43m    \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   3761\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_fast.py:625\u001b[0m, in \u001b[0;36mPreTrainedTokenizerFast._decode\u001b[0;34m(self, token_ids, skip_special_tokens, clean_up_tokenization_spaces, **kwargs)\u001b[0m\n\u001b[1;32m    623\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(token_ids, \u001b[38;5;28mint\u001b[39m):\n\u001b[1;32m    624\u001b[0m     token_ids \u001b[38;5;241m=\u001b[39m [token_ids]\n\u001b[0;32m--> 625\u001b[0m text \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_tokenizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdecode\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtoken_ids\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mskip_special_tokens\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mskip_special_tokens\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    627\u001b[0m clean_up_tokenization_spaces \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m    628\u001b[0m     clean_up_tokenization_spaces\n\u001b[1;32m    629\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m clean_up_tokenization_spaces \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m    630\u001b[0m     \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mclean_up_tokenization_spaces\n\u001b[1;32m    631\u001b[0m )\n\u001b[1;32m    632\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m clean_up_tokenization_spaces:\n",
      "\u001b[0;31mTypeError\u001b[0m: argument 'ids': 'list' object cannot be interpreted as an integer"
     ]
    }
   ],
   "source": [
    "import sys\n",
    "import bitsandbytes #necessary to fit in colab\n",
    "import accelerate #necessary to fit in colab\n",
    "import torch\n",
    "from transformers import AutoTokenizer, TextStreamer, GenerationConfig, AutoModelForCausalLM\n",
    "streamer = TextStreamer(tokenizer)\n",
    "eval_prompt = \"\"\"# Name: <|im_start|>user\\nExplain quantum computing in simple terms<|im_end|> <|im_start|>assistant\\n\"\"\"\n",
    "model_input = tokenizer(eval_prompt, return_tensors=\"pt\").to(\"cuda\")\n",
    "\n",
    "ft_model.eval()\n",
    "with torch.no_grad():\n",
    "    # Using Dgenerate for streaming the output\n",
    "    generated_tokens = ft_model.generate(\n",
    "        **model_input, \n",
    "        max_new_tokens=4096,\n",
    "        temperature=0.7,  # Adjusted for balance\n",
    "        do_sample=True,\n",
    "        eos_token_id=tokenizer.convert_tokens_to_ids(\"<|im_end|>\"),\n",
    "        streamer=streamer  # Ensure 'streamer' is defined and appropriate for your setup\n",
    "    )\n",
    "\n",
    "    # Decode and print the generated tokens\n",
    "    print(tokenizer.decode(generated_tokens, skip_special_tokens=True))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a67fb523-9708-4edb-a0ad-8386b6b7c6a6",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}