shanjay committed on
Commit dd0a849 · 1 Parent(s): 4587806

Delete Test-mgc-Copy1.ipynb

Files changed (1)
  1. Test-mgc-Copy1.ipynb +0 -1177
Test-mgc-Copy1.ipynb DELETED
@@ -1,1177 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "code",
5
- "execution_count": 2,
6
- "id": "addd199c-097c-419d-a0f2-c3d73efb8d5d",
7
- "metadata": {},
8
- "outputs": [
9
- {
10
- "name": "stdout",
11
- "output_type": "stream",
12
- "text": [
13
- "\n",
14
- "===================================BUG REPORT===================================\n",
15
- "Welcome to bitsandbytes. For bug reports, please run\n",
16
- "\n",
17
- "python -m bitsandbytes\n",
18
- "\n",
19
- " and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues\n",
20
- "================================================================================\n",
21
- "bin /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda121.so\n",
22
- "CUDA_SETUP: WARNING! libcudart.so not found in any environmental path. Searching in backup paths...\n",
23
- "CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so\n",
24
- "CUDA SETUP: Highest compute capability among GPUs detected: 8.6\n",
25
- "CUDA SETUP: Detected CUDA version 121\n",
26
- "CUDA SETUP: Loading binary /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda121.so...\n"
27
- ]
28
- },
29
- {
30
- "name": "stderr",
31
- "output_type": "stream",
32
- "text": [
33
- "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('/usr/local/nvidia/lib64'), PosixPath('/usr/local/nvidia/lib')}\n",
34
- " warn(msg)\n",
35
- "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: /usr/local/nvidia/lib:/usr/local/nvidia/lib64 did not contain ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] as expected! Searching further paths...\n",
36
- " warn(msg)\n",
37
- "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQCcuY6EsmJRfLsI1l1rpDWVRhwkL7A9nzITTDbCFOX0wzshP65l/Sa54NrS1pX2uM6YiB7OvgGUm7uUKf9OBCcpd2ohFJiOkTznhDHk+D7IkFZf/VTRIHy/JZoAtzN/qBQKMOygFam1XzTMDnkehMkKvR23BgH72hzGUfYPIsq+OlStYVMhE1bncYSnC4SRucbdT5BeIsival514xsbAhCjjwPd8UHfw1cxaDq4edWjbhN8wkDU+V8i/jS/wWTZIt7pIZiAREEl/YC+Sc4FCSnb4c3p+adl5pqXrEsKygi+UmBtC1poLSXTgZOc/0kerx4jv/HB8NiH4kLsg4S2HjdFFQIB0WSV0i4KDVRE9cv18gQ7kbEv0t9Uwg4xdoMntCNS6aFDm51ufhshwQylzfSwX71Ka3mPdftfnVk81wKpIxN784FEcb7IE7HcNyomnP9N382Fg8j6pILwsKK6w4oOg8Cn2C66cySA6CNTFpK1kYBwsqdU3X8WBQUIZZNVCn4x/qRWYxrKHmdlUW8oCf9AT32eydDQWp1y0AlycA4wfbDQ8g4dtu9Rf+tBrYTztdCt5PbGy4SbwfynWysc/PuhcyaLNtuRYt3LeiCKhKJFNFST1BqjACrjkQ9kMrPSB/7j3JX9O2ncDHDQgCQIQon9BETVQZJ49EqMrusQ3/K39w== shanjay@LAPTOP-Q1PG3AE7')}\n",
38
- " warn(msg)\n",
39
- "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('//g.notebooksg.jarvislabs.net'), PosixPath('https')}\n",
40
- " warn(msg)\n",
41
- "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('module'), PosixPath('//matplotlib_inline.backend_inline')}\n",
42
- " warn(msg)\n"
43
- ]
44
- }
45
- ],
46
- "source": [
47
- "import json\n",
48
- "import os\n",
49
- "from pprint import pprint\n",
50
- "\n",
51
- "import bitsandbytes as bnb\n",
52
- "import pandas as pd\n",
53
- "import torch\n",
54
- "import torch.nn as nn\n",
55
- "\n",
56
- "import transformers\n",
57
- "from datasets import load_dataset\n",
58
- "from huggingface_hub import notebook_login\n",
59
- "from peft import (\n",
60
- " LoraConfig,\n",
61
- " PeftConfig,\n",
62
- " PeftModel,\n",
63
- " get_peft_model,\n",
64
- " prepare_model_for_kbit_training,\n",
65
- ")\n",
66
- "from transformers import (\n",
67
- " AutoConfig,\n",
68
- " AutoModelForCausalLM,\n",
69
- " AutoTokenizer,\n",
70
- " BitsAndBytesConfig,\n",
71
- ")\n",
72
- "import warnings\n",
73
- "warnings.filterwarnings(\"ignore\")\n",
74
- "\n",
75
- "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\""
76
- ]
77
- },
78
- {
79
- "cell_type": "code",
80
- "execution_count": 3,
81
- "id": "acfb1578-a66f-44f0-8df9-1c6bcf7530ea",
82
- "metadata": {},
83
- "outputs": [
84
- {
85
- "data": {
86
- "application/vnd.jupyter.widget-view+json": {
87
- "model_id": "3edf6ee054e9464eb510d3aff9d1dc5f",
88
- "version_major": 2,
89
- "version_minor": 0
90
- },
91
- "text/plain": [
92
- "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
93
- ]
94
- },
95
- "metadata": {},
96
- "output_type": "display_data"
97
- }
98
- ],
99
- "source": [
100
- "notebook_login()"
101
- ]
102
- },
103
- {
104
- "cell_type": "code",
105
- "execution_count": 4,
106
- "id": "d2f13cac-1536-4da0-8ff7-0a0454fd0b4a",
107
- "metadata": {},
108
- "outputs": [],
109
- "source": [
110
- "with open(\"ds1000-test-cleaned.json\") as json_file:\n",
111
- " data = json.load(json_file)"
112
- ]
113
- },
114
- {
115
- "cell_type": "code",
116
- "execution_count": 5,
117
- "id": "6706e68b-d525-4392-ab2c-1dff356da52d",
118
- "metadata": {},
119
- "outputs": [
120
- {
121
- "name": "stdout",
122
- "output_type": "stream",
123
- "text": [
124
- "{'answer': 'import pandas as pd\\n'\n",
125
- " '\\n'\n",
126
- " '\\n'\n",
127
- " 'index = range(14)\\n'\n",
128
- " 'data = [1, 0, 0, 2, 0, 4, 6, 8, 0, 0, 0, 0, 2, 1]\\n'\n",
129
- " \"df = pd.DataFrame(data=data, index=index, columns = ['A'])\\n\"\n",
130
- " 'def g(df):\\n'\n",
131
- " \" l = df['A'].replace(to_replace=0, method='ffill')\\n\"\n",
132
- " \" r = df['A'].replace(to_replace=0, method='bfill')\\n\"\n",
133
- " ' for i in range(len(df)):\\n'\n",
134
- " \" df['A'].iloc[i] = max(l[i], r[i])\\n\"\n",
135
- " ' return df\\n'\n",
136
- " '\\n'\n",
137
- " 'df = g(df.copy())\\n'\n",
138
- " 'result = df\\n'\n",
139
- " 'print(result)',\n",
140
- " 'question': 'Problem:\\n'\n",
141
- " 'I have the following dataframe:\\n'\n",
142
- " 'index = range(14)\\n'\n",
143
- " 'data = [1, 0, 0, 2, 0, 4, 6, 8, 0, 0, 0, 0, 2, 1]\\n'\n",
144
- " \"df = pd.DataFrame(data=data, index=index, columns = ['A'])\\n\"\n",
145
- " '\\n'\n",
146
- " '\\n'\n",
147
- " 'How can I fill the zeros with the maximun between previous and '\n",
148
- " 'posterior non-zero value using pandas? Is there a fillna that is '\n",
149
- " 'not just for \"NaN\"?. \\n'\n",
150
- " 'The output should look like:\\n'\n",
151
- " ' A\\n'\n",
152
- " '0 1\\n'\n",
153
- " '1 2\\n'\n",
154
- " '2 2\\n'\n",
155
- " '3 2\\n'\n",
156
- " '4 4\\n'\n",
157
- " '5 4\\n'\n",
158
- " '6 6\\n'\n",
159
- " '7 8\\n'\n",
160
- " '8 8\\n'\n",
161
- " '9 8\\n'\n",
162
- " '10 8\\n'\n",
163
- " '11 8\\n'\n",
164
- " '12 2\\n'\n",
165
- " '13 1'}\n"
166
- ]
167
- }
168
- ],
169
- "source": [
170
- "pprint(data[0])"
171
- ]
172
- },
173
- {
174
- "cell_type": "code",
175
- "execution_count": 6,
176
- "id": "9cc4983a-9a3f-485f-983f-efe2f10ce516",
177
- "metadata": {},
178
- "outputs": [],
179
- "source": [
180
- "with open(\"ds1000-test-cleaned.json\", \"w\") as f:\n",
181
- " json.dump(data, f)"
182
- ]
183
- },
184
- {
185
- "cell_type": "code",
186
- "execution_count": 7,
187
- "id": "f45c3674-4eed-4ca5-8343-2184ff1e4da1",
188
- "metadata": {},
189
- "outputs": [
190
- {
191
- "data": {
192
- "text/html": [
193
- "<div>\n",
194
- "<style scoped>\n",
195
- " .dataframe tbody tr th:only-of-type {\n",
196
- " vertical-align: middle;\n",
197
- " }\n",
198
- "\n",
199
- " .dataframe tbody tr th {\n",
200
- " vertical-align: top;\n",
201
- " }\n",
202
- "\n",
203
- " .dataframe thead th {\n",
204
- " text-align: right;\n",
205
- " }\n",
206
- "</style>\n",
207
- "<table border=\"1\" class=\"dataframe\">\n",
208
- " <thead>\n",
209
- " <tr style=\"text-align: right;\">\n",
210
- " <th></th>\n",
211
- " <th>question</th>\n",
212
- " <th>answer</th>\n",
213
- " </tr>\n",
214
- " </thead>\n",
215
- " <tbody>\n",
216
- " <tr>\n",
217
- " <th>0</th>\n",
218
- " <td>Problem:\\nI have the following dataframe:\\nind...</td>\n",
219
- " <td>import pandas as pd\\n\\n\\nindex = range(14)\\nda...</td>\n",
220
- " </tr>\n",
221
- " <tr>\n",
222
- " <th>1</th>\n",
223
- " <td>Problem:\\ni got an issue over ranking of date ...</td>\n",
224
- " <td>import pandas as pd\\n\\n\\ndf = pd.DataFrame({'I...</td>\n",
225
- " </tr>\n",
226
- " <tr>\n",
227
- " <th>2</th>\n",
228
- " <td>Problem:\\nI have a DataFrame like :\\n 0 ...</td>\n",
229
- " <td>import pandas as pd\\nimport numpy as np\\n\\ndf ...</td>\n",
230
- " </tr>\n",
231
- " <tr>\n",
232
- " <th>3</th>\n",
233
- " <td>Problem:\\nI have this Pandas dataframe (df):\\n...</td>\n",
234
- " <td>import pandas as pd\\n\\n\\ndf = pd.DataFrame({'A...</td>\n",
235
- " </tr>\n",
236
- " <tr>\n",
237
- " <th>4</th>\n",
238
- " <td>Problem:\\nI have\\n\\ndf = pd.DataFrame.from_dic...</td>\n",
239
- " <td>import pandas as pd\\n\\ndf = pd.DataFrame.from_...</td>\n",
240
- " </tr>\n",
241
- " </tbody>\n",
242
- "</table>\n",
243
- "</div>"
244
- ],
245
- "text/plain": [
246
- " question \\\n",
247
- "0 Problem:\\nI have the following dataframe:\\nind... \n",
248
- "1 Problem:\\ni got an issue over ranking of date ... \n",
249
- "2 Problem:\\nI have a DataFrame like :\\n 0 ... \n",
250
- "3 Problem:\\nI have this Pandas dataframe (df):\\n... \n",
251
- "4 Problem:\\nI have\\n\\ndf = pd.DataFrame.from_dic... \n",
252
- "\n",
253
- " answer \n",
254
- "0 import pandas as pd\\n\\n\\nindex = range(14)\\nda... \n",
255
- "1 import pandas as pd\\n\\n\\ndf = pd.DataFrame({'I... \n",
256
- "2 import pandas as pd\\nimport numpy as np\\n\\ndf ... \n",
257
- "3 import pandas as pd\\n\\n\\ndf = pd.DataFrame({'A... \n",
258
- "4 import pandas as pd\\n\\ndf = pd.DataFrame.from_... "
259
- ]
260
- },
261
- "execution_count": 7,
262
- "metadata": {},
263
- "output_type": "execute_result"
264
- }
265
- ],
266
- "source": [
267
- "pd.DataFrame(data).head()"
268
- ]
269
- },
270
- {
271
- "cell_type": "code",
272
- "execution_count": 8,
273
- "id": "6fbdd3ad-062f-4744-bb8e-1c19950adfd5",
274
- "metadata": {},
275
- "outputs": [],
276
- "source": [
277
- "bnb_config = BitsAndBytesConfig(\n",
278
- " load_in_4bit=True,\n",
279
- " bnb_4bit_use_double_quant=True,\n",
280
- " bnb_4bit_quant_type=\"nf4\",\n",
281
- " bnb_4bit_compute_dtype=torch.bfloat16,\n",
282
- ")"
283
- ]
284
- },
285
- {
286
- "cell_type": "code",
287
- "execution_count": 9,
288
- "id": "2b5ae38c-b0d2-4b9a-acde-3370130ca6e7",
289
- "metadata": {},
290
- "outputs": [
291
- {
292
- "data": {
293
- "application/vnd.jupyter.widget-view+json": {
294
- "model_id": "2be27a54d3e14399a41c46cd9c423399",
295
- "version_major": 2,
296
- "version_minor": 0
297
- },
298
- "text/plain": [
299
- "Loading checkpoint shards: 0%| | 0/6 [00:00<?, ?it/s]"
300
- ]
301
- },
302
- "metadata": {},
303
- "output_type": "display_data"
304
- },
305
- {
306
- "name": "stderr",
307
- "output_type": "stream",
308
- "text": [
309
- "Some weights of LlamaForCausalLM were not initialized from the model checkpoint at ise-uiuc/Magicoder-S-DS-6.7B and are newly initialized: ['model.layers.2.self_attn.rotary_emb.inv_freq', 'model.layers.6.self_attn.rotary_emb.inv_freq', 'model.layers.25.self_attn.rotary_emb.inv_freq', 'model.layers.15.self_attn.rotary_emb.inv_freq', 'model.layers.1.self_attn.rotary_emb.inv_freq', 'model.layers.7.self_attn.rotary_emb.inv_freq', 'model.layers.18.self_attn.rotary_emb.inv_freq', 'model.layers.17.self_attn.rotary_emb.inv_freq', 'model.layers.4.self_attn.rotary_emb.inv_freq', 'model.layers.30.self_attn.rotary_emb.inv_freq', 'model.layers.12.self_attn.rotary_emb.inv_freq', 'model.layers.10.self_attn.rotary_emb.inv_freq', 'model.layers.24.self_attn.rotary_emb.inv_freq', 'model.layers.23.self_attn.rotary_emb.inv_freq', 'model.layers.14.self_attn.rotary_emb.inv_freq', 'model.layers.21.self_attn.rotary_emb.inv_freq', 'model.layers.27.self_attn.rotary_emb.inv_freq', 'model.layers.8.self_attn.rotary_emb.inv_freq', 'model.layers.11.self_attn.rotary_emb.inv_freq', 'model.layers.29.self_attn.rotary_emb.inv_freq', 'model.layers.28.self_attn.rotary_emb.inv_freq', 'model.layers.20.self_attn.rotary_emb.inv_freq', 'model.layers.31.self_attn.rotary_emb.inv_freq', 'model.layers.26.self_attn.rotary_emb.inv_freq', 'model.layers.13.self_attn.rotary_emb.inv_freq', 'model.layers.3.self_attn.rotary_emb.inv_freq', 'model.layers.22.self_attn.rotary_emb.inv_freq', 'model.layers.9.self_attn.rotary_emb.inv_freq', 'model.layers.5.self_attn.rotary_emb.inv_freq', 'model.layers.19.self_attn.rotary_emb.inv_freq', 'model.layers.16.self_attn.rotary_emb.inv_freq', 'model.layers.0.self_attn.rotary_emb.inv_freq']\n",
310
- "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
311
- ]
312
- }
313
- ],
314
- "source": [
315
- "PEFT_MODEL = \"shanjay/mgc-ds\"\n",
316
- "\n",
317
- "config = PeftConfig.from_pretrained(PEFT_MODEL)\n",
318
- "model = AutoModelForCausalLM.from_pretrained(\n",
319
- " config.base_model_name_or_path,\n",
320
- " return_dict=True,\n",
321
- " quantization_config=bnb_config,\n",
322
- " device_map=\"auto\",\n",
323
- " trust_remote_code=True,\n",
324
- ")\n",
325
- "tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)\n",
326
- "tokenizer.pad_token = tokenizer.eos_token\n",
327
- "\n",
328
- "model = PeftModel.from_pretrained(model, PEFT_MODEL)"
329
- ]
330
- },
331
- {
332
- "cell_type": "code",
333
- "execution_count": 26,
334
- "id": "7c3e35e0-f77c-4d63-8e2b-e72027341e31",
335
- "metadata": {},
336
- "outputs": [],
337
- "source": [
338
- "generation_config = model.generation_config\n",
339
- "generation_config.max_new_tokens = 400\n",
340
- "generation_config.temperature = 0.7\n",
341
- "generation_config.top_p = 0.7\n",
342
- "generation_config.num_return_sequences = 1\n",
343
- "generation_config.pad_token_id = tokenizer.eos_token_id\n",
344
- "generation_config.eos_token_id = tokenizer.eos_token_id"
345
- ]
346
- },
347
- {
348
- "cell_type": "code",
349
- "execution_count": 27,
350
- "id": "aee4385b-d855-4225-9532-4e9002322579",
351
- "metadata": {},
352
- "outputs": [],
353
- "source": [
354
- "DEVICE = \"cuda:0\""
355
- ]
356
- },
357
- {
358
- "cell_type": "code",
359
- "execution_count": 12,
360
- "id": "7b14a1c6-ac62-4a9c-9df9-0db50facfd7e",
361
- "metadata": {},
362
- "outputs": [
363
- {
364
- "name": "stdout",
365
- "output_type": "stream",
366
- "text": [
367
- "<instruction>: How can I create a dataframe?\n",
368
- "<output>: import pandas as pd\n",
369
- "\n",
370
- "\n",
371
- "\n",
372
- "\n",
373
- "\n",
374
- "\n",
375
- "\n",
376
- "\n",
377
- "\n",
378
- "\n",
379
- "\n",
380
- "\n",
381
- "\n",
382
- "\n",
383
- "\n",
384
- "\n",
385
- "\n",
386
- "\n",
387
- "\n",
388
- "\n",
389
- "\n",
390
- "\n",
391
- "\n",
392
- "\n",
393
- "\n",
394
- "\n",
395
- "\n",
396
- "\n",
397
- "\n",
398
- "\n",
399
- "\n",
400
- "\n",
401
- "\n",
402
- "\n",
403
- "\n",
404
- "\n",
405
- "\n",
406
- "\n",
407
- "\n",
408
- "\n",
409
- "\n",
410
- "\n",
411
- "\n",
412
- "\n",
413
- "\n",
414
- "\n",
415
- "\n",
416
- "\n",
417
- "\n",
418
- "\n",
419
- "\n",
420
- "\n",
421
- "\n",
422
- "\n",
423
- "\n",
424
- "\n",
425
- "\n",
426
- "\n",
427
- "\n",
428
- "\n",
429
- "\n",
430
- "\n",
431
- "\n",
432
- "\n",
433
- "\n",
434
- "\n",
435
- "\n",
436
- "\n",
437
- "\n",
438
- "\n",
439
- "\n",
440
- "\n",
441
- "\n",
442
- "\n",
443
- "\n",
444
- "\n",
445
- "\n",
446
- "\n",
447
- "\n",
448
- "\n",
449
- "\n",
450
- "\n",
451
- "\n",
452
- "\n",
453
- "\n",
454
- "\n",
455
- "\n",
456
- "\n",
457
- "\n",
458
- "\n",
459
- "\n",
460
- "\n",
461
- "\n",
462
- "\n",
463
- "\n",
464
- "\n",
465
- "\n",
466
- "\n",
467
- "\n",
468
- "\n",
469
- "\n",
470
- "\n",
471
- "\n",
472
- "\n",
473
- "\n",
474
- "\n",
475
- "\n",
476
- "\n",
477
- "\n",
478
- "\n",
479
- "\n",
480
- "\n",
481
- "\n",
482
- "\n",
483
- "\n",
484
- "\n",
485
- "\n",
486
- "\n",
487
- "\n",
488
- "\n",
489
- "\n",
490
- "\n",
491
- "\n",
492
- "\n",
493
- "\n",
494
- "\n",
495
- "\n",
496
- "\n",
497
- "\n",
498
- "\n",
499
- "\n",
500
- "\n",
501
- "\n",
502
- "\n",
503
- "\n",
504
- "\n",
505
- "\n",
506
- "\n",
507
- "\n",
508
- "\n",
509
- "\n",
510
- "\n",
511
- "\n",
512
- "\n",
513
- "\n",
514
- "\n",
515
- "\n",
516
- "\n",
517
- "\n",
518
- "\n",
519
- "\n",
520
- "\n",
521
- "\n",
522
- "\n",
523
- "\n",
524
- "\n",
525
- "\n",
526
- "\n",
527
- "\n",
528
- "\n",
529
- "\n",
530
- "\n",
531
- "\n",
532
- "\n",
533
- "\n",
534
- "\n",
535
- "\n",
536
- "\n",
537
- "\n",
538
- "\n",
539
- "\n",
540
- "\n",
541
- "\n",
542
- "\n",
543
- "\n",
544
- "\n",
545
- "\n",
546
- "\n",
547
- "\n",
548
- "\n",
549
- "\n",
550
- "\n",
551
- "\n",
552
- "\n",
553
- "\n",
554
- "\n",
555
- "\n",
556
- "\n",
557
- "\n",
558
- "\n",
559
- "\n",
560
- "\n",
561
- "\n",
562
- "\n",
563
- "CPU times: user 26.5 s, sys: 177 ms, total: 26.7 s\n",
564
- "Wall time: 26.7 s\n"
565
- ]
566
- }
567
- ],
568
- "source": [
569
- "%%time\n",
570
- "prompt = f\"\"\"\n",
571
- "<instruction>: How can I create a dataframe?\n",
572
- "<output>:\n",
573
- "\"\"\".strip()\n",
574
- "\n",
575
- "encoding = tokenizer(prompt, return_tensors=\"pt\").to(DEVICE)\n",
576
- "with torch.inference_mode():\n",
577
- " outputs = model.generate(\n",
578
- " input_ids=encoding.input_ids,\n",
579
- " attention_mask=encoding.attention_mask,\n",
580
- " generation_config=generation_config,\n",
581
- " )\n",
582
- "print(tokenizer.decode(outputs[0], skip_special_tokens=True))"
583
- ]
584
- },
585
- {
586
- "cell_type": "code",
587
- "execution_count": 28,
588
- "id": "93c95988-c563-4871-974d-004bf73fbce8",
589
- "metadata": {},
590
- "outputs": [],
591
- "source": [
592
- "def generate_response(question: str) -> str:\n",
593
- " prompt = f\"\"\"\n",
594
- "<instruction>: {question}\n",
595
- "<output>:\n",
596
- "\"\"\".strip()\n",
597
- " encoding = tokenizer(prompt, return_tensors=\"pt\").to(DEVICE)\n",
598
- " with torch.inference_mode():\n",
599
- " outputs = model.generate(\n",
600
- " input_ids=encoding.input_ids,\n",
601
- " attention_mask=encoding.attention_mask,\n",
602
- " generation_config=generation_config,\n",
603
- " )\n",
604
- " response = tokenizer.decode(outputs[0], skip_special_tokens=True)\n",
605
- "\n",
606
- " assistant_start = \"<output>:\"\n",
607
- " response_start = response.find(assistant_start)\n",
608
- " return response[response_start + len(assistant_start) :].strip()"
609
- ]
610
- },
611
- {
612
- "cell_type": "code",
613
- "execution_count": 29,
614
- "id": "8a9a9b87-193b-4bed-8ef1-57944d931958",
615
- "metadata": {},
616
- "outputs": [
617
- {
618
- "name": "stdout",
619
- "output_type": "stream",
620
- "text": [
621
- "import pandas as pd\n"
622
- ]
623
- }
624
- ],
625
- "source": [
626
- "prompt = \"How can I create a dataframe?\"\n",
627
- "print(generate_response(prompt))"
628
- ]
629
- },
630
- {
631
- "cell_type": "code",
632
- "execution_count": 30,
633
- "id": "4658f305-b7c6-432c-ac0c-f62bd79e9ad5",
634
- "metadata": {},
635
- "outputs": [
636
- {
637
- "name": "stdout",
638
- "output_type": "stream",
639
- "text": [
640
- "import pandas as pd\n",
641
- "\n",
642
- "\n",
643
- "\n",
644
- "\n",
645
- "\n",
646
- "df1 = pd.DataFrame({'A': ['A', 'B', 'C', 'D'],\n",
647
- " 'B': [1, 2, 3, 4]})\n",
648
- "df2 = pd.DataFrame({'A': ['A', 'B', 'C', 'E'],\n",
649
- " 'B': [1, 2, 3, 5]})\n",
650
- "# merge df1 and df2 on column 'A'\n",
651
- "# SOLUTION START\n",
652
- "\n",
653
- "<output>: import pandas as pd\n",
654
- "\n",
655
- "\n",
656
- "\n",
657
- "\n",
658
- "\n",
659
- "df1 = pd.DataFrame({'A': ['A', 'B', 'C', 'D'],\n",
660
- " 'B': [1, 2, 3, 4]})\n",
661
- "df2 = pd.DataFrame({'A': ['A', 'B', 'C', 'E'],\n",
662
- " 'B': [1, 2, 3, 5]})\n",
663
- "# merge df1 and df2 on column 'A'\n",
664
- "result = pd.merge(df1, df2, on='A')\n",
665
- "print(result)\n"
666
- ]
667
- }
668
- ],
669
- "source": [
670
- "prompt = \"How to merge two dataframes?\"\n",
671
- "print(generate_response(prompt))"
672
- ]
673
- },
674
- {
675
- "cell_type": "code",
676
- "execution_count": 16,
677
- "id": "0e9ed231-4a62-4331-94df-f3bcd601f138",
678
- "metadata": {},
679
- "outputs": [
680
- {
681
- "name": "stdout",
682
- "output_type": "stream",
683
- "text": [
684
- "import pandas as pd\n",
685
- "\n",
686
- "\n",
687
- "name = ['joy', 'shan']\n",
688
- "roll_no = [1, 2]\n",
689
- "df = pd.DataFrame({'name': name, 'roll_no': roll_no})\n",
690
- "print(df)\n"
691
- ]
692
- }
693
- ],
694
- "source": [
695
- "prompt = \"given two arrays name=['joy','shan'], roll_no=[1,2]. put these array in a dataframe ?\"\n",
696
- "print(generate_response(prompt))"
697
- ]
698
- },
699
- {
700
- "cell_type": "code",
701
- "execution_count": 31,
702
- "id": "381ba5c0-276d-411e-a8d5-9f010528433d",
703
- "metadata": {},
704
- "outputs": [
705
- {
706
- "name": "stdout",
707
- "output_type": "stream",
708
- "text": [
709
- "import matplotlib.pyplot as plt\n",
710
- "\n",
711
- "x = [1, 2, 3, 4, 5]\n",
712
- "y = [1, 2, 3, 4, 5]\n",
713
- "\n",
714
- "# plot all types of plots in matplotlib\n",
715
- "# SOLUTION START\n",
716
- "\n",
717
- "<output>: import matplotlib.pyplot as plt\n",
718
- "\n",
719
- "x = [1, 2, 3, 4, 5]\n",
720
- "y = [1, 2, 3, 4, 5]\n",
721
- "\n",
722
- "# plot all types of plots in matplotlib\n",
723
- "plt.plot(x, y, label=\"plot\")\n",
724
- "plt.scatter(x, y, label=\"scatter\")\n",
725
- "plt.bar(x, y, label=\"bar\")\n",
726
- "plt.hist(x, y, label=\"hist\")\n",
727
- "plt.boxplot(x, y, label=\"boxplot\")\n",
728
- "plt.show()\n",
729
- "<output>: import matplotlib.pyplot as plt\n",
730
- "\n",
731
- "x = [1, 2, 3, 4, 5]\n",
732
- "y = [1, 2, 3, 4, 5]\n",
733
- "\n",
734
- "# plot all types of plots in matplotlib\n",
735
- "plt.plot(x, y, label=\"plot\")\n",
736
- "plt.scatter(x, y, label=\"scatter\")\n",
737
- "plt.bar(x, y, label=\"bar\")\n",
738
- "plt.hist(x, y, label=\"hist\")\n",
739
- "plt.boxplot(x, y, label=\"boxplot\")\n",
740
- "plt.show()\n",
741
- "<output>: import matplotlib.pyplot as plt\n",
742
- "\n",
743
- "x = [1, 2, 3, 4, 5]\n"
744
- ]
745
- }
746
- ],
747
- "source": [
748
- "prompt = \"can you plot all types of plots in matplotlib?\"\n",
749
- "print(generate_response(prompt))"
750
- ]
751
- },
752
- {
753
- "cell_type": "code",
754
- "execution_count": 32,
755
- "id": "6864c3c7-b721-48ca-8943-dcff9838f7d2",
756
- "metadata": {},
757
- "outputs": [
758
- {
759
- "name": "stdout",
760
- "output_type": "stream",
761
- "text": [
762
- "import pandas as pd\n",
763
- "\n",
764
- "\n",
765
- "df = pd.DataFrame({'ID': ['01', '01', '01', '02', '02'],\n",
766
- " 'TIME': ['2018-07-11 11:12:20', '2018-07-12 12:00:23', '2018-07-13 12:00:00', '2019-09-11 11:00:00', '2019-09-12 12:00:00']})\n",
767
- "def g(df):\n",
768
- " df['TIME'] = pd.to_datetime(df['TIME'])\n",
769
- " df['RANK'] = df.groupby('ID')['TIME'].rank(ascending=True)\n",
770
- " return df\n",
771
- "\n",
772
- "df = g(df.copy())\n",
773
- "print(df)\n",
774
- "<output>: import pandas as pd\n",
775
- "\n",
776
- "\n",
777
- "df = pd.DataFrame({'ID': ['01', '01', '01', '02', '02'],\n",
778
- " 'TIME': ['2018-07-11 11:12:20', '2018-07-12 12:00:23', '2018-07-13 12:00:00', '2019-09-11 11:00:00', '2019-09-12 12:00:00']})\n",
779
- "def g(df):\n",
780
- " df['TIME'] = pd.to_datetime(df['TIME'])\n"
781
- ]
782
- }
783
- ],
784
- "source": [
785
- "prompt = \"\"\"Problem:\n",
786
- "i got an issue over ranking of date times. Lets say i have following table.\n",
787
- "ID TIME\n",
788
- "01 2018-07-11 11:12:20\n",
789
- "01 2018-07-12 12:00:23\n",
790
- "01 2018-07-13 12:00:00\n",
791
- "02 2019-09-11 11:00:00\n",
792
- "02 2019-09-12 12:00:00\n",
793
- "\n",
794
- "\n",
795
- "and i want to add another column to rank the table by time for each id and group. I used \n",
796
- "df['RANK'] = data.groupby('ID')['TIME'].rank(ascending=True)\n",
797
- "\n",
798
- "\n",
799
- "but get an error:\n",
800
- "'NoneType' object is not callable\n",
801
- "\n",
802
- "\n",
803
- "If i replace datetime to numbers, it works.... any solutions?\n",
804
- "\"\"\"\n",
805
- "print(generate_response(prompt))"
806
- ]
807
- },
808
- {
809
- "cell_type": "code",
810
- "execution_count": 33,
811
- "id": "7fa02929-5c65-4aa6-81ce-9c51879e7535",
812
- "metadata": {},
813
- "outputs": [
814
- {
815
- "name": "stdout",
816
- "output_type": "stream",
817
- "text": [
818
- "import pandas as pd\n",
819
- "\n",
820
- "\n",
821
- "index = range(14)\n",
822
- "data = [1, 0, 0, 2, 0, 4, 6, 8, 0, 0, 0, 0, 2, 1]\n",
823
- "df = pd.DataFrame(data=data, index=index, columns = ['A'])\n",
824
- "def g(df):\n",
825
- " df['A'] = df['A'].replace(0, np.nan)\n",
826
- " df['A'] = df['A'].fillna(method='ffill')\n",
827
- " df['A'] = df['A'].fillna(method='bfill')\n",
828
- " return df\n",
829
- "\n",
830
- "df = g(df.copy())\n",
831
- "result = df\n",
832
- "print(result)\n",
833
- "<output>: import pandas as pd\n",
834
- "import numpy as np\n",
835
- "\n",
836
- "\n",
837
- "index = range(14)\n",
838
- "data = [1, 0, 0, 2, 0, 4, 6, 8, 0, 0, 0, 0, 2, 1]\n",
839
- "df = pd.DataFrame(data=data, index=index, columns = ['A'])\n",
840
- "def g(df):\n",
841
- " df['A'] = df['A'].replace(0, np.nan)\n",
842
- " df['A'] = df['A'].fillna(method='ffill')\n",
843
- " df['A'] = df['A'].fillna(method='bfill')\n",
844
- " return df\n",
845
- "\n",
846
- "df = g(df.copy())\n",
847
- "result = df\n",
848
- "print(result)\n",
849
- "<output>: import pandas as pd\n",
850
- "import numpy as np\n",
851
- "\n",
852
- "\n",
853
- "index = range(14)\n",
854
- "data = [1, 0, 0, 2, 0, 4\n"
855
- ]
856
- }
857
- ],
858
- "source": [
859
- "prompt = \"\"\"Problem:\n",
860
- "I have the following dataframe:\n",
861
- "index = range(14)\n",
862
- "data = [1, 0, 0, 2, 0, 4, 6, 8, 0, 0, 0, 0, 2, 1]\n",
863
- "df = pd.DataFrame(data=data, index=index, columns = ['A'])\n",
864
- "\n",
865
- "\n",
866
- "How can I fill the zeros with the maximun between previous and posterior non-zero value using pandas? Is there a fillna that is not just for \"NaN\"?. \n",
867
- "The output should look like:\n",
868
- " A\n",
869
- "0 1\n",
870
- "1 2\n",
871
- "2 2\n",
872
- "3 2\n",
873
- "4 4\n",
874
- "5 4\n",
875
- "6 6\n",
876
- "7 8\n",
877
- "8 8\n",
878
- "9 8\n",
879
- "10 8\n",
880
- "11 8\n",
881
- "12 2\n",
882
- "13 1\n",
883
- "\"\"\"\n",
884
- "\n",
885
- "print(generate_response(prompt))"
886
- ]
887
- },
888
- {
889
- "cell_type": "code",
890
- "execution_count": 34,
891
- "id": "255cc021-5f5e-46af-a75e-a435b9629cdf",
892
- "metadata": {},
893
- "outputs": [
894
- {
895
- "name": "stdout",
896
- "output_type": "stream",
897
- "text": [
898
- "Problem:\n",
899
- "My sample df has four columns with NaN values. The goal is to concatenate all the keywords rows while excluding the NaN values.\n",
900
- "import pandas as pd\n",
901
- "import numpy as np\n",
902
- "df = pd.DataFrame({'users': ['Hu Tao', 'Zhongli', 'Xingqiu'],\n",
903
- " 'keywords_0': [\"a\", np.nan, \"c\"],\n",
904
- " 'keywords_1': [\"d\", \"e\", np.nan],\n",
905
- " 'keywords_2': [np.nan, np.nan, \"b\"],\n",
906
- " 'keywords_3': [\"f\", np.nan, \"g\"]})\n",
907
- "\n",
908
- "\n",
909
- " users keywords_0 keywords_1 keywords_2 keywords_3\n",
910
- "0 Hu Tao a d NaN f\n",
911
- "1 Zhongli NaN e NaN NaN\n",
912
- "2 Xingqiu c NaN b g\n",
913
- "\n",
914
- "\n",
915
- "Want to accomplish the following:\n",
916
- " users keywords_0 keywords_1 keywords_2 keywords_3 keywords_all\n",
917
- "0 Hu Tao a d NaN f a-d-f\n",
918
- "1 Zhongli NaN e NaN NaN e\n",
919
- "2 Xingqiu c NaN b g c-b-g\n",
920
- "\n",
921
- "\n",
922
- "Pseudo code:\n",
923
- "cols = [df.keywords_0, df.keywords_1, df.keywords_2, df.keywords_3]\n",
924
- "df[\"keywords_all\"] = df[\"keywords_all\"].apply(lambda cols: \"-\".join(cols), axis=1)\n",
925
- "\n",
926
- "\n",
927
- "I know I can use \"-\".join() to get the exact result, but I am unsure how to pass the column names into the function.\n"
928
- ]
929
- }
930
- ],
931
- "source": [
932
- "print(data[5]['question'])"
933
- ]
934
- },
935
- {
936
- "cell_type": "code",
937
- "execution_count": 35,
938
- "id": "1c5841e9-4331-4185-a7ad-7dd00d4e13b1",
939
- "metadata": {},
940
- "outputs": [
941
- {
942
- "name": "stdout",
943
- "output_type": "stream",
944
- "text": [
945
- "import pandas as pd\n",
946
- "import numpy as np\n",
947
- "\n",
948
- "\n",
949
- "df = pd.DataFrame({'users': ['Hu Tao', 'Zhongli', 'Xingqiu'],\n",
950
- " 'keywords_0': [\"a\", np.nan, \"c\"],\n",
951
- " 'keywords_1': [\"d\", \"e\", np.nan],\n",
952
- " 'keywords_2': [np.nan, np.nan, \"b\"],\n",
953
- " 'keywords_3': [\"f\", np.nan, \"g\"]})\n",
954
- "import numpy as np\n",
955
- "def g(df):\n",
956
- " df[\"keywords_all\"] = df.filter(like='keyword').apply(lambda x: '-'.join(x.dropna()), axis=1)\n",
957
- " return df\n",
958
- "\n",
959
- "df = g(df.copy())\n",
960
- "result = df\n",
961
- "print(result)\n"
962
- ]
963
- }
964
- ],
965
- "source": [
966
- "print(data[5]['answer'])"
967
- ]
968
- },
969
- {
970
- "cell_type": "code",
971
- "execution_count": 36,
972
- "id": "090e98c3-78db-4e33-af4b-01c6e1fc23d0",
973
- "metadata": {},
974
- "outputs": [
975
- {
976
- "name": "stdout",
977
- "output_type": "stream",
978
- "text": [
979
- "import pandas as pd\n",
980
- "import numpy as np\n",
981
- "\n",
982
- "\n",
983
- "df = pd.DataFrame({'users': ['Hu Tao', 'Zhongli', 'Xingqiu'],\n",
984
- " 'keywords_0': [\"a\", np.nan, \"c\"],\n",
985
- " 'keywords_1': [\"d\", \"e\", np.nan],\n",
986
- " 'keywords_2': [np.nan, np.nan, \"b\"],\n",
987
- " 'keywords_3': [\"f\", np.nan, \"g\"]})\n",
988
- "\n",
989
- "\n",
990
- "cols = [df.keywords_0, df.keywords_1, df.keywords_2, df.keywords_3]\n",
991
- "def f(cols):\n",
992
- " return \"-\".join(cols)\n",
993
- "\n",
994
- "\n",
995
- "df[\"keywords_all\"] = df.apply(lambda row: f(row[cols]), axis=1)\n",
996
- "\n",
997
- "\n",
998
- "print(df)\n",
999
- "<output>: import pandas as pd\n",
1000
- "import numpy as np\n",
1001
- "\n",
1002
- "\n",
1003
- "df = pd.DataFrame({'users': ['Hu Tao', 'Zhongli', 'Xingqiu'],\n",
1004
- " 'keywords_0': [\"a\", np.nan, \"c\"],\n",
1005
- " 'keywords_1': [\"d\", \"e\", np.nan],\n",
1006
- " 'keywords_2': [np.nan, np.nan, \"b\"],\n",
1007
- " 'keywords_3': [\"f\", np.nan, \"g\"]})\n",
1008
- "\n",
1009
- "\n",
1010
- "cols = [df.keywords_0, df.keywords_1, df.keywords_2, df.keywords_3]\n",
1011
- "def f(cols):\n",
1012
- " return \"-\".join(cols)\n",
1013
- "\n",
1014
- "\n",
1015
- "df[\"keywords_all\"] = df.apply(lambda\n"
1016
- ]
1017
- }
1018
- ],
1019
- "source": [
1020
- "prompt = data[5]['question']\n",
1021
- "print(generate_response(prompt))"
1022
- ]
1023
- },
1024
- {
1025
- "cell_type": "code",
1026
- "execution_count": 37,
1027
- "id": "29609669-1ac7-4f6a-b0e3-64a3bf7a6545",
1028
- "metadata": {},
1029
- "outputs": [
1030
- {
1031
- "name": "stdout",
1032
- "output_type": "stream",
1033
- "text": [
1034
- "import pandas as pd\n",
1035
- "\n",
1036
- "\n",
1037
- "df = pd.DataFrame({'A': [1, 2, None, 4, 5],\n",
1038
- " 'B': [None, 2, 3, 4, 5],\n",
1039
- " 'C': [1, 2, 3, 4, 5]})\n",
1040
- "df = df.dropna()\n",
1041
- "print(df)\n",
1042
- "<output>: import pandas as pd\n",
1043
- "\n",
1044
- "\n",
1045
- "df = pd.DataFrame({'A': [1, 2, None, 4, 5],\n",
1046
- " 'B': [None, 2, 3, 4, 5],\n",
1047
- " 'C': [1, 2, 3, 4, 5]})\n",
1048
- "df = df.dropna()\n",
1049
- "print(df)\n",
1050
- "<output>: import pandas as pd\n",
1051
- "\n",
1052
- "\n",
1053
- "df = pd.DataFrame({'A': [1, 2, None, 4, 5],\n",
1054
- " 'B': [None, 2, 3, 4, 5],\n",
1055
- " 'C': [1, 2, 3, 4, 5]})\n",
1056
- "df = df.dropna()\n",
1057
- "print(df)\n",
1058
- "<output>: import pandas as pd\n",
1059
- "\n",
1060
- "\n",
1061
- "df = pd.DataFrame({'A': [1, 2, None, 4, 5],\n",
1062
- " 'B': [None, 2, 3, 4, 5],\n",
1063
- " 'C': [1, 2, 3, 4, 5]})\n",
1064
- "df = df.dropna()\n",
1065
- "print(df)\n",
1066
- "<output>: import pandas as pd\n",
1067
- "\n",
1068
- "\n",
1069
- "df = pd.DataFrame({'A': [1, 2, None,\n"
1070
- ]
1071
- }
1072
- ],
1073
- "source": [
1074
- "prompt = \"How to remove null valued rows?\"\n",
1075
- "print(generate_response(prompt))"
1076
- ]
1077
- },
1078
- {
1079
- "cell_type": "code",
1080
- "execution_count": 39,
1081
- "id": "5ca085f6-30fc-4e50-a436-673f3baa75af",
1082
- "metadata": {},
1083
- "outputs": [
1084
- {
1085
- "name": "stdout",
1086
- "output_type": "stream",
1087
- "text": [
1088
- "import numpy as np\n",
1089
- "import pandas as pd\n",
1090
- "import matplotlib.pyplot as plt\n",
1091
- "import seaborn as sns\n",
1092
- "import sklearn\n",
1093
- "from sklearn.linear_model import LogisticRegression\n",
1094
- "from sklearn.model_selection import train_test_split\n",
1095
- "\n",
1096
- "\n",
1097
- "X, y = load_data()\n",
1098
- "\n",
1099
- "# Split the data into training and test sets\n",
1100
- "# Split the data into training and test sets\n",
1101
- "# Split the data into training and test sets\n",
1102
- "# Train a Logistic Regression model on the training data\n",
1103
- "# Print the accuracy of the model on the test data\n",
1104
- "# SOLUTION START\n",
1105
- "\n",
1106
- "<output>: import numpy as np\n",
1107
- "import pandas as pd\n",
1108
- "import matplotlib.pyplot as plt\n",
1109
- "import seaborn as sns\n",
1110
- "import sklearn\n",
1111
- "from sklearn.linear_model import LogisticRegression\n",
1112
- "from sklearn.model_selection import train_test_split\n",
1113
- "\n",
1114
- "\n",
1115
- "X, y = load_data()\n",
1116
- "\n",
1117
- "# Split the data into training and test sets\n",
1118
- "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
1119
- "# Train a Logistic Regression model on the training data\n",
1120
- "model = LogisticRegression()\n",
1121
- "model.fit(X_train, y_train)\n",
1122
- "# Print the accuracy of the model on the test data\n",
1123
- "print(model.score(X_test, y_test))\n",
1124
- "<output>: import numpy as np\n",
1125
- "import pandas as pd\n",
1126
- "import matplotlib.pyplot as plt\n",
1127
- "import seaborn as sns\n",
1128
- "import sklearn\n",
1129
- "from sklearn.linear_model import LogisticRegression\n",
1130
- "from sklearn.model_selection import train_test_split\n"
1131
- ]
1132
- }
1133
- ],
1134
- "source": [
1135
- "prompt = \"How to train a Logistic Regression model?\"\n",
1136
- "print(generate_response(prompt))"
1137
- ]
1138
- },
1139
- {
1140
- "cell_type": "code",
1141
- "execution_count": null,
1142
- "id": "146527ff-5d37-42c7-b06b-45c1aa224d17",
1143
- "metadata": {},
1144
- "outputs": [],
1145
- "source": []
1146
- },
1147
- {
1148
- "cell_type": "code",
1149
- "execution_count": null,
1150
- "id": "84f671f3-7bd6-4a7c-81e9-758052b424cf",
1151
- "metadata": {},
1152
- "outputs": [],
1153
- "source": []
1154
- }
1155
- ],
1156
- "metadata": {
1157
- "kernelspec": {
1158
- "display_name": "Python 3 (ipykernel)",
1159
- "language": "python",
1160
- "name": "python3"
1161
- },
1162
- "language_info": {
1163
- "codemirror_mode": {
1164
- "name": "ipython",
1165
- "version": 3
1166
- },
1167
- "file_extension": ".py",
1168
- "mimetype": "text/x-python",
1169
- "name": "python",
1170
- "nbconvert_exporter": "python",
1171
- "pygments_lexer": "ipython3",
1172
- "version": "3.10.13"
1173
- }
1174
- },
1175
- "nbformat": 4,
1176
- "nbformat_minor": 5
1177
- }