File size: 3,642 Bytes
b4e7c84 afb9af6 b4e7c84 afb9af6 cef5f51 afb9af6 4247f47 afb9af6 b4e7c84 afb9af6 b4e7c84 afb9af6 b4e7c84 5580aca b4e7c84 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 |
---
language:
- ms
---
# Full Parameter Finetuning TinyLlama 16384 context length on Malaysian instructions dataset
README at https://github.com/mesolitica/malaya/tree/5.1/session/tiny-llama#instructions-7b-16384-context-length
We use the exact Llama2 Instruct chat template.
WandB, https://wandb.ai/mesolitica/fpf-tinyllama-1.1b-hf-instructions-16k-function-call?workspace=user-husein-mesolitica
WandB report, https://wandb.ai/mesolitica/fpf-mallam-5b-instructions-16k/reports/Instruction-finetuning--Vmlldzo2MjE5Njg2
## Dataset
Dataset gathered at https://huggingface.co/collections/mesolitica/malaysian-synthetic-dataset-656c2673fe7fe0b1e9e25fe2
Notebook to prepare dataset at https://github.com/mesolitica/malaysian-dataset/blob/master/llm-instruction/combine-malay-no-alignment-multitasks-partial-ultrachat-v2.ipynb
## Limitations
This model is a quick demonstration that the base model can be easily fine-tuned to achieve reasonable performance.
It has only minimal moderation mechanisms.
## how-to
```python
import json

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
def parse_llama_chat(
    messages,
    function_call = None,
    default_system = 'Anda adalah pembantu AI yang berguna dan mampu jawab segala soalan yang diberikan.'
):
    """Render an OpenAI-style chat message list into a Llama2 Instruct prompt.

    Parameters
    ----------
    messages : list of dict
        Each dict has ``'role'`` (``'system'``/``'user'``/``'assistant'``) and
        ``'content'``. The last entry is treated as the pending user query.
    function_call : list of dict, optional
        Function schemas; each is JSON-encoded (indent=4) and embedded under a
        ``[FUNCTIONCALL]`` header right after the system block. Requires the
        file-level ``import json`` (missing in the original example).
    default_system : str
        System prompt used when ``messages[0]`` is not a system turn.

    Returns
    -------
    str
        Prompt in the ``<s>[INST] <<SYS>>...<</SYS>> ... [/INST]`` template.
    """
    # Take the system prompt from the first message if present, else fall
    # back to the default; start_index skips the consumed system turn.
    if messages[0]['role'] != 'system':
        system = default_system
        start_index = 0
    else:
        system = messages[0]['content']
        start_index = 1

    # The final message is the query awaiting a reply; everything in between
    # is split into alternating user/assistant history turns.
    user_query = messages[-1]['content']
    users, assistants = [], []
    for turn in messages[start_index:-1]:
        if turn['role'] == 'user':
            users.append(turn['content'])
        elif turn['role'] == 'assistant':
            assistants.append(turn['content'])

    texts = [f'<s>[INST] <<SYS>>\n{system}\n<</SYS>>\n\n']
    if function_call:
        # Bug fix: json.dumps previously raised NameError because the example
        # never imported json (now imported at the top of the file).
        fs = '\n\n'.join(json.dumps(f, indent=4) for f in function_call)
        texts.append(f'\n[FUNCTIONCALL]\n{fs}\n')
    for u, a in zip(users, assistants):
        texts.append(f'{u.strip()} [/INST] {a.strip()} </s><s>[INST] ')
    texts.append(f'{user_query.strip()} [/INST]')
    return ''.join(texts).strip()
# Compute dtype for the 4-bit quantized weights below.
TORCH_DTYPE = 'bfloat16'
# NF4 4-bit quantization with double quantization so the 1.1B model fits in
# modest GPU memory; matmuls run in bfloat16.
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=getattr(torch, TORCH_DTYPE)
)
tokenizer = AutoTokenizer.from_pretrained('mesolitica/malaysian-tinyllama-1.1b-16k-instructions')
# NOTE(review): use_flash_attention_2 requires the flash-attn package and a
# compatible GPU — confirm the environment before running.
model = AutoModelForCausalLM.from_pretrained(
    'mesolitica/malaysian-tinyllama-1.1b-16k-instructions',
    use_flash_attention_2 = True,
    quantization_config = nf4_config
)
messages = [
    {'role': 'system', 'content': 'awak adalah AI yang mampu jawab segala soalan'},
    {'role': 'user', 'content': 'kwsp tu apa'}
]
prompt = parse_llama_chat(messages)
# add_special_tokens=False because parse_llama_chat already emits the <s>
# token itself; tokenizing it again would duplicate the BOS token.
inputs = tokenizer([prompt], return_tensors='pt', add_special_tokens=False).to('cuda')
# Sampling-based generation settings for the demo completion.
generate_kwargs = dict(
    inputs,
    max_new_tokens=1024,
    top_p=0.95,
    top_k=50,
    temperature=0.9,
    do_sample=True,
    num_beams=1,
)
r = model.generate(**generate_kwargs)
print(tokenizer.decode(r[0]))
```
```
<s> [INST] <<SYS>>
awak adalah AI yang mampu jawab segala soalan
<</SYS>>
kwsp tu apa [/INST] KWSP (Kumpulan Wang Simpanan Pekerja) merupakan sistem persaraan yang disediakan oleh kerajaan Malaysia untuk memberikan simpanan dan kebajikan kepada pekerja dan pekerja yang berumur 55 tahun ke atas. KWSP adalah singkatan bagi "Kumpulan Wang Simpanan Pekerja" dan ia merupakan salah satu dana persaraan yang popular di Malaysia. </s>
``` |