|
--- |
|
library_name: peft |
|
base_model: unsloth/gemma-7b-bnb-4bit |
|
--- |
|
|
|
Parameters were tested using `split="test"` of the dataset. |
|
|
|
Code used to create the dataset: |
|
|
|
```python |
|
import random |
|
|
|
|
|
# Prompt template. Its three slots are filled, in order, with the source text,
# the target language code, and the translated text.
alpaca_prompt = """<original>{}</original> |
|
<translate to="{}">{}""" |
|
|
|
BOS_TOKEN = tokenizer.bos_token # Beginning-of-sequence token from the tokenizer; not referenced in the visible snippet |
|
EOS_TOKEN = "</translate>"+tokenizer.eos_token # Closing </translate> tag plus the tokenizer's EOS token; appended to every formatted example |
|
def _render_translation_prompt(text_en, text_th, translate_to):
    """Fill the module-level prompt template for one group and append EOS_TOKEN.

    The text written in ``translate_to`` goes into the ``<translate>`` slot;
    the text in the other language goes into the ``<original>`` slot.
    """
    if translate_to == 'th':
        return alpaca_prompt.format(text_en, translate_to, text_th) + EOS_TOKEN
    return alpaca_prompt.format(text_th, translate_to, text_en) + EOS_TOKEN


def formatting_prompts_func(examples):
    """Build translation prompts from a batch of English/Thai sentence pairs.

    Consecutive pairs from ``examples["translation"]`` are merged into groups.
    The first group holds a single pair and is rendered English -> Thai.
    Every later group gets a random size of 1-5 pairs and a random target
    language ('en' or 'th'). Pairs inside a group are joined with 1-5
    newline characters; the same separator is used on both language sides.

    Args:
        examples: Batch mapping with a "translation" entry, a sequence of
            dicts that each provide an 'en' and a 'th' string.

    Returns:
        A dict ``{"text": [...]}`` holding one rendered prompt per group,
        each ending in ``EOS_TOKEN``. An empty batch yields an empty list.
    """
    translations = examples["translation"]
    texts = []
    en_parts = []
    th_parts = []
    translate_to = 'th'
    max_group_count = 1
    group_count = 0

    for translation in translations:
        if group_count >= max_group_count:
            # Current group is full: emit it, then draw the size and the
            # direction of the next group.
            texts.append(_render_translation_prompt(
                "".join(en_parts), "".join(th_parts), translate_to))
            en_parts = []
            th_parts = []
            max_group_count = random.randint(1, 5)
            group_count = 0
            translate_to = random.choice(['en', 'th'])

        # Drawn for every pair, even the first of a group where it is not
        # used, so seeded runs consume random numbers in the original order.
        newlines = '\n' * random.randint(1, 5)

        # Decide "first pair of the group" from the counter rather than from
        # an empty English string: an empty 'en' entry must not discard the
        # Thai text that was already collected for this group.
        if group_count > 0:
            en_parts.append(newlines)
            th_parts.append(newlines)
        en_parts.append(translation['en'])
        th_parts.append(translation['th'])
        group_count += 1

    # Emit the trailing group; skip it when the batch was empty so no prompt
    # made only of empty strings is produced.
    if group_count > 0:
        texts.append(_render_translation_prompt(
            "".join(en_parts), "".join(th_parts), translate_to))

    return {"text": texts}
|
|
|
|
|
from datasets import load_dataset |
|
# Load the "test" split of the English-Thai configuration of scb_mt_enth_2020.
dataset = load_dataset("scb_mt_enth_2020", "enth", split="test")

# Turn each batch of pairs into prompt strings; the source columns are dropped
# so that only the generated "text" column remains.
dataset = dataset.map(
    formatting_prompts_func,
    batched=True,
    remove_columns=["translation", "subdataset"],
)

# Shuffle and hold out 10% of the formatted examples.
dataset = dataset.train_test_split(test_size=0.1, shuffle=True)

# Preview the first five training examples.
dataset["train"][:5]
|
``` |