---
library_name: peft
base_model: unsloth/gemma-7b-bnb-4bit
---
This dataset was built from the test split of the source corpus (`split="test"`).

Code used to create the dataset:
```python
import random
# Prompt template: {0} = source text, {1} = target-language code ('en'/'th'),
# {2} = target text. The closing </translate> tag is supplied by EOS_TOKEN.
alpaca_prompt = """<original>{}</original>
<translate to="{}">{}"""
# NOTE(review): `tokenizer` must already exist in the session (e.g. loaded via
# unsloth/transformers) — it is not defined in this snippet.
BOS_TOKEN = tokenizer.bos_token # tokenizer BOS token (defined but unused below)
EOS_TOKEN = "</translate>"+tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples, prompt_template=None, eos_token=None, rng=None):
    """Batch-map callback: pack translation pairs into translation prompts.

    Consecutive sentence pairs are grouped into chunks of random size
    (1-5 pairs, joined by a random number of newlines so the model also
    sees multi-line passages), and each chunk is rendered once in a
    randomly chosen direction (en->th or th->en).

    Args:
        examples: batch dict with a "translation" column of
            ``{"en": str, "th": str}`` dicts.
        prompt_template: optional override for the module-level
            ``alpaca_prompt`` (mainly for testing). Same 3-slot format.
        eos_token: optional override for the module-level ``EOS_TOKEN``.
        rng: optional ``random.Random``-like source of randomness;
            defaults to the module-level ``random`` for backward
            compatibility.

    Returns:
        ``{"text": [str, ...]}`` — one formatted prompt per chunk.
        An empty batch yields an empty list (the previous version emitted
        one degenerate prompt built from empty strings).
    """
    template = alpaca_prompt if prompt_template is None else prompt_template
    eos = EOS_TOKEN if eos_token is None else eos_token
    rand = random if rng is None else rng

    def _render(src_en, src_th, target):
        # The <original> slot holds the side we translate FROM.
        if target == 'th':
            return template.format(src_en, target, src_th) + eos
        return template.format(src_th, target, src_en) + eos

    texts = []
    text_en = ""
    text_th = ""
    translate_to = 'th'
    max_group_count = 1
    group_count = 0
    for translation in examples["translation"]:
        if group_count >= max_group_count:
            # Current chunk is full: emit it, then draw a fresh chunk
            # size and translation direction for the next one.
            texts.append(_render(text_en, text_th, translate_to))
            text_en = ""
            text_th = ""
            max_group_count = rand.randint(1, 5)
            group_count = 0
            translate_to = rand.choice(['en', 'th'])
        # Drawn unconditionally (even when unused for a chunk's first
        # pair) to keep the RNG stream identical to the original code.
        newlines = '\n' * rand.randint(1, 5)
        if text_en == "":
            text_en = translation['en']
            text_th = translation['th']
        else:
            text_en = text_en + newlines + translation['en']
            text_th = text_th + newlines + translation['th']
        group_count += 1
    # Flush the trailing chunk — only if any pairs were accumulated,
    # so an empty batch produces no spurious sample.
    if group_count:
        texts.append(_render(text_en, text_th, translate_to))
    return {"text": texts}
from datasets import load_dataset

# Held-out split of the English–Thai parallel corpus.
raw = load_dataset("scb_mt_enth_2020", "enth", split="test")
# Render each batch of translation pairs into prompt-formatted rows.
formatted = raw.map(
    formatting_prompts_func,
    batched=True,
    remove_columns=["translation", "subdataset"],
)
# Carve out 10% for evaluation.
dataset = formatted.train_test_split(test_size=0.1, shuffle=True)
dataset["train"][0:5]
```