ping98k
/

gemma-7b-translator-0.2-lora

Model card Files Files and versions Community

gemma-7b-translator-0.2-lora / README.md

ping98k's picture

Update README.md

2de76b0 verified 9 months ago

|

1.99 kB

	---
	library_name: peft
	base_model: unsloth/gemma-7b-bnb-4bit
	---

	Testing parameters by using `split="test"`.

	Code to create the dataset:

	```python
	import random


	# Prompt template with three positional slots, filled in order with the
	# source text, the target language code, and the translated text.
	alpaca_prompt = """<original>{}</original>
	<translate to="{}">{}"""

	# NOTE(review): `tokenizer` is not defined in this snippet; it must already
	# be in scope (presumably the model's tokenizer loaded earlier) - confirm.
	BOS_TOKEN = tokenizer.bos_token # Beginning-of-sequence token; not referenced elsewhere in this snippet
	EOS_TOKEN = "</translate>"+tokenizer.eos_token # Closes the <translate> tag, then ends the sequence; appended to every example
	def formatting_prompts_func(examples):
	    """Pack translation pairs into randomly sized, randomly directed prompts.

	    Consecutive rows of ``examples["translation"]`` are concatenated into
	    groups. The first group holds a single row and is translated to Thai;
	    every later group draws its size from ``random.randint(1, 5)`` and its
	    target language from ``random.choice(['en', 'th'])``. Rows inside a
	    group are separated by 1-5 newlines, and the same separator is used on
	    the English and the Thai side so the two stay aligned.

	    Args:
	        examples: Batch mapping whose ``"translation"`` entry is a sequence
	            of dicts with ``'en'`` and ``'th'`` strings.

	    Returns:
	        ``{"text": [...]}`` with one formatted prompt per group. The list
	        may be shorter than the input batch and is empty for an empty
	        batch.
	    """

	    def build_prompt(text_en, text_th, translate_to):
	        # The side that is not the target language goes inside <original>.
	        if translate_to == 'th':
	            source, target = text_en, text_th
	        else:
	            source, target = text_th, text_en
	        return alpaca_prompt.format(source, translate_to, target) + EOS_TOKEN

	    texts = []
	    text_en = ""
	    text_th = ""
	    translate_to = 'th'
	    max_group_count = 1
	    group_count = 0
	    for translation in examples["translation"]:
	        if group_count >= max_group_count:
	            # The current group is full: emit it, then draw the size and
	            # direction of the next group.
	            texts.append(build_prompt(text_en, text_th, translate_to))
	            text_en = ""
	            text_th = ""
	            max_group_count = random.randint(1, 5)
	            group_count = 0
	            translate_to = random.choice(['en', 'th'])

	        # Drawn for every row, even the first of a group where it goes
	        # unused, so the sequence of random draws stays the same.
	        newlines = '\n' * random.randint(1, 5)
	        if group_count == 0:
	            # Decide "first row of the group" by count rather than by an
	            # empty string, so a row with empty English text cannot make
	            # the next row overwrite the accumulated Thai text.
	            text_en = translation['en']
	            text_th = translation['th']
	        else:
	            text_en = text_en + newlines + translation['en']
	            text_th = text_th + newlines + translation['th']
	        group_count += 1

	    # Flush the trailing group; skipped for an empty batch so no prompt
	    # with empty source and target is emitted.
	    if group_count:
	        texts.append(build_prompt(text_en, text_th, translate_to))
	    return {"text": texts}


	from datasets import load_dataset
	# Load the 'enth' configuration of scb_mt_enth_2020; only its "test" split is used.
	dataset = load_dataset(
	    "scb_mt_enth_2020",
	    'enth',
	    split="test",
	)
	# Replace the raw columns with the packed "text" prompts. The mapping is
	# batched, so the function receives whole batches of rows at a time.
	dataset = dataset.map(
	    formatting_prompts_func,
	    batched=True,
	    remove_columns=["translation", 'subdataset'],
	)
	# Hold out 10% of the prompts, shuffling before the split.
	dataset = dataset.train_test_split(
	    test_size=0.1,
	    shuffle=True,
	)
	# Bare expression: previews the first five training rows when run interactively.
	dataset['train'][0:5]
	```