model:
  task: text-generation
  # System prompt (Korean). In English: "You are a chatbot that finds the
  # Answer to the Question in the given Context. Find the part of the Context
  # that can be the Answer and copy it verbatim. The Answer must be a short
  # answer, not free-form."
  system_prompt: "너는 주어진 Context에서 Question에 대한 Answer를 찾는 챗봇이야. Context에서 Answer가 될 수 있는 부분을 찾아서 그대로 적어줘. 단, Answer는 주관식이 아니라 단답형으로 적어야 해."
  path: MLP-KTLim/llama-3-Korean-Bllossom-8B
  torch_dtype: auto
  device_map: auto
  attn_implementation: sdpa
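# A minimal sketch (an assumption, not code from this repo) of how the model
# block maps onto a Hugging Face Transformers pipeline:
#
#   from transformers import pipeline
#   pipe = pipeline(
#       task="text-generation",                        # model.task
#       model="MLP-KTLim/llama-3-Korean-Bllossom-8B",  # model.path
#       torch_dtype="auto",                            # model.torch_dtype
#       device_map="auto",                             # model.device_map
#       model_kwargs={"attn_implementation": "sdpa"},  # model.attn_implementation
#   )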
dataset:
  path: jijihuny/economics_qa
  name: train
  shuffle: false
  test_size: null
  include_answer: true
metric:
  path: jijihuny/ecqa
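# A sketch of how these two blocks would typically be consumed (treating
# dataset.name as the split to load is an assumption, as is loading the
# metric from the Hub via evaluate):
#
#   from datasets import load_dataset
#   import evaluate
#   ds = load_dataset("jijihuny/economics_qa", split="train")  # dataset.path / dataset.name
#   metric = evaluate.load("jijihuny/ecqa")                    # metric.path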
generation:
  # Do not include the prompt in the returned text (false)
  return_full_text: false
  # Maximum number of new tokens to generate
  max_new_tokens: null
  # Stochastic decoding on/off
  do_sample: false
  # Sample only from the top-K vocabulary entries
  top_k: 1
  # Nucleus sampling: smallest V' ⊆ V s.t. \sum_{v \in V'} P(v | x_{<i}) \geq p
  top_p: 0.95
  # softmax(x / T)
  # T > 1     => smoother (uniform as T -> \infty)
  # 0 < T < 1 => sharper (deterministic as T -> 0+)
  temperature: 1.0
  # Penalty on already-generated tokens; values > 1.0 discourage repetition
  repetition_penalty: null
  # Contrastive search (degeneration penalty):
  # x_i = argmax_v [ (1 - alpha) * p(v | x_{<i}) - alpha * max_{j < i} similarity(h_v, h_{x_j}) ]
  penalty_alpha: null
  # DoLa decoding: https://arxiv.org/abs/2309.03883
  dola_layers: null
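# The generation block is passed through as generation kwargs. With
# do_sample: false and top_k: 1 this is greedy decoding; top_p and
# temperature only take effect once do_sample is true. A sketch
# (`prompt` is a hypothetical input string):
#
#   out = pipe(
#       prompt,
#       return_full_text=False,
#       do_sample=False,
#       top_k=1,
#       top_p=0.95,
#       temperature=1.0,
#   )[0]["generated_text"]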
train:
  instruction_template: "<|start_header_id|>user<|end_header_id|>"
  response_template: "<|start_header_id|>assistant<|end_header_id|>"
  use_completion_only_data_collator: false
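  # The two templates mark where the user and assistant turns begin so that
  # loss can be masked to the assistant's completion only. When the flag above
  # is true, this maps onto TRL's collator (a sketch; `tokenizer` assumed):
  #
  #   from trl import DataCollatorForCompletionOnlyLM
  #   collator = DataCollatorForCompletionOnlyLM(
  #       response_template="<|start_header_id|>assistant<|end_header_id|>",
  #       instruction_template="<|start_header_id|>user<|end_header_id|>",
  #       tokenizer=tokenizer,
  #   )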
  quantization:
    load_in_4bit: true
    bnb_4bit_quant_type: nf4
    bnb_4bit_compute_dtype: bfloat16
    bnb_4bit_use_double_quant: true
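  # QLoRA-style 4-bit quantization; these fields correspond one-to-one to
  # transformers.BitsAndBytesConfig (a sketch):
  #
  #   import torch
  #   from transformers import BitsAndBytesConfig
  #   bnb_config = BitsAndBytesConfig(
  #       load_in_4bit=True,
  #       bnb_4bit_quant_type="nf4",              # 4-bit NormalFloat
  #       bnb_4bit_compute_dtype=torch.bfloat16,  # dtype used for matmuls
  #       bnb_4bit_use_double_quant=True,         # also quantize the quantization constants
  #   )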
  lora:
    r: 16
    lora_alpha: 32
    lora_dropout: 0.05
    bias: none
    target_modules:
      - up_proj
      - down_proj
      - gate_proj
      - k_proj
      - q_proj
      - v_proj
      - o_proj
      # - lm_head
    task_type: CAUSAL_LM
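  # LoRA adapters on every attention and MLP projection; maps directly onto
  # peft.LoraConfig (a sketch):
  #
  #   from peft import LoraConfig
  #   lora_config = LoraConfig(
  #       r=16, lora_alpha=32, lora_dropout=0.05, bias="none",
  #       target_modules=["up_proj", "down_proj", "gate_proj",
  #                       "k_proj", "q_proj", "v_proj", "o_proj"],
  #       task_type="CAUSAL_LM",
  #   )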
  args:
    output_dir: llama3-qlora-r16-a32
    run_name: llama3-qlora-r16-a32
    report_to: wandb
    # dataloader_num_workers: 4
    torch_empty_cache_steps: 3
    # group_by_length: true
    max_seq_length: 2048
    eval_strategy: steps
    per_device_train_batch_size: 16
    per_device_eval_batch_size: 32
    gradient_accumulation_steps: 1
    eval_accumulation_steps: 1
    optim: paged_adamw_8bit
    bf16: true
    bf16_full_eval: true
    learning_rate: 0.0002
    weight_decay: 0.01
    num_train_epochs: 3
    warmup_ratio: 0.005
    max_grad_norm: 2.0
    # Fractional eval/save steps are interpreted as a ratio of total training steps
    eval_steps: 0.2
    eval_on_start: false
    save_steps: 0.2
    logging_steps: 1
    push_to_hub: true
    # torch_compile: true
    seed: 42
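  # The args block feeds the trainer; a minimal wiring sketch with TRL
  # (assumed entry point; the repo's actual runner may differ). `model`, `ds`,
  # and `lora_config` come from the sketches above:
  #
  #   from trl import SFTConfig, SFTTrainer
  #   training_args = SFTConfig(
  #       output_dir="llama3-qlora-r16-a32",
  #       max_seq_length=2048,
  #       per_device_train_batch_size=16,
  #       learning_rate=2.0e-4,
  #       num_train_epochs=3,
  #       bf16=True,
  #       optim="paged_adamw_8bit",
  #       report_to="wandb",
  #       seed=42,
  #   )
  #   trainer = SFTTrainer(model=model, args=training_args,
  #                        train_dataset=ds, peft_config=lora_config)
  #   trainer.train()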