dushuai112233 committed on
Commit
7043406
·
verified ·
1 Parent(s): d75cc4b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +4 -1
app.py CHANGED
@@ -35,7 +35,10 @@ def main():
35
 
36
  # Tokenize the datasets
37
  def tokenize_function(examples):
38
- return tokenizer(examples['question'], padding='max_length', truncation=True, max_length=128)
 
 
 
39
 
40
  train_dataset = train_dataset.map(tokenize_function, batched=True)
41
  val_dataset = val_dataset.map(tokenize_function, batched=True)
 
35
 
36
  # Tokenize the datasets
37
  def tokenize_function(examples):
38
+ # 注意: 对于 Causal LM,通常会使用输入文本作为标签(shifted label)
39
+ encodings = tokenizer(examples['question'], padding='max_length', truncation=True, max_length=128)
40
+ encodings['labels'] = encodings['input_ids'].copy() # Causal LM labels should be same as input_ids
41
+ return encodings
42
 
43
  train_dataset = train_dataset.map(tokenize_function, batched=True)
44
  val_dataset = val_dataset.map(tokenize_function, batched=True)