Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
@@ -35,7 +35,10 @@ def main():
|
|
35 |
|
36 |
# Tokenize the datasets
|
37 |
def tokenize_function(examples):
|
38 |
-
|
|
|
|
|
|
|
39 |
|
40 |
train_dataset = train_dataset.map(tokenize_function, batched=True)
|
41 |
val_dataset = val_dataset.map(tokenize_function, batched=True)
|
|
|
35 |
|
36 |
# Tokenize the datasets
|
37 |
def tokenize_function(examples):
    """Tokenize the ``question`` field and attach causal-LM labels.

    The text is padded/truncated to a fixed length of 128 tokens. Labels
    start out as a copy of ``input_ids`` (the model shifts them internally
    for next-token prediction), but every padding position is replaced by
    ``-100`` so that it does not contribute to the training loss.

    Args:
        examples: A mapping with a ``'question'`` entry. It is a batch
            (list of strings) when used with ``Dataset.map(batched=True)``;
            a single string is also accepted.

    Returns:
        The tokenizer output with an additional ``'labels'`` entry.
    """
    # Label value skipped by the loss function (PyTorch / Transformers convention).
    ignore_index = -100

    # `tokenizer` comes from the enclosing scope.
    encodings = tokenizer(
        examples['question'],
        padding='max_length',
        truncation=True,
        max_length=128,
    )
    input_ids = encodings['input_ids']

    if 'attention_mask' not in encodings:
        # Without a mask we cannot tell padding from content; keep the plain copy.
        encodings['labels'] = input_ids.copy()
        return encodings

    attention_mask = encodings['attention_mask']

    def mask_padding(ids, mask):
        # Keep real tokens as labels; blank out positions the mask marks as padding.
        return [tok if keep else ignore_index for tok, keep in zip(ids, mask)]

    if input_ids and isinstance(input_ids[0], (list, tuple)):
        # Batched call: one list of token ids per example.
        encodings['labels'] = [
            mask_padding(ids, mask)
            for ids, mask in zip(input_ids, attention_mask)
        ]
    else:
        # Single example: a flat list of token ids.
        encodings['labels'] = mask_padding(input_ids, attention_mask)
    return encodings
|
42 |
|
43 |
train_dataset = train_dataset.map(tokenize_function, batched=True)
|
44 |
val_dataset = val_dataset.map(tokenize_function, batched=True)
|