Spaces:

dushuai112233
/

LLM

Paused

dushuai112233 committed on Jan 3

Commit

7043406

verified ·

1 Parent(s): d75cc4b

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -35,7 +35,10 @@ def main():
     # Tokenize the datasets
     def tokenize_function(examples):
-        return tokenizer(examples['question'], padding='max_length', truncation=True, max_length=128)
     train_dataset = train_dataset.map(tokenize_function, batched=True)
     val_dataset = val_dataset.map(tokenize_function, batched=True)

     # Tokenize the datasets
     def tokenize_function(examples):
+        # Note: for a causal LM, the input text itself is usually used as the labels (shifted label)
+        encodings = tokenizer(examples['question'], padding='max_length', truncation=True, max_length=128)
+        encodings['labels'] = encodings['input_ids'].copy()  # labels mirror input_ids; NOTE(review): padded positions are copied too, not masked (e.g. to -100) - confirm this is intended
+        return encodings
     train_dataset = train_dataset.map(tokenize_function, batched=True)
     val_dataset = val_dataset.map(tokenize_function, batched=True)