Update app.py
app.py
CHANGED
@@ -37,6 +37,11 @@ tokenizer = AutoTokenizer.from_pretrained(model_name)
 data = {"text": [clean_text]}
 dataset = Dataset.from_dict(data)
 
+# Set a padding token manually
+tokenizer.pad_token = tokenizer.eos_token  # Use EOS as PAD token
+# Alternatively, add a new custom pad token
+# tokenizer.add_special_tokens({'pad_token': '[PAD]'})
+
 # Tokenization function
 def tokenize_function(examples):
     tokens = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)
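
For context, here is a minimal, self-contained sketch of the setup app.py reaches after this commit. The "gpt2" tokenizer and the sample sentence are illustrative stand-ins; the actual model_name and clean_text come from earlier parts of app.py that are not shown in this hunk.

# Minimal sketch of the padding-token setup introduced by this commit.
# "gpt2" and the sample sentence are placeholders, not the app's real values.
from datasets import Dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

data = {"text": ["This is a sample cleaned sentence."]}
dataset = Dataset.from_dict(data)

# Option used in the diff: reuse the EOS token as the PAD token.
tokenizer.pad_token = tokenizer.eos_token
# Alternative kept commented out in the diff: register a dedicated [PAD] token.
# If a model were loaded alongside, its embeddings would then need resizing,
# e.g. model.resize_token_embeddings(len(tokenizer)).
# tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)

tokenized_dataset = dataset.map(tokenize_function, batched=True)
print(tokenized_dataset[0]["input_ids"][:10])

Reusing the EOS token avoids changing the vocabulary size, which is why it is the uncommented choice here; adding a new [PAD] token is the alternative noted in the comment.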