Spaces:
Sleeping
Sleeping
Update fine_tune.py
Browse files- fine_tune.py +14 -28
fine_tune.py
CHANGED
@@ -1,46 +1,33 @@
|
|
1 |
from datasets import load_dataset
|
2 |
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, Trainer, TrainingArguments
|
3 |
-
import torch
|
4 |
from sklearn.metrics import accuracy_score
|
5 |
|
6 |
# Load the dataset
|
7 |
dataset = load_dataset("sms_spam")
|
8 |
|
9 |
-
# Print the dataset structure and inspect the columns
|
10 |
-
print(dataset)
|
11 |
-
print(dataset['train'][0]) # Print the first row of the 'train' split
|
12 |
-
|
13 |
# Initialize the tokenizer
|
14 |
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
|
15 |
|
16 |
# Initialize the model
|
17 |
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
|
18 |
|
19 |
-
# Tokenize the dataset
|
20 |
def tokenize_function(examples):
|
21 |
return tokenizer(examples["sms"], padding="max_length", truncation=True)
|
22 |
|
23 |
-
# Apply the tokenization to the dataset
|
24 |
tokenized_datasets = dataset.map(tokenize_function, batched=True)
|
25 |
|
26 |
-
#
|
27 |
train_dataset = tokenized_datasets["train"]
|
|
|
28 |
|
29 |
-
#
|
30 |
-
eval_dataset = tokenized_datasets.get("test", tokenized_datasets.get("validation"))
|
31 |
-
|
32 |
-
# If neither 'test' nor 'validation' exists, manually split the dataset
|
33 |
-
if eval_dataset is None:
|
34 |
-
eval_dataset = train_dataset.shuffle(seed=42).select([i for i in range(len(train_dataset)//10)]) # Take 10% as eval dataset
|
35 |
-
train_dataset = train_dataset.select([i for i in range(len(train_dataset)//10, len(train_dataset))]) # Take the remaining 90% as train dataset
|
36 |
-
|
37 |
-
# Set up training arguments
|
38 |
training_args = TrainingArguments(
|
39 |
output_dir="./results",
|
40 |
-
evaluation_strategy="steps",
|
41 |
-
save_strategy="steps",
|
42 |
-
eval_steps=500,
|
43 |
-
save_steps=500,
|
44 |
learning_rate=2e-5,
|
45 |
per_device_train_batch_size=16,
|
46 |
per_device_eval_batch_size=64,
|
@@ -52,25 +39,24 @@ training_args = TrainingArguments(
|
|
52 |
metric_for_best_model="accuracy",
|
53 |
)
|
54 |
|
55 |
-
#
|
56 |
def compute_metrics(p):
|
57 |
-
|
58 |
-
|
59 |
-
return {"accuracy": accuracy_score(labels, preds)}
|
60 |
|
61 |
-
#
|
62 |
trainer = Trainer(
|
63 |
model=model,
|
64 |
args=training_args,
|
65 |
train_dataset=train_dataset,
|
66 |
eval_dataset=eval_dataset,
|
67 |
-
compute_metrics=compute_metrics,
|
68 |
)
|
69 |
|
70 |
# Train the model
|
71 |
trainer.train()
|
72 |
|
73 |
-
# Save the model
|
74 |
model.save_pretrained("./fine_tuned_model")
|
75 |
tokenizer.save_pretrained("./fine_tuned_model")
|
76 |
|
|
|
1 |
from datasets import load_dataset
|
2 |
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, Trainer, TrainingArguments
|
|
|
3 |
from sklearn.metrics import accuracy_score
|
4 |
|
5 |
# Load the dataset
|
6 |
dataset = load_dataset("sms_spam")
|
7 |
|
|
|
|
|
|
|
|
|
8 |
# Initialize the tokenizer
|
9 |
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
|
10 |
|
11 |
# Initialize the model
|
12 |
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
|
13 |
|
14 |
+
# Tokenize the dataset
|
15 |
def tokenize_function(examples):
    """Tokenize the ``"sms"`` field of a batch of examples.

    The texts are passed to the module-level ``tokenizer`` with
    ``padding="max_length"`` and ``truncation=True``; whatever the tokenizer
    returns is handed back unchanged.
    """
    sms_texts = examples["sms"]
    return tokenizer(sms_texts, padding="max_length", truncation=True)
|
17 |
|
|
|
18 |
tokenized_datasets = dataset.map(tokenize_function, batched=True)
|
19 |
|
20 |
+
# Split into train and evaluation datasets
|
21 |
train_dataset = tokenized_datasets["train"]
|
22 |
+
# Pick a held-out split for evaluation: prefer "test", then "validation".
# Both lookups use .get() so a missing split yields None. Passing
# tokenized_datasets["validation"] as the default argument would be evaluated
# eagerly and raise KeyError whenever that split is absent, even if "test"
# exists.
eval_dataset = tokenized_datasets.get("test")
if eval_dataset is None:
    eval_dataset = tokenized_datasets.get("validation")
if eval_dataset is None:
    # No held-out split is available: carve 10% out of the training split.
    # train_test_split returns disjoint subsets, so rows used for evaluation
    # are removed from the training set. The fixed seed keeps it reproducible.
    held_out = train_dataset.train_test_split(test_size=0.1, seed=42)
    train_dataset = held_out["train"]
    eval_dataset = held_out["test"]
|
23 |
|
24 |
+
# Training arguments
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
training_args = TrainingArguments(
|
26 |
output_dir="./results",
|
27 |
+
evaluation_strategy="steps",
|
28 |
+
save_strategy="steps",
|
29 |
+
eval_steps=500,
|
30 |
+
save_steps=500,
|
31 |
learning_rate=2e-5,
|
32 |
per_device_train_batch_size=16,
|
33 |
per_device_eval_batch_size=64,
|
|
|
39 |
metric_for_best_model="accuracy",
|
40 |
)
|
41 |
|
42 |
+
# Metrics for evaluation
|
43 |
def compute_metrics(p):
    """Compute accuracy for one evaluation pass.

    The predicted class is the arg-max of ``p.predictions`` along axis 1;
    it is scored against ``p.label_ids`` with ``accuracy_score``.

    Returns a dict with the single key ``"accuracy"``.
    """
    predicted_classes = p.predictions.argmax(axis=1)
    accuracy = accuracy_score(p.label_ids, predicted_classes)
    return {"accuracy": accuracy}
|
|
|
46 |
|
47 |
+
# Trainer
|
48 |
trainer = Trainer(
|
49 |
model=model,
|
50 |
args=training_args,
|
51 |
train_dataset=train_dataset,
|
52 |
eval_dataset=eval_dataset,
|
53 |
+
compute_metrics=compute_metrics,
|
54 |
)
|
55 |
|
56 |
# Train the model
|
57 |
trainer.train()
|
58 |
|
59 |
+
# Save the model and tokenizer
|
60 |
model.save_pretrained("./fine_tuned_model")
|
61 |
tokenizer.save_pretrained("./fine_tuned_model")
|
62 |
|