Anuj02003 committed on
Commit
9a79179
·
verified ·
1 Parent(s): e220540

Update fine_tune.py

Browse files
Files changed (1) hide show
  1. fine_tune.py +14 -28
fine_tune.py CHANGED
@@ -1,46 +1,33 @@
1
  from datasets import load_dataset
2
  from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, Trainer, TrainingArguments
3
- import torch
4
  from sklearn.metrics import accuracy_score
5
 
6
  # Load the dataset
7
  dataset = load_dataset("sms_spam")
8
 
9
- # Print the dataset structure and inspect the columns
10
- print(dataset)
11
- print(dataset['train'][0]) # Print the first row of the 'train' split
12
-
13
  # Initialize the tokenizer
14
  tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
15
 
16
  # Initialize the model
17
  model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
18
 
19
- # Tokenize the dataset using the correct column
20
  def tokenize_function(examples):
21
  return tokenizer(examples["sms"], padding="max_length", truncation=True)
22
 
23
- # Apply the tokenization to the dataset
24
  tokenized_datasets = dataset.map(tokenize_function, batched=True)
25
 
26
- # Check if 'test' split exists, else use 'validation' or create your own split
27
  train_dataset = tokenized_datasets["train"]
 
28
 
29
- # If there is no 'test' split, you can use 'validation' or manually split the dataset
30
- eval_dataset = tokenized_datasets.get("test", tokenized_datasets.get("validation"))
31
-
32
- # If neither 'test' nor 'validation' exists, manually split the dataset
33
- if eval_dataset is None:
34
- eval_dataset = train_dataset.shuffle(seed=42).select([i for i in range(len(train_dataset)//10)]) # Take 10% as eval dataset
35
- train_dataset = train_dataset.select([i for i in range(len(train_dataset)//10, len(train_dataset))]) # Take the remaining 90% as train dataset
36
-
37
- # Set up training arguments
38
  training_args = TrainingArguments(
39
  output_dir="./results",
40
- evaluation_strategy="steps", # Evaluate every 'eval_steps'
41
- save_strategy="steps", # Save every 'save_steps'
42
- eval_steps=500, # Evaluate every 500 steps
43
- save_steps=500, # Save every 500 steps
44
  learning_rate=2e-5,
45
  per_device_train_batch_size=16,
46
  per_device_eval_batch_size=64,
@@ -52,25 +39,24 @@ training_args = TrainingArguments(
52
  metric_for_best_model="accuracy",
53
  )
54
 
55
- # Define compute_metrics function (optional, if you want to track metrics)
56
  def compute_metrics(p):
57
- predictions, labels = p
58
- preds = predictions.argmax(axis=1)
59
- return {"accuracy": accuracy_score(labels, preds)}
60
 
61
- # Initialize the Trainer
62
  trainer = Trainer(
63
  model=model,
64
  args=training_args,
65
  train_dataset=train_dataset,
66
  eval_dataset=eval_dataset,
67
- compute_metrics=compute_metrics, # Optional: to compute accuracy
68
  )
69
 
70
  # Train the model
71
  trainer.train()
72
 
73
- # Save the model after training
74
  model.save_pretrained("./fine_tuned_model")
75
  tokenizer.save_pretrained("./fine_tuned_model")
76
 
 
1
  from datasets import load_dataset
2
  from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, Trainer, TrainingArguments
 
3
  from sklearn.metrics import accuracy_score
4
 
5
  # Load the dataset
6
  dataset = load_dataset("sms_spam")
7
 
 
 
 
 
8
  # Initialize the tokenizer
9
  tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
10
 
11
  # Initialize the model
12
  model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
13
 
14
+ # Tokenize the dataset
15
  def tokenize_function(examples):
16
  return tokenizer(examples["sms"], padding="max_length", truncation=True)
17
 
 
18
  tokenized_datasets = dataset.map(tokenize_function, batched=True)
19
 
20
+ # Split into train and evaluation datasets
21
  train_dataset = tokenized_datasets["train"]
22
+ eval_dataset = tokenized_datasets.get("test", tokenized_datasets["validation"])
23
 
24
+ # Training arguments
 
 
 
 
 
 
 
 
25
  training_args = TrainingArguments(
26
  output_dir="./results",
27
+ evaluation_strategy="steps",
28
+ save_strategy="steps",
29
+ eval_steps=500,
30
+ save_steps=500,
31
  learning_rate=2e-5,
32
  per_device_train_batch_size=16,
33
  per_device_eval_batch_size=64,
 
39
  metric_for_best_model="accuracy",
40
  )
41
 
42
+ # Metrics for evaluation
43
  def compute_metrics(p):
44
+ preds = p.predictions.argmax(axis=1)
45
+ return {"accuracy": accuracy_score(p.label_ids, preds)}
 
46
 
47
+ # Trainer
48
  trainer = Trainer(
49
  model=model,
50
  args=training_args,
51
  train_dataset=train_dataset,
52
  eval_dataset=eval_dataset,
53
+ compute_metrics=compute_metrics,
54
  )
55
 
56
  # Train the model
57
  trainer.train()
58
 
59
+ # Save the model and tokenizer
60
  model.save_pretrained("./fine_tuned_model")
61
  tokenizer.save_pretrained("./fine_tuned_model")
62