jaynopponep committed
Commit 021a5c8 · verified · 1 Parent(s): 71170a2

Update train.py

Files changed (1):
  1. train.py +21 -38
train.py CHANGED
@@ -1,58 +1,29 @@
-from model import get_model
-import torch
-from transformers import BertTokenizer, Trainer, TrainingArguments
+from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
 from datasets import load_dataset
 import numpy as np
 from sklearn.metrics import accuracy_score, precision_recall_fscore_support
-from torch.utils.data import DataLoader
-from sklearn.utils.class_weight import compute_class_weight
 
-# Other imports and code remain the same...
-
-# Compute class weights
-class_weights = compute_class_weight(
-    'balanced', classes=np.unique(train_dataset['labels']), y=train_dataset['labels'])
-class_weights = torch.tensor(class_weights, dtype=torch.float)
-
-# Update the model's classifier with class weights
-model.classifier.weight.data = class_weights
-# Load dataset dynamically or from a config
+# Load dataset
 dataset_name = "NicolaiSivesind/human-vs-machine"
 dataset = load_dataset(dataset_name)
 
+# Tokenizer
 tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
 
-def compute_metrics(pred):
-    labels = pred.label_ids
-    preds = np.argmax(pred.predictions, axis=1)
-    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
-    acc = accuracy_score(labels, preds)
-    return {
-        'accuracy': acc,
-        'f1': f1,
-        'precision': precision,
-        'recall': recall
-    }
-
 def tokenize_function(examples):
-    # Add any specific preprocessing steps if necessary
     return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
 
-def get_tokenizer():
-    try:
-        return BertTokenizer.from_pretrained('./trained_model')
-    except Exception:
-        return BertTokenizer.from_pretrained('bert-base-uncased')
-
 tokenized_dataset = dataset.map(tokenize_function, batched=True)
 tokenized_dataset = tokenized_dataset.rename_column("original_label_name", "labels")
 tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
 
 train_dataset = tokenized_dataset["train"]
 eval_dataset = tokenized_dataset["validation"]
-model = get_model()
 
-# Make training arguments configurable
+# Model
+model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
+
+# Training Arguments
 training_args = TrainingArguments(
     output_dir="./results",
     num_train_epochs=3,
@@ -62,16 +33,28 @@ training_args = TrainingArguments(
     weight_decay=0.01,
     logging_dir='./logs',
     evaluation_strategy="steps",
-    save_steps=500,  # Save model every 500 steps
+    save_steps=500,
     logging_steps=100,
 )
 
+def compute_metrics(pred):
+    labels = pred.label_ids
+    preds = np.argmax(pred.predictions, axis=-1)
+    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
+    acc = accuracy_score(labels, preds)
+    return {
+        'accuracy': acc,
+        'f1': f1,
+        'precision': precision,
+        'recall': recall
+    }
+
 trainer = Trainer(
     model=model,
     args=training_args,
     train_dataset=train_dataset,
     eval_dataset=eval_dataset,
-    compute_metrics=compute_metrics  # Define this function to compute additional metrics
+    compute_metrics=compute_metrics
 )
 
 trainer.train()
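The removed block computed balanced class weights but then wrote them into model.classifier.weight.data, which overwrites the classifier's learned parameters rather than weighting the loss. If class weighting is wanted back later, the usual pattern is to hand the weights to CrossEntropyLoss inside a Trainer subclass. A minimal sketch, assuming the train_dataset and model built in train.py; WeightedTrainer is a hypothetical name, not part of this commit:

import numpy as np
import torch
from sklearn.utils.class_weight import compute_class_weight
from transformers import Trainer

# Balanced weights over the two classes, as in the removed code
label_array = np.array(train_dataset["labels"])
class_weights = torch.tensor(
    compute_class_weight("balanced", classes=np.unique(label_array), y=label_array),
    dtype=torch.float,
)

class WeightedTrainer(Trainer):  # hypothetical subclass, not in this repo
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        # Weight the cross-entropy loss instead of touching classifier weights
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights.to(outputs.logits.device))
        loss = loss_fct(outputs.logits.view(-1, model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

The Trainer(...) call at the bottom of the script would then be swapped for WeightedTrainer(...) with the same arguments.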
 
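One line the commit leaves untouched on both sides is tokenized_dataset.rename_column("original_label_name", "labels"); "original_label_name" reads like a placeholder, and datasets raises a ValueError when the column to rename does not exist. A quick, hypothetical check (not part of the commit) for confirming the split's real column names first:

from datasets import load_dataset

# Print the schema once before wiring up rename_column; the label column
# must actually be named "original_label_name" for train.py to run.
ds = load_dataset("NicolaiSivesind/human-vs-machine")
print(ds["train"].column_names)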