jaynopponep committed on
Commit
d4939c3
·
1 Parent(s): 608b701

Changing train.py

Browse files
Files changed (3) hide show
  1. .idea/.name +1 -1
  2. __pycache__/model.cpython-312.pyc +0 -0
  3. train.py +16 -10
.idea/.name CHANGED
@@ -1 +1 @@
1
- model.py
 
1
+ train.py
__pycache__/model.cpython-312.pyc ADDED
Binary file (1.01 kB). View file
 
train.py CHANGED
@@ -3,27 +3,31 @@ import torch
3
  from transformers import BertTokenizer, Trainer, TrainingArguments
4
  from datasets import load_dataset
5
 
6
- # If the dataset is gated/private, make sure you have run huggingface-cli login
7
- dataset = load_dataset("NicolaiSivesind/human-vs-machine")
8
- # Tokenizer and Model Initialization
9
- tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
10
 
 
11
 
12
  def tokenize_function(examples):
 
13
  return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
14
 
15
-
16
  def get_tokenizer():
17
- return BertTokenizer.from_pretrained('./trained_model')
18
-
 
 
19
 
20
  tokenized_dataset = dataset.map(tokenize_function, batched=True)
21
  tokenized_dataset = tokenized_dataset.rename_column("original_label_name", "labels")
22
  tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
 
23
  train_dataset = tokenized_dataset["train"]
24
  eval_dataset = tokenized_dataset["validation"]
25
  model = get_model()
26
 
 
27
  training_args = TrainingArguments(
28
  output_dir="./results",
29
  num_train_epochs=3,
@@ -32,17 +36,19 @@ training_args = TrainingArguments(
32
  warmup_steps=500,
33
  weight_decay=0.01,
34
  logging_dir='./logs',
35
- evaluation_strategy="steps"
 
 
36
  )
37
 
38
  trainer = Trainer(
39
  model=model,
40
  args=training_args,
41
  train_dataset=train_dataset,
42
- eval_dataset=eval_dataset
 
43
  )
44
 
45
  trainer.train()
46
  model.save_pretrained("./trained_model")
47
  tokenizer.save_pretrained("./trained_model")
48
-
 
3
  from transformers import BertTokenizer, Trainer, TrainingArguments
4
  from datasets import load_dataset
5
 
6
+ # Load dataset dynamically or from a config
7
+ dataset_name = "NicolaiSivesind/human-vs-machine"
8
+ dataset = load_dataset(dataset_name)
 
9
 
10
+ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
11
 
12
  def tokenize_function(examples):
13
+ # Add any specific preprocessing steps if necessary
14
  return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
15
 
 
16
  def get_tokenizer():
17
+ try:
18
+ return BertTokenizer.from_pretrained('./trained_model')
19
+ except Exception:
20
+ return BertTokenizer.from_pretrained('bert-base-uncased')
21
 
22
  tokenized_dataset = dataset.map(tokenize_function, batched=True)
23
  tokenized_dataset = tokenized_dataset.rename_column("original_label_name", "labels")
24
  tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
25
+
26
  train_dataset = tokenized_dataset["train"]
27
  eval_dataset = tokenized_dataset["validation"]
28
  model = get_model()
29
 
30
+ # Make training arguments configurable
31
  training_args = TrainingArguments(
32
  output_dir="./results",
33
  num_train_epochs=3,
 
36
  warmup_steps=500,
37
  weight_decay=0.01,
38
  logging_dir='./logs',
39
+ evaluation_strategy="steps",
40
+ save_steps=500, # Save model every 500 steps
41
+ logging_steps=100,
42
  )
43
 
44
  trainer = Trainer(
45
  model=model,
46
  args=training_args,
47
  train_dataset=train_dataset,
48
+ eval_dataset=eval_dataset,
49
+ compute_metrics=compute_metrics # Define this function to compute additional metrics
50
  )
51
 
52
  trainer.train()
53
  model.save_pretrained("./trained_model")
54
  tokenizer.save_pretrained("./trained_model")