khulnasoft committed on
Commit
93a1727
·
verified ·
1 Parent(s): 467b421

Create train.py

Browse files
Files changed (1) hide show
  1. train.py +36 -0
train.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# train.py -- fine-tune `bert-base-uncased` with a 2-label sequence
# classification head on the lines of a local text file.

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

# Load your dataset
# NOTE(review): the 'text' loader presumably yields a single 'text' column and
# no label column; a sequence-classification model needs labels to compute a
# loss. Confirm how labels are meant to be supplied before running.
dataset = load_dataset('text', data_files={'train': 'cleaned_data.txt'})

# Preprocess the dataset
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')


def tokenize_function(examples):
    """Tokenize a batch of examples read from its 'text' field.

    Every sequence is padded to the tokenizer's maximum length and longer
    sequences are truncated, so all encoded rows have the same length.
    """
    return tokenizer(examples['text'], padding="max_length", truncation=True)


tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Hold out part of the data for evaluation. Evaluating on the very rows the
# model was trained on only measures memorization; a fixed seed keeps the
# split reproducible between runs.
split_datasets = tokenized_datasets["train"].train_test_split(test_size=0.1, seed=42)

# Load model
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Define training arguments
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_datasets["train"],
    eval_dataset=split_datasets["test"],
)

# Train the model
trainer.train()