jaynopponep committed on
Commit
6b81bf3
·
verified ·
1 Parent(s): 30b6c65

Update train.py

Browse files
Files changed (1) hide show
  1. train.py +11 -7
train.py CHANGED
@@ -1,8 +1,8 @@
1
  from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
2
- from datasets import load_dataset
3
  import numpy as np
4
- import pandas as pd
5
  from datasets import Dataset
 
6
  from sklearn.metrics import accuracy_score, precision_recall_fscore_support
7
 
8
  # Load dataset
@@ -15,12 +15,15 @@ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
15
  def tokenize_function(examples):
16
  return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
17
 
18
- tokenized_dataset = dataset.map(tokenize_function, batched=True)
19
- tokenized_dataset = tokenized_dataset.rename_column("original_label_name", "labels")
20
- tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
21
 
22
- train_dataset = Dataset.from_pandas(train_df).map(tokenize_function, batched=True)
23
- eval_dataset = Dataset.from_pandas(eval_df).map(tokenize_function, batched=True)
 
 
 
24
 
25
  # Model
26
  model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
@@ -62,3 +65,4 @@ trainer = Trainer(
62
  trainer.train()
63
  model.save_pretrained("./trained_model")
64
  tokenizer.save_pretrained("./trained_model")
 
 
1
  from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
 
2
  import numpy as np
3
+ import pandas as pd
4
  from datasets import Dataset
5
+ from sklearn.model_selection import train_test_split
6
  from sklearn.metrics import accuracy_score, precision_recall_fscore_support
7
 
8
  # Load dataset
 
15
  def tokenize_function(examples):
16
  return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
17
 
18
+ # Convert DataFrames to Datasets and apply tokenization
19
+ train_dataset = Dataset.from_pandas(train_df)
20
+ eval_dataset = Dataset.from_pandas(eval_df)
21
 
22
+ train_dataset = train_dataset.map(tokenize_function, batched=True)
23
+ train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
24
+
25
+ eval_dataset = eval_dataset.map(tokenize_function, batched=True)
26
+ eval_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
27
 
28
  # Model
29
  model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
 
65
  trainer.train()
66
  model.save_pretrained("./trained_model")
67
  tokenizer.save_pretrained("./trained_model")
68
+