# jaynopponep's picture
# Trying to get original
# db17981
# raw
# history blame
# 2.12 kB
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import numpy as np
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
# Load dataset
df = pd.read_csv("AI_Human.csv")

# The rest of the script reads the "text" column (tokenization) and the
# "labels" column (set_format / Trainer). Fail fast if either is absent rather
# than discovering it only after the whole corpus has been tokenized.
_missing_columns = {"text", "labels"} - set(df.columns)
if _missing_columns:
    raise ValueError(
        f"AI_Human.csv is missing required column(s): {sorted(_missing_columns)}"
    )

# Hold out 20% of the rows for evaluation. The seed is fixed so the split, and
# therefore the reported evaluation metrics, are reproducible between runs.
train_df, eval_df = train_test_split(df, test_size=0.2, random_state=42)
# Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the module tokenizer.

    Sequences are padded to, and truncated at, a fixed length of 512 tokens,
    so every encoded example has the same length.

    Args:
        examples: Mapping with a ``"text"`` entry. It is used below with
            ``Dataset.map(..., batched=True)``, so this is presumably a batch
            of rows rather than a single row.

    Returns:
        Whatever the tokenizer call returns for that text.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
    }
    return tokenizer(examples["text"], **encode_options)
# Convert DataFrames to Datasets and apply tokenization
def _to_torch_dataset(frame):
    """Wrap ``frame`` in a Dataset, tokenize it in batches, and select torch output."""
    tokenized = Dataset.from_pandas(frame).map(tokenize_function, batched=True)
    # set_format works in place: restrict the exposed columns to the three
    # listed here and request them in 'torch' format.
    tokenized.set_format(
        type='torch',
        columns=['input_ids', 'attention_mask', 'labels'],
    )
    return tokenized


train_dataset = _to_torch_dataset(train_df)
eval_dataset = _to_torch_dataset(eval_df)
# Model
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,  # two output labels
)

# Training Arguments
_training_options = {
    # Where checkpoints and logs are written.
    "output_dir": "./results",
    "logging_dir": './logs',
    # Length of the run and per-device batch sizes.
    "num_train_epochs": 3,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    # Optimisation schedule.
    "warmup_steps": 500,
    "weight_decay": 0.01,
    # Step-based evaluation, checkpointing and logging cadence.
    # NOTE(review): eval_steps is not set; per the TrainingArguments docs it
    # then falls back to logging_steps -- confirm for the installed version.
    "evaluation_strategy": "steps",
    "save_steps": 500,
    "logging_steps": 100,
}
training_args = TrainingArguments(**_training_options)
def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` and ``label_ids``. The predicted
            class is the argmax of ``predictions`` along its last axis
            (presumably per-class scores for each example -- confirm).

    Returns:
        Dict with keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``. Precision, recall and F1 use ``average='binary'``.
    """
    y_true = pred.label_ids
    y_pred = np.argmax(pred.predictions, axis=-1)

    # The fourth element of the returned tuple (support) is not needed here.
    prf_scores = precision_recall_fscore_support(y_true, y_pred, average='binary')
    accuracy = accuracy_score(y_true, y_pred)

    metrics = {'accuracy': accuracy}
    metrics['f1'] = prf_scores[2]
    metrics['precision'] = prf_scores[0]
    metrics['recall'] = prf_scores[1]
    return metrics
# Wire the model, training arguments, datasets and metric callback together.
_trainer_options = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**_trainer_options)
trainer.train()

# After training, write the model and then the tokenizer to the same directory.
for _artifact in (model, tokenizer):
    _artifact.save_pretrained("./trained_model")