from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import numpy as np
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Load the dataset and hold out 20% for evaluation. AI_Human.csv is assumed
# to have a "text" column and a "generated" label column (0 = human,
# 1 = AI-generated); adjust the names below if yours differ.
df = pd.read_csv("AI_Human.csv")

# The Trainer expects an integer "labels" column, so rename the label column
# and cast it (the raw labels are often stored as floats).
df = df.rename(columns={"generated": "labels"})
df["labels"] = df["labels"].astype(int)

# Stratify so both splits keep the same class balance; fix the seed for reproducibility.
train_df, eval_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["labels"])
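
# Optional sanity check (assumes the "labels" mapping above): print the class
# balance so a heavily skewed dataset is caught before training.
print(df["labels"].value_counts(normalize=True))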

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Pad/truncate every example to BERT's 512-token maximum so batches have a
# fixed shape.
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

# Convert the pandas splits to Hugging Face Datasets. from_pandas would keep
# the original DataFrame index as an extra column, so drop it.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
eval_dataset = Dataset.from_pandas(eval_df, preserve_index=False)

# Tokenize in batches, then expose only the tensors the model consumes.
train_dataset = train_dataset.map(tokenize_function, batched=True)
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

eval_dataset = eval_dataset.map(tokenize_function, batched=True)
eval_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Binary classification head on top of pretrained BERT
# (label 0 = human-written, 1 = AI-generated).
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Evaluate on a step schedule and checkpoint every 500 steps. Note that newer
# transformers releases rename evaluation_strategy to eval_strategy.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    evaluation_strategy="steps",
    save_steps=500,
    logging_steps=100,
)

# Report accuracy plus binary precision/recall/F1 on the evaluation split.
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

# Fine-tune, then save the model and tokenizer together so the directory can
# be reloaded with from_pretrained.
trainer.train()
model.save_pretrained("./trained_model")
tokenizer.save_pretrained("./trained_model")
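
# A minimal inference sketch, assuming the ./trained_model directory saved
# above; the sample passage is purely illustrative.
import torch

loaded_tokenizer = BertTokenizer.from_pretrained("./trained_model")
loaded_model = BertForSequenceClassification.from_pretrained("./trained_model")
loaded_model.eval()

sample = "An example passage whose origin we want to classify."
inputs = loaded_tokenizer(sample, return_tensors="pt", truncation=True, max_length=512)
with torch.no_grad():
    logits = loaded_model(**inputs).logits
predicted = logits.argmax(dim=-1).item()
print("AI-generated" if predicted == 1 else "Human-written")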