In [None]:
!pip install -q --upgrade datasets
!pip install -q --upgrade transformers
!pip install -q --upgrade peft
!pip install -q --upgrade trl
!pip install -q bitsandbytes
!pip install -q accelerate
!pip install -q tensorboard
!pip install -q jsonlines
! conda install -y gdown

In [None]:
import itertools
import time
import warnings
from peft import LoraConfig, get_peft_model
from transformers import BertForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
from matplotlib import pyplot as plt
from datasets import load_dataset
import torch
from tqdm import tqdm
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, matthews_corrcoef, roc_auc_score

# Silence every Python warning for the remainder of the run.
warnings.filterwarnings("ignore")

# LoRA target-module name lists. Each list names sub-modules of encoder
# layers 0 and 1, ordered layer by layer.
_encoder_layers = (0, 1)
_attention_parts = (
    "attention.self.query",
    "attention.self.key",
    "attention.self.value",
    "attention.output.dense",
)
_feed_forward_parts = (
    "intermediate.dense",
    "output.dense",
)


def _layer_modules(parts):
    """Return the fully qualified module names of `parts` for each encoder layer."""
    return [
        f"bert.encoder.layer.{layer}.{part}"
        for layer in _encoder_layers
        for part in parts
    ]


# Attention projections followed by feed-forward projections, per layer.
attention_plus_feed_forward = _layer_modules(_attention_parts + _feed_forward_parts)

# Attention projections only.
attention = _layer_modules(_attention_parts)

# Feed-forward projections only.
feed_forward = _layer_modules(_feed_forward_parts)

all_layer_configs = [attention_plus_feed_forward, attention, feed_forward]
# Module-level tokenizer shared by preprocess_function and by the Trainer
# built in train_model.
# NOTE(review): the models below are loaded from a different checkpoint name;
# presumably they share this tokenizer's vocabulary - confirm.
tokenizer = AutoTokenizer.from_pretrained('zhihan1996/DNA_bert_6')

def preprocess_function(examples):
    """Tokenize the sequence column of a batch of examples.

    The column is read from ``examples['sequence']``, falling back to
    ``examples['Sequence']`` when the lowercase key is missing. The text is
    padded and truncated to exactly 512 tokens with the module-level
    ``tokenizer``.

    Args:
        examples: Mapping that holds the raw sequences under ``'sequence'``
            or ``'Sequence'``.

    Returns:
        Whatever ``tokenizer(...)`` returns for the selected column.

    Raises:
        KeyError: If neither ``'sequence'`` nor ``'Sequence'`` is present.
    """
    # Guard only the column lookup. Wrapping the tokenizer call as well would
    # treat a KeyError raised during tokenization as "column missing" and
    # retry with the other column, hiding the real error.
    try:
        sequences = examples['sequence']
    except KeyError:
        sequences = examples['Sequence']

    return tokenizer(
        sequences,
        padding='max_length',
        truncation=True,
        max_length=512,
    )


def add_labels(examples):
    """Copy the class label into a ``'labels'`` entry and return ``examples``.

    The value is taken from ``examples['label']``; when that key is missing,
    ``examples['Label']`` is used instead. The mapping is modified in place
    and the same object is returned.

    Raises:
        KeyError: If neither ``'label'`` nor ``'Label'`` is present.
    """
    try:
        label_value = examples['label']
    except KeyError:
        label_value = examples['Label']
    examples['labels'] = label_value
    return examples

def create_task_dataset(task_name, data_dir='/kaggle/working'):
    """Load the train and test splits for one task from its CSV file.

    The CSV at ``{data_dir}/{task_name}.csv`` is loaded twice with the
    ``datasets`` slicing syntax: rows 0-9999 form the training split and rows
    10000-13121 form the test split. Slices are half-open, so the test split
    starts at 10000; starting it at 10001 would leave row 10000 in neither
    split.

    Args:
        task_name: One of ``'tfbs'``, ``'dnasplice'`` or ``'dnaprom'``.
        data_dir: Directory that contains ``<task_name>.csv``.

    Returns:
        A ``(train_dataset, test_dataset)`` tuple.

    Raises:
        ValueError: If ``task_name`` is not a known task.
    """
    known_tasks = ('tfbs', 'dnasplice', 'dnaprom')
    if task_name not in known_tasks:
        raise ValueError(f"Unknown task: {task_name}")

    csv_path = f"{data_dir}/{task_name}.csv"
    train_split = load_dataset('csv', data_files=csv_path, split='train[0:10000]')
    test_split = load_dataset('csv', data_files=csv_path, split='train[10000:13122]')
    return train_split, test_split

def create_dataset_maps(train_dataset, test_dataset):
    """Tokenize both splits and attach their ``labels`` column.

    Each split is first mapped through ``preprocess_function`` in batched
    mode and then through ``add_labels``. The training split is processed
    before the test split.

    Returns:
        The processed ``(train_dataset, test_dataset)`` pair.
    """
    prepared = []
    for split in (train_dataset, test_dataset):
        tokenized = split.map(preprocess_function, batched=True)
        prepared.append(tokenized.map(add_labels))
    train_ready, test_ready = prepared
    return train_ready, test_ready

def train_model(train_dataset, test_dataset, model, task, model_name, config_name):
    """Fine-tune ``model`` with a ``Trainer`` and evaluate it on the test split.

    Args:
        train_dataset: Dataset passed to the trainer as ``train_dataset``.
        test_dataset: Dataset passed to the trainer as ``eval_dataset``.
        model: Model to train (a plain or PEFT-wrapped classifier).
        task: Task name; used only to build the output directory.
        model_name: Model identifier; used only to build the output directory.
        config_name: Run label; used only to build the output directory.

    Returns:
        A ``(total_time, metrics)`` tuple: the wall-clock seconds spent in
        ``trainer.train()`` and the dict returned by ``trainer.evaluate()``.
    """
    def specificity_score(y_true, y_pred):
        """Return TN / (TN + FP); machine epsilon in the denominator avoids 0/0."""
        true_negatives = np.sum((y_pred == 0) & (y_true == 0))
        false_positives = np.sum((y_pred == 1) & (y_true == 0))
        return true_negatives / (true_negatives + false_positives + np.finfo(float).eps)

    def compute_metrics(eval_pred):
        """Compute binary classification metrics from ``(logits, labels)``."""
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        # ROC-AUC needs a score that ranks examples by how likely class 1 is.
        # With two logits, P(class 1) is a monotone function of the margin
        # (logit_1 - logit_0). The class-1 logit on its own is not, so the
        # margin is used as the score.
        positive_scores = logits[:, 1] - logits[:, 0]

        return {
            'accuracy': accuracy_score(labels, predictions),
            'recall': recall_score(labels, predictions),
            'specificity': specificity_score(labels, predictions),
            'mcc': matthews_corrcoef(labels, predictions),
            'roc_auc': roc_auc_score(labels, positive_scores),
            'precision': precision_score(labels, predictions),
            'f1': f1_score(labels, predictions),
        }

    # Define the training arguments
    training_arguments = TrainingArguments(
        output_dir=f"outputs/{task}/{model_name}_{config_name}",
        num_train_epochs=3,
        fp16=False,
        bf16=False,
        per_device_train_batch_size=20,
        per_device_eval_batch_size=10,
        gradient_accumulation_steps=2,
        gradient_checkpointing=True,
        max_grad_norm=0.3,
        learning_rate=4e-4,
        weight_decay=0.01,
        optim="paged_adamw_32bit",
        lr_scheduler_type="linear",
        max_steps=-1,  # -1 lets num_train_epochs decide the run length
        warmup_ratio=0.03,
        group_by_length=True,
        save_steps=10,
        logging_steps=25,
        dataloader_pin_memory=False,
        report_to='tensorboard',
        gradient_checkpointing_kwargs={'use_reentrant': False}
    )

    # NOTE(review): recent transformers releases appear to prefer a
    # `processing_class` argument over `tokenizer` - confirm against the
    # installed version.
    trainer = Trainer(
        model=model,
        args=training_arguments,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    # Only the training call is timed; evaluation happens afterwards.
    start_time = time.time()
    trainer.train()
    end_time = time.time()

    total_time = end_time - start_time
    metrics = trainer.evaluate()

    return total_time, metrics

def estimate_model_size_lora(model):
    """Estimate the size in MB of the trainable parameters of a PEFT model.

    The trainable-parameter count is the first element returned by
    ``model.get_nb_trainable_parameters()``. Each parameter is counted as a
    32-bit float (4 bytes).
    """
    trainable_count = model.get_nb_trainable_parameters()[0]
    return trainable_count * 4 / (1024 ** 2)  # bytes -> megabytes
def estimate_model_size_base(model):
    """Estimate the size in MB of the parameters that require gradients.

    Iterates ``model.parameters()``, adds up ``numel()`` for every parameter
    whose ``requires_grad`` is true, and counts each element as a 32-bit
    float (4 bytes).
    """
    trainable_count = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_count += param.numel()

    return trainable_count * 4 / (1024 ** 2)  # bytes -> megabytes
# Experiment driver: for every task, fine-tune the full model once, then
# fine-tune one LoRA model per (rank, target-module set) combination.
task_list = ['dnasplice', 'tfbs', 'dnaprom']
log_file = "training_log.txt"
model_name = 'fabihamakhdoomi/TinyDNABERT'
r_values = [4, 8, 16, 32]  # LoRA ranks compared in the ablation study

# One (task, run label, size in MB, eval accuracy) tuple per finished run.
results = []

# Layer configurations paired with the label used in logs and run names.
named_layer_configs = list(
    zip(['attention_plus_feed_forward', 'attention', 'feed_forward'], all_layer_configs)
)

for task in task_list:
    print(f"Running TASK : {task}")
    train_dataset, test_dataset = create_dataset_maps(*create_task_dataset(task))
    for split in (train_dataset, test_dataset):
        split.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

    # Baseline: train the model without any adapter.
    config_name = "base_model"
    base_model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
    print(f"Training MODEL : {config_name} for task : {task}")
    training_time, metrics = train_model(
        train_dataset, test_dataset, base_model, task, model_name, config_name
    )
    model_size_mb = estimate_model_size_base(base_model)
    results.append((task, config_name, model_size_mb, metrics['eval_accuracy']))
    log_line = f"Task: {task}, Model: {model_name}, Config: {config_name}, Training Time: {training_time}, Model Size: {model_size_mb} MB, Metrics: {metrics}\n"
    with open(log_file, "a") as log:
        log.write(log_line)

    # LoRA sweep: ranks vary slowest, layer configurations fastest.
    for r, (config_name, config) in itertools.product(r_values, named_layer_configs):
        # A fresh copy of the pretrained weights for every run.
        base_model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
        peft_config = LoraConfig(
            lora_alpha=16,
            lora_dropout=0.2,
            r=r,
            bias="none",
            task_type="SEQ_CLS",
            target_modules=config,
        )
        model = get_peft_model(base_model, peft_config)
        run_label = f"{config_name}_r{r}"
        print(f"Training MODEL : {config_name} with r={r} for task : {task}")
        training_time, metrics = train_model(
            train_dataset, test_dataset, model, task, model_name, run_label
        )
        model_size_mb = estimate_model_size_lora(model)
        results.append((task, run_label, model_size_mb, metrics['eval_accuracy']))
        log_line = f"Task: {task}, Model: {model_name}, Config: {config_name}, r: {r}, Training Time: {training_time}, Model Size: {model_size_mb} MB, Metrics: {metrics}\n"
        with open(log_file, "a") as log:
            log.write(log_line)

# Results logging and visualization
# Run-wide column views of `results`; each entry is
# (task, run label, size in MB, eval accuracy).
tasks = [result[0] for result in results]
configs = [result[1] for result in results]
model_sizes = [result[2] for result in results]
accuracies = [result[3] for result in results]

# One figure per task: accuracy against trainable-parameter size.
for task in task_list:
    # Sort by size so the line runs left to right along the x-axis. Plotting
    # in run order makes it double back on itself.
    task_results = sorted(
        (result for result in results if result[0] == task),
        key=lambda result: result[2],
    )
    # Per-task names are kept distinct so the run-wide lists are not overwritten.
    task_configs = [result[1] for result in task_results]
    task_sizes = [result[2] for result in task_results]
    task_accuracies = [result[3] for result in task_results]

    plt.figure(figsize=(10, 6))
    plt.plot(task_sizes, task_accuracies, marker='o')
    plt.xlabel('Model Size (MB)')
    plt.ylabel('Accuracy')
    plt.title(f'Model Size vs Accuracy for Task: {task}')
    # Label each point's x position with the run that produced it.
    plt.xticks(task_sizes, task_configs, rotation=45, ha="right")
    plt.tight_layout()
    plt.savefig(f'{task}_model_size_vs_accuracy.png')
    plt.show()
