In [None]:
#!pip install "modin[all]" # Install Ray and Dask
# !pip install pytorch 
# !pip install intel-extension-for-pytorch
# !pip install transformers
# !pip install datasets

In [21]:
import modin.pandas as pd
df = pd.read_csv('blooms_taxonomy_dataset.csv')
df

Unnamed: 0,Questions,Category
0,About what proportion of the population of the...,BT1
1,Correctly label the brain lobes indicated on t...,BT1
2,Define compound interest.,BT1
3,Define four types of traceability,BT1
4,Define mercantilism.,BT1
...,...,...
8762,Distinguish between different types of soil st...,BT4
8763,Invent a blockchain-based solution for transpa...,BT6
8764,Compare the advantages and disadvantages of us...,BT4
8765,"Describe the purpose of the ""volatile"" keyword...",BT1


In [22]:
mapping = {"BT1": 0, "BT2": 1, "BT3": 2, "BT4": 3, "BT5": 4, "BT6": 5}
df["Category"] = df["Category"].map(mapping)

In [23]:
df

Unnamed: 0,Questions,Category
0,About what proportion of the population of the...,0
1,Correctly label the brain lobes indicated on t...,0
2,Define compound interest.,0
3,Define four types of traceability,0
4,Define mercantilism.,0
...,...,...
8762,Distinguish between different types of soil st...,3
8763,Invent a blockchain-based solution for transpa...,5
8764,Compare the advantages and disadvantages of us...,3
8765,"Describe the purpose of the ""volatile"" keyword...",0


In [24]:
from transformers import DistilBertTokenizer
import torch
from torch.utils.data import DataLoader
import intel_extension_for_pytorch as ipex
# Load the DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize the 'Questions' column
inputs = tokenizer(list(df['Questions']), padding=True, truncation=True, return_tensors='pt', max_length=2048)
inputs



{'input_ids': tensor([[  101,  2055,  2054,  ...,     0,     0,     0],
        [  101, 11178,  3830,  ...,     0,     0,     0],
        [  101,  9375,  7328,  ...,     0,     0,     0],
        ...,
        [  101, 12826,  1996,  ...,     0,     0,     0],
        [  101,  6235,  1996,  ...,     0,     0,     0],
        [  101,  4863,  1996,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [25]:
inputs['input_ids'].size()

torch.Size([8767, 123])

In [26]:
labels = torch.tensor(df['Category'].values)
labels

tensor([0, 0, 0,  ..., 3, 0, 1])

In [27]:
from transformers import DistilBertForSequenceClassification

# Load the model with a classification head
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=6)  # 6 classes: 0 to 5
optimized_model = ipex.optimize(model, dtype=torch.float32)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
from sklearn.model_selection import train_test_split

# Split the data into training and validation sets
train_inputs, val_inputs, train_labels, val_labels = train_test_split(inputs['input_ids'], labels, test_size=0.2, random_state=42)


In [29]:
from torch.utils.data import DataLoader, TensorDataset

# Create datasets for training and validation
train_dataset = TensorDataset(train_inputs, train_labels)
val_dataset = TensorDataset(val_inputs, val_labels)

# Create DataLoader for both training and validation
train_dataloader = DataLoader(train_dataset, batch_size=20, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=20)


In [44]:
from transformers import AdamW
from torch.optim.lr_scheduler import StepLR

# Set up the optimizer
optimizer = AdamW(optimized_model.parameters(), lr=0.0001)

# Define the training loop
epochs = 1
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
optimized_model.to(device)

print(device)

cpu




In [45]:
for epoch in range(epochs):
    optimized_model.train()
    total_loss = 0
    for batch in train_dataloader:
        input_ids, labels = batch
        input_ids, labels = input_ids.to(device), labels.to(device)

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = optimized_model(input_ids, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Backward pass
        loss.backward()
        optimizer.step()
        print(loss)
    print(f"Epoch {epoch + 1} | Loss: {total_loss / len(train_dataloader)}")

tensor(0.1266, grad_fn=<NllLossBackward0>)
tensor(0.2361, grad_fn=<NllLossBackward0>)
tensor(0.0948, grad_fn=<NllLossBackward0>)
tensor(0.0170, grad_fn=<NllLossBackward0>)
tensor(0.5257, grad_fn=<NllLossBackward0>)
tensor(0.0933, grad_fn=<NllLossBackward0>)
tensor(0.1646, grad_fn=<NllLossBackward0>)
tensor(0.2118, grad_fn=<NllLossBackward0>)
tensor(0.0173, grad_fn=<NllLossBackward0>)
tensor(0.1543, grad_fn=<NllLossBackward0>)
tensor(0.3518, grad_fn=<NllLossBackward0>)
tensor(0.5005, grad_fn=<NllLossBackward0>)
tensor(0.3083, grad_fn=<NllLossBackward0>)
tensor(0.1673, grad_fn=<NllLossBackward0>)
tensor(0.0377, grad_fn=<NllLossBackward0>)
tensor(0.1693, grad_fn=<NllLossBackward0>)
tensor(0.3132, grad_fn=<NllLossBackward0>)
tensor(0.3724, grad_fn=<NllLossBackward0>)
tensor(0.0699, grad_fn=<NllLossBackward0>)
tensor(0.1015, grad_fn=<NllLossBackward0>)
tensor(0.0627, grad_fn=<NllLossBackward0>)
tensor(0.0439, grad_fn=<NllLossBackward0>)
tensor(0.3108, grad_fn=<NllLossBackward0>)
tensor(0.16

In [36]:
optimized_model.eval()
correct_predictions = 0
total_predictions = 0

with torch.no_grad():
    for batch in val_dataloader:
        input_ids, labels = batch
        input_ids, labels = input_ids.to(device), labels.to(device)
        # Forward pass
        outputs = optimized_model(input_ids)
        predictions = torch.argmax(outputs.logits, dim=-1)

        correct_predictions += (predictions == labels).sum().item()
        total_predictions += labels.size(0)

accuracy = correct_predictions / total_predictions
print(f"Validation Accuracy: {accuracy * 100:.2f}%")

Validation Accuracy: 78.96%


In [37]:
def predict(text):
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    input_ids = inputs['input_ids'].to(device)
    
    optimized_model.eval()
    with torch.no_grad():
        outputs = optimized_model(input_ids)
        prediction = torch.argmax(outputs.logits, dim=-1)
    return prediction.item()

# Example prediction
question = "Compare two dog food commercials. What is the difference between them and how do they both sell their products?"
print(predict(question))


3


In [47]:
from torch.nn.functional import softmax

# The mapping of class labels to numeric labels
mapping = {"Remembering": 0, "Understanding": 1, "Applying": 2, "Analyzing": 3, "Evaluating": 4, "Creating": 5}

# Reverse the mapping to get the class name from the index
reverse_mapping = {v: k for k, v in mapping.items()}

def predict(text):
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    input_ids = inputs['input_ids'].to(device)
    
    optimized_model.eval()
    with torch.no_grad():
        # Get the raw logits from the model
        outputs = optimized_model(input_ids)
        logits = outputs.logits
        
        # Apply softmax to get probabilities
        probabilities = softmax(logits, dim=-1)
    
    # Convert probabilities to a list or dictionary of class probabilities
    probabilities = probabilities.squeeze().cpu().numpy()
    
    # Map the probabilities to the class labels using the reverse mapping
    class_probabilities = {reverse_mapping[i]: prob for i, prob in enumerate(probabilities)}
    
    return class_probabilities

# Example prediction
question = "State and explain rules of inference."
class_probabilities = predict(question)

# Display the probabilities for each class label
for class_label, prob in class_probabilities.items():
    print(f"{class_label}: {prob:.4f}")


Remembering: 0.6210
Understanding: 0.2401
Applying: 0.0801
Analyzing: 0.0533
Evaluating: 0.0028
Creating: 0.0026


In [48]:
optimized_model.save_pretrained('./fine_tuned_distilbert')

# Save the tokenizer
tokenizer.save_pretrained('./fine_tuned_distilbert')

('./fine_tuned_distilbert/tokenizer_config.json',
 './fine_tuned_distilbert/special_tokens_map.json',
 './fine_tuned_distilbert/vocab.txt',
 './fine_tuned_distilbert/added_tokens.json')

In [49]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

# Load the saved model
optimized_model = DistilBertForSequenceClassification.from_pretrained('./fine_tuned_distilbert')

# Load the saved tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('./fine_tuned_distilbert')


In [50]:
# Example of using the loaded model for prediction
def predict_with_loaded_model(text):
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    input_ids = inputs['input_ids'].to(device)

    optimized_model.eval()
    with torch.no_grad():
        outputs = optimized_model(input_ids)
        logits = outputs.logits
        probabilities = softmax(logits, dim=-1)
        
    # Map probabilities to class labels
    probabilities = probabilities.squeeze().cpu().numpy()
    class_probabilities = {reverse_mapping[i]: prob for i, prob in enumerate(probabilities)}
    
    return class_probabilities

# Example usage with the saved model
question = "The accuracy of each position in a sequence of GGTACTGAT is 98%, 95%, 97%, 97%, 98%, 99%, 94%, 93%, and 97% respectively.(a) What is the average PHRED quality score of this sequence?"
class_probabilities = predict_with_loaded_model(question)

# Display class probabilities
for class_label, prob in class_probabilities.items():
    print(f"{class_label}: {prob:.4f}")

Remembering: 0.0049
Understanding: 0.0040
Applying: 0.3104
Analyzing: 0.2497
Evaluating: 0.3769
Creating: 0.0542
