In [4]:
import torch
import pandas as pd
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader, TensorDataset
!pip install -U FlagEmbedding
from FlagEmbedding import BGEM3FlagModel
import torch.nn as nn
# Text encoder that turns titles into dense vectors. It is bound to its own
# name because `model` is rebound to the classifier later in this cell, which
# would otherwise make the encoder unreachable once the cell has run.
embedding_model = BGEM3FlagModel('BAAI/bge-m3')


# Define paths
test_data_path = "/home/jovyan/work/test_data_random_subset.csv"
model_weights_path = "/home/jovyan/work/model_weights/final_model.pth"

# The CSV is read for two columns: 'title' (text to embed) and 'labels' (targets).
data = pd.read_csv(test_data_path)
titles = data['title'].tolist()
labels = data['labels'].tolist()

batch_size = 32
embeddings = []

# Encode in explicit chunks so progress can be reported after every chunk.
print('Encoding titles...')
for i in range(0, len(titles), batch_size):
    batch = titles[i:i + batch_size]
    batch_embeddings = embedding_model.encode(batch, batch_size=batch_size, max_length=512)['dense_vecs']
    embeddings.extend(batch_embeddings)
    print(f"Processed {i + len(batch)}/{len(titles)} titles")

# Every title must have produced exactly one vector before labels are attached.
assert len(embeddings) == len(labels), "embedding/label count mismatch"

# One row per title: embedding dimensions first, the label as the last column.
embeddings_df = pd.DataFrame(embeddings)
embeddings_df['label'] = labels

# torch.tensor with an explicit dtype replaces the legacy FloatTensor constructor.
X_test = torch.tensor(embeddings_df.iloc[:, :-1].values, dtype=torch.float32)  # Features
y_test = torch.tensor(embeddings_df.iloc[:, -1].values, dtype=torch.float32).view(-1, 1)  # Labels

# Create DataLoader for the test dataset (order preserved; same batch size as encoding)
test_dataset = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Define the model architecture
class EmbeddingMLPClassifier(nn.Module):
    """Feed-forward binary classifier over fixed-size embedding vectors.

    Each hidden stage is ``Linear -> BatchNorm1d -> ReLU -> Dropout``; the
    stack ends with ``Linear(..., 1)`` followed by ``Sigmoid``, so ``forward``
    returns one value in [0, 1] per input row.

    The layers are stored in ``self.model`` (an ``nn.Sequential``). Their order
    determines the ``state_dict`` key names (``model.0.weight``, ...), so it
    must not change if previously saved weights are to be loaded.

    Args:
        input_dim: Size of each input embedding vector.
        hidden_layers: Widths of the hidden stages, in order. Any iterable of
            ints is accepted; the default is an immutable tuple so it cannot
            be shared and mutated between instances.
        dropout_rate: Dropout probability applied after every hidden stage.
        device: Device the module is moved to. Falls back to CPU when omitted.
    """

    def __init__(self,
                 input_dim=1024,
                 hidden_layers=(512, 256, 128),
                 dropout_rate=0.2,
                 device=None):
        super().__init__()
        # CPU is only the fallback; an explicitly passed device is honoured.
        self.device = device or torch.device('cpu')

        layers = []
        prev_dim = input_dim
        for hidden_dim in hidden_layers:
            layers.extend([
                nn.Linear(prev_dim, hidden_dim),
                nn.BatchNorm1d(hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout_rate)
            ])
            prev_dim = hidden_dim

        # Single-logit head squashed to a probability-like score.
        layers.append(nn.Linear(prev_dim, 1))
        layers.append(nn.Sigmoid())

        self.model = nn.Sequential(*layers)
        self.to(self.device)

    def forward(self, x):
        """Return the sigmoid score for each row of ``x`` (one output column)."""
        return self.model(x)

# Load the saved model
device = torch.device('cpu')  # Evaluation is run on CPU
model = EmbeddingMLPClassifier(input_dim=X_test.shape[1], device=device)
# NOTE(review): torch.load unpickles the checkpoint, so only load files from a
# trusted source (consider weights_only=True where the installed PyTorch supports it).
model.load_state_dict(torch.load(model_weights_path, map_location=device))  # Map weights to CPU
model.eval()  # disable dropout and use BatchNorm running statistics

# Evaluate the model on the test set
test_preds, test_labels = [], []

with torch.no_grad():
    for batch_X, batch_y in test_loader:
        batch_X = batch_X.to(device)  # Ensure data is on the evaluation device
        batch_y = batch_y.to(device)

        outputs = model(batch_X)
        # Scores strictly above 0.5 are counted as the positive class.
        preds = (outputs > 0.5).float()

        # Flatten the (batch, 1) columns so the metric receives plain 1-D
        # sequences instead of lists of one-element arrays.
        test_preds.extend(preds.view(-1).cpu().tolist())
        test_labels.extend(batch_y.view(-1).cpu().tolist())

# Calculate accuracy
test_accuracy = accuracy_score(test_labels, test_preds)
print(f"Test Accuracy: {test_accuracy:.4f}")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
[0m

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

Encoding titles...


You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Processed 20/20 titles
Test Accuracy: 0.9000
