class LabelSmoothingBCELoss(nn.Module):
    """Binary cross-entropy on probabilities with symmetric label smoothing.

    Each target ``t`` is replaced by ``t * (1 - smoothing) + 0.5 * smoothing``
    before the loss is computed, so hard 0/1 labels are pulled towards 0.5.

    Args:
        smoothing (float): Amount of label smoothing to apply.
    """

    def __init__(self, smoothing=0.1):
        super().__init__()
        self.smoothing = smoothing

    def forward(self, predictions, targets):
        """Return the label-smoothed binary cross-entropy.

        Args:
            predictions (torch.Tensor): Probabilities (the classifiers in this
                file end in a Sigmoid), same shape as ``targets``.
            targets (torch.Tensor): Float binary labels.

        Returns:
            torch.Tensor: Scalar loss (mean reduction, the
            ``binary_cross_entropy`` default).
        """
        smooth_targets = targets * (1 - self.smoothing) + 0.5 * self.smoothing
        return nn.functional.binary_cross_entropy(predictions, smooth_targets)


class EarlyStoppingCallback:
    """Track validation loss and signal when it has stopped improving.

    Args:
        patience (int): Number of consecutive non-improving calls tolerated
            before stopping.
        min_delta (float): Minimum decrease in loss that counts as an
            improvement.
    """

    def __init__(self, patience=5, min_delta=0.001):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = float('inf')
        self.early_stop = False
        # Deep copy of the state_dict at the best loss seen so far.
        self.best_model_state = None

    def __call__(self, val_loss, model):
        """Record one validation result.

        Args:
            val_loss (float): Current validation loss.
            model (nn.Module): Model whose state is snapshotted on improvement.

        Returns:
            bool: True once ``patience`` non-improving calls have accumulated.
        """
        if val_loss < self.best_loss - self.min_delta:
            self.best_loss = val_loss
            self.counter = 0
            # Deep copy: state_dict() returns references to the live tensors.
            self.best_model_state = copy.deepcopy(model.state_dict())
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True

        return self.early_stop


class EnsembleMLPClassifier(nn.Module):
    """Gated MLP binary classifier operating on fixed-size embeddings.

    The input is multiplied element-wise by a bias-free linear projection of
    itself (the "gate"), then passed through Linear -> BatchNorm1d ->
    activation -> Dropout stacks and a final Linear + Sigmoid.

    Args:
        input_dim (int): Size of the input embedding.
        hidden_layers (list[int] | None): Hidden widths; defaults to
            ``[512, 256, 128]``.
        dropout_rate (float): Dropout probability after each hidden layer.
        activation (nn.Module): Activation module reused after every hidden
            layer.
        device (torch.device | None): Target device; CUDA if available when
            omitted.
    """

    def __init__(self,
                 input_dim=1024,  # BGE embedding dimension
                 hidden_layers=None,
                 dropout_rate=0.2,
                 activation=nn.ReLU(),
                 device=None):
        super().__init__()

        if hidden_layers is None:
            hidden_layers = [512, 256, 128]

        self.device = device or torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Keep the construction parameters for later inspection.
        self.input_dim = input_dim
        self.hidden_layers = hidden_layers
        self.dropout_rate = dropout_rate
        self.activation = activation

        self.gate = nn.Linear(input_dim, input_dim, bias=False)

        layers = []
        prev_dim = input_dim
        for hidden_dim in hidden_layers:
            layers.extend([
                nn.Linear(prev_dim, hidden_dim),
                nn.BatchNorm1d(hidden_dim),
                activation,
                nn.Dropout(dropout_rate),
            ])
            prev_dim = hidden_dim

        # Single-probability head for binary classification.
        layers.append(nn.Linear(prev_dim, 1))
        layers.append(nn.Sigmoid())

        self.model = nn.Sequential(*layers)
        self.to(self.device)

    def forward(self, x):
        """Apply the multiplicative gate, then the MLP; returns (batch, 1) probabilities."""
        x = self.gate(x) * x
        return self.model(x)


class EnsembleClassifier:
    """Weighted ensemble of differently configured ``EnsembleMLPClassifier`` models.

    Args:
        num_models (int): Number of ensemble members to build.
        label_smoothing (float): Smoothing passed to ``LabelSmoothingBCELoss``
            during training.
    """

    def __init__(self, num_models=5, label_smoothing=0.1):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.label_smoothing = label_smoothing
        self.model_weights = None  # set by compute_test_weights / load_models
        self.models = self._create_diverse_models(num_models)

    @staticmethod
    def _make_initializer(init_method):
        """Build an ``nn.Module.apply`` callback that initialises Linear layers."""
        def init_weights(module):
            if isinstance(module, nn.Linear):
                init_method(module.weight)
                if module.bias is not None:
                    nn.init.zeros_(module.bias)
        return init_weights

    def _create_diverse_models(self, num_models):
        """Build ``num_models`` classifiers, cycling through fixed configurations.

        Architecture, weight-init scheme and the attached ``optimizer_fn`` are
        each picked by ``i % len(options)`` so the ensemble is the same on
        every run.
        """
        # Activation *classes*: each model gets its own module instance.
        architectures = [
            {'hidden_layers': [512, 256, 128], 'dropout_rate': 0.2, 'activation': nn.ReLU},
            {'hidden_layers': [1024, 512], 'dropout_rate': 0.3, 'activation': nn.LeakyReLU},
            {'hidden_layers': [256, 128, 64], 'dropout_rate': 0.1, 'activation': nn.GELU},
            {'hidden_layers': [512, 128], 'dropout_rate': 0.25, 'activation': nn.SELU},
            {'hidden_layers': [256, 128], 'dropout_rate': 0.15, 'activation': nn.Tanh},
        ]
        optimizers = [optim.Adam, optim.AdamW, optim.SGD]
        init_methods = [
            nn.init.xavier_uniform_,
            nn.init.kaiming_normal_,
            nn.init.orthogonal_,
        ]

        models = []
        for i in range(num_models):
            config = architectures[i % len(architectures)]

            model = EnsembleMLPClassifier(
                input_dim=1024,
                hidden_layers=list(config['hidden_layers']),
                dropout_rate=config['dropout_rate'],
                activation=config['activation'](),
            )

            # Only the MLP stack is re-initialised; the gate keeps its default init.
            model.model.apply(self._make_initializer(init_methods[i % len(init_methods)]))

            # NOTE(review): these two attributes are attached for callers, but
            # train() below always uses AdamW with its default weight decay.
            model.optimizer_fn = optimizers[i % len(optimizers)]
            model.regularization = {'weight_decay': 1e-5}

            models.append(model)

        return models

    def train(self, train_dataset, batch_size=32, num_epochs=20):
        """Train every ensemble member on its own random 80/20 split.

        Each model is trained with AdamW (lr=1e-3), cosine annealing, the
        label-smoothed BCE loss and early stopping (patience 4). The weights
        with the best validation loss are restored afterwards, whether or not
        early stopping fired.

        Args:
            train_dataset: Dataset yielding ``(inputs, labels)`` pairs.
            batch_size (int): Mini-batch size for both loaders.
            num_epochs (int): Maximum number of epochs per model.

        Raises:
            ValueError: If the dataset is too small to produce a non-empty
                training and validation subset.
        """
        total_size = len(train_dataset)
        train_size = int(0.8 * total_size)
        val_size = total_size - train_size
        if train_size == 0 or val_size == 0:
            raise ValueError(
                "train_dataset is too small for an 80/20 train/validation split"
            )

        for model_idx, model in enumerate(tqdm(self.models, desc="Training Models", position=0)):
            print(f"Starting training for Model {model_idx + 1}/{len(self.models)}")

            # A fresh random split per model adds diversity to the ensemble.
            train_subset, val_subset = random_split(train_dataset, [train_size, val_size])
            train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
            val_loader = DataLoader(val_subset, batch_size=batch_size, shuffle=False)

            optimizer = optim.AdamW(model.parameters(), lr=1e-3)
            scheduler = optim.lr_scheduler.CosineAnnealingLR(
                optimizer,
                T_max=num_epochs,
                eta_min=1e-5,
            )
            criterion = LabelSmoothingBCELoss(smoothing=self.label_smoothing)
            early_stopping = EarlyStoppingCallback(patience=4, min_delta=0.001)

            epoch_progress = tqdm(range(num_epochs), desc=f"Model {model_idx} Training", position=1, leave=False)

            for epoch in epoch_progress:
                # Training phase
                model.train()
                total_loss = 0
                for inputs, labels in train_loader:
                    inputs, labels = inputs.to(model.device), labels.to(model.device)

                    optimizer.zero_grad()
                    outputs = model(inputs)
                    # Labels arrive as (batch,); the model emits (batch, 1).
                    loss = criterion(outputs, labels.float().unsqueeze(1))
                    loss.backward()
                    optimizer.step()

                    total_loss += loss.item()
                avg_train_loss = total_loss / len(train_loader)

                # Validation phase
                model.eval()
                val_loss = 0
                with torch.no_grad():
                    for val_inputs, val_labels in val_loader:
                        val_inputs, val_labels = val_inputs.to(model.device), val_labels.to(model.device)
                        val_outputs = model(val_inputs)
                        val_loss += criterion(val_outputs, val_labels.float().unsqueeze(1)).item()
                avg_val_loss = val_loss / len(val_loader)

                epoch_progress.set_postfix({
                    'train_loss': avg_train_loss,
                    'val_loss': avg_val_loss
                })

                if early_stopping(avg_val_loss, model):
                    print(f"Early stopping triggered for Model {model_idx}")
                    break

                scheduler.step()

            # Restore the best checkpoint even when all epochs were used;
            # otherwise the last (possibly worse) epoch would be kept.
            if early_stopping.best_model_state is not None:
                model.load_state_dict(early_stopping.best_model_state)
            model.eval()

    def compute_test_weights(self, test_loader):
        """Derive ensemble voting weights from each model's accuracy.

        Accuracies are squared to exaggerate differences, blended 90/10 with
        their mean, and normalised to sum to 1. The result is stored in
        ``self.model_weights``.

        NOTE(review): weighting on the same data that is later evaluated leaks
        label information into the ensemble; a held-out validation loader is
        preferable.

        Args:
            test_loader: Loader yielding ``(inputs, labels)`` batches.

        Raises:
            ValueError: If the loader yields no samples.
        """
        model_accuracies = []
        for model in self.models:
            correct = 0
            total = 0
            model.eval()
            with torch.no_grad():
                for inputs, labels in test_loader:
                    inputs, labels = inputs.to(model.device), labels.to(model.device)
                    preds = (model(inputs) > 0.5).float().view(-1)
                    # Flatten both sides: comparing (B, 1) with (B,) would
                    # broadcast to (B, B) and inflate the count.
                    correct += (preds == labels.view(-1).to(preds.dtype)).sum().item()
                    total += labels.size(0)
            if total == 0:
                raise ValueError("test_loader yielded no samples")
            model_accuracies.append(correct / total)

        accuracies = np.array(model_accuracies)
        print(f"Raw model accuracies: {accuracies}")

        # Squaring exaggerates differences between models.
        power_scaling_factor = 2
        scaled_accuracies = accuracies ** power_scaling_factor

        # Blend with the mean to avoid over-reliance on any single model.
        smoothed_accuracies = scaled_accuracies * (1 - 0.1) + 0.1 * np.mean(scaled_accuracies)

        weight_sum = smoothed_accuracies.sum()
        if weight_sum > 0:
            weights = smoothed_accuracies / weight_sum
        else:
            # Every model scored 0: fall back to a uniform vote instead of NaNs.
            weights = np.full(len(accuracies), 1.0 / len(accuracies))

        self.model_weights = torch.tensor(weights, dtype=torch.float32).to(self.device)
        print(f"Model weights after scaling: {self.model_weights}")

    def predict(self, test_loader, confidence_threshold=0.5, return_raw_scores=True):
        """Predict with the weighted average of all members' probabilities.

        Args:
            test_loader: Loader yielding ``(inputs, labels)`` batches; labels
                are ignored.
            confidence_threshold (float): Weighted probability above which the
                positive class is predicted.
            return_raw_scores (bool): Also return the weighted probabilities.

        Returns:
            ``final_preds`` (float tensor of 0/1), or the tuple
            ``(final_preds, weighted_probabilities_as_numpy)`` when
            ``return_raw_scores`` is True.

        Raises:
            ValueError: If model weights have not been computed or loaded.
        """
        if self.model_weights is None:
            raise ValueError("Model weights not computed. Call compute_test_weights first.")

        all_predictions = []
        for model in self.models:
            model.eval()
            model_preds = []
            with torch.no_grad():
                for inputs, _ in test_loader:
                    model_preds.append(model(inputs.to(model.device)))
            all_predictions.append(torch.cat(model_preds))

        # (N, num_models, 1) -> (N, num_models), then weighted sum over models.
        stacked_preds = torch.stack(all_predictions, dim=1).squeeze(-1)
        weighted_preds = (stacked_preds * self.model_weights.view(1, -1)).sum(dim=1)

        final_preds = (weighted_preds > confidence_threshold).float()

        if return_raw_scores:
            return final_preds, weighted_preds.cpu().numpy()

        return final_preds

    def save_models(self, save_dir='ensemble_models/model_test_4'):
        """Write all member state_dicts and the voting weights to one checkpoint.

        The file is ``<save_dir>/ensemble_checkpoint.pth``; the directory is
        created if needed.
        """
        os.makedirs(save_dir, exist_ok=True)

        save_data = {
            'models': {},
            # Stored as a plain CPU tensor so the checkpoint holds only
            # tensors and containers.
            'model_weights': self.model_weights.detach().cpu() if self.model_weights is not None else None
        }

        for i, model in tqdm(enumerate(self.models), desc="Saving Models", total=len(self.models)):
            save_data['models'][i] = model.state_dict()

        torch.save(save_data, os.path.join(save_dir, 'ensemble_checkpoint.pth'))

    def load_models(self, save_dir='ensemble_models/model_test_4'):
        """Load member state_dicts and voting weights written by ``save_models``.

        The ensemble must have been constructed with the same ``num_models``
        so the architectures line up with the stored state_dicts.
        """
        checkpoint_path = os.path.join(save_dir, 'ensemble_checkpoint.pth')

        # map_location lets a GPU-saved checkpoint load on a CPU-only host.
        # NOTE: torch.load unpickles the file; only load trusted checkpoints.
        save_data = torch.load(checkpoint_path, map_location=self.device)

        for i, model in tqdm(enumerate(self.models), desc="Loading Models", total=len(self.models)):
            model.load_state_dict(save_data['models'][i])
            model.eval()

        if save_data['model_weights'] is not None:
            # as_tensor accepts both the tensor format and older numpy-array checkpoints.
            self.model_weights = torch.as_tensor(save_data['model_weights'], dtype=torch.float32).to(self.device)

    def evaluate(self, test_loader):
        """Compute accuracy, precision and recall of the weighted ensemble.

        Labels and predictions are gathered in separate passes over
        ``test_loader``, so the loader must not shuffle.

        Returns:
            dict: ``{"accuracy", "precision", "recall"}`` for the positive
            class (label 1); precision/recall are 0 when undefined.
        """
        all_labels = torch.cat([labels for _, labels in test_loader], dim=0)
        all_labels = all_labels.cpu().numpy().ravel()

        test_preds, _raw_probs = self.predict(test_loader, return_raw_scores=True)
        test_preds = test_preds.cpu().numpy().ravel()

        accuracy = np.mean(test_preds == all_labels)

        # Binary precision/recall from confusion counts, computed with numpy
        # so no metric functions beyond this cell's imports are needed.
        predicted_pos = test_preds == 1
        actual_pos = all_labels == 1
        true_pos = np.sum(predicted_pos & actual_pos)
        n_predicted_pos = np.sum(predicted_pos)
        n_actual_pos = np.sum(actual_pos)
        precision = float(true_pos / n_predicted_pos) if n_predicted_pos > 0 else 0.0
        recall = float(true_pos / n_actual_pos) if n_actual_pos > 0 else 0.0

        return {
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall
        }
"Requirement already satisfied: transformers==4.44.2 in /opt/conda/lib/python3.11/site-packages (from FlagEmbedding) (4.44.2)\n", "Requirement already satisfied: datasets==2.19.0 in /opt/conda/lib/python3.11/site-packages (from FlagEmbedding) (2.19.0)\n", "Requirement already satisfied: accelerate>=0.20.1 in /opt/conda/lib/python3.11/site-packages (from FlagEmbedding) (1.2.0)\n", "Requirement already satisfied: sentence-transformers in /opt/conda/lib/python3.11/site-packages (from FlagEmbedding) (3.3.1)\n", "Requirement already satisfied: peft in /opt/conda/lib/python3.11/site-packages (from FlagEmbedding) (0.14.0)\n", "Requirement already satisfied: ir-datasets in /opt/conda/lib/python3.11/site-packages (from FlagEmbedding) (0.5.9)\n", "Requirement already satisfied: sentencepiece in /opt/conda/lib/python3.11/site-packages (from FlagEmbedding) (0.2.0)\n", "Requirement already satisfied: protobuf in /opt/conda/lib/python3.11/site-packages (from FlagEmbedding) (4.25.3)\n", "Requirement already satisfied: filelock in /opt/conda/lib/python3.11/site-packages (from datasets==2.19.0->FlagEmbedding) (3.9.0)\n", "Requirement already satisfied: numpy>=1.17 in /opt/conda/lib/python3.11/site-packages (from datasets==2.19.0->FlagEmbedding) (1.26.4)\n", "Requirement already satisfied: pyarrow>=12.0.0 in /opt/conda/lib/python3.11/site-packages (from datasets==2.19.0->FlagEmbedding) (15.0.2)\n", "Requirement already satisfied: pyarrow-hotfix in /opt/conda/lib/python3.11/site-packages (from datasets==2.19.0->FlagEmbedding) (0.6)\n", "Requirement already satisfied: dill<0.3.9,>=0.3.0 in /opt/conda/lib/python3.11/site-packages (from datasets==2.19.0->FlagEmbedding) (0.3.8)\n", "Requirement already satisfied: pandas in /opt/conda/lib/python3.11/site-packages (from datasets==2.19.0->FlagEmbedding) (2.2.2)\n", "Requirement already satisfied: requests>=2.19.0 in /opt/conda/lib/python3.11/site-packages (from datasets==2.19.0->FlagEmbedding) (2.31.0)\n", "Requirement already satisfied: 
tqdm>=4.62.1 in /opt/conda/lib/python3.11/site-packages (from datasets==2.19.0->FlagEmbedding) (4.66.2)\n", "Requirement already satisfied: xxhash in /opt/conda/lib/python3.11/site-packages (from datasets==2.19.0->FlagEmbedding) (3.5.0)\n", "Requirement already satisfied: multiprocess in /opt/conda/lib/python3.11/site-packages (from datasets==2.19.0->FlagEmbedding) (0.70.16)\n", "Requirement already satisfied: fsspec<=2024.3.1,>=2023.1.0 in /opt/conda/lib/python3.11/site-packages (from fsspec[http]<=2024.3.1,>=2023.1.0->datasets==2.19.0->FlagEmbedding) (2024.3.1)\n", "Requirement already satisfied: aiohttp in /opt/conda/lib/python3.11/site-packages (from datasets==2.19.0->FlagEmbedding) (3.11.10)\n", "Requirement already satisfied: huggingface-hub>=0.21.2 in /opt/conda/lib/python3.11/site-packages (from datasets==2.19.0->FlagEmbedding) (0.26.5)\n", "Requirement already satisfied: packaging in /opt/conda/lib/python3.11/site-packages (from datasets==2.19.0->FlagEmbedding) (24.0)\n", "Requirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.11/site-packages (from datasets==2.19.0->FlagEmbedding) (6.0.1)\n", "Requirement already satisfied: regex!=2019.12.17 in /opt/conda/lib/python3.11/site-packages (from transformers==4.44.2->FlagEmbedding) (2024.11.6)\n", "Requirement already satisfied: safetensors>=0.4.1 in /opt/conda/lib/python3.11/site-packages (from transformers==4.44.2->FlagEmbedding) (0.4.5)\n", "Requirement already satisfied: tokenizers<0.20,>=0.19 in /opt/conda/lib/python3.11/site-packages (from transformers==4.44.2->FlagEmbedding) (0.19.1)\n", "Requirement already satisfied: psutil in /opt/conda/lib/python3.11/site-packages (from accelerate>=0.20.1->FlagEmbedding) (5.9.8)\n", "Requirement already satisfied: typing-extensions>=4.8.0 in /opt/conda/lib/python3.11/site-packages (from torch>=1.6.0->FlagEmbedding) (4.11.0)\n", "Requirement already satisfied: sympy in /opt/conda/lib/python3.11/site-packages (from torch>=1.6.0->FlagEmbedding) (1.12)\n", 
"Requirement already satisfied: networkx in /opt/conda/lib/python3.11/site-packages (from torch>=1.6.0->FlagEmbedding) (3.3)\n", "Requirement already satisfied: jinja2 in /opt/conda/lib/python3.11/site-packages (from torch>=1.6.0->FlagEmbedding) (3.1.3)\n", "Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.1.105 in /opt/conda/lib/python3.11/site-packages (from torch>=1.6.0->FlagEmbedding) (12.1.105)\n", "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.1.105 in /opt/conda/lib/python3.11/site-packages (from torch>=1.6.0->FlagEmbedding) (12.1.105)\n", "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.1.105 in /opt/conda/lib/python3.11/site-packages (from torch>=1.6.0->FlagEmbedding) (12.1.105)\n", "Requirement already satisfied: nvidia-cudnn-cu12==8.9.2.26 in /opt/conda/lib/python3.11/site-packages (from torch>=1.6.0->FlagEmbedding) (8.9.2.26)\n", "Requirement already satisfied: nvidia-cublas-cu12==12.1.3.1 in /opt/conda/lib/python3.11/site-packages (from torch>=1.6.0->FlagEmbedding) (12.1.3.1)\n", "Requirement already satisfied: nvidia-cufft-cu12==11.0.2.54 in /opt/conda/lib/python3.11/site-packages (from torch>=1.6.0->FlagEmbedding) (11.0.2.54)\n", "Requirement already satisfied: nvidia-curand-cu12==10.3.2.106 in /opt/conda/lib/python3.11/site-packages (from torch>=1.6.0->FlagEmbedding) (10.3.2.106)\n", "Requirement already satisfied: nvidia-cusolver-cu12==11.4.5.107 in /opt/conda/lib/python3.11/site-packages (from torch>=1.6.0->FlagEmbedding) (11.4.5.107)\n", "Requirement already satisfied: nvidia-cusparse-cu12==12.1.0.106 in /opt/conda/lib/python3.11/site-packages (from torch>=1.6.0->FlagEmbedding) (12.1.0.106)\n", "Requirement already satisfied: nvidia-nccl-cu12==2.19.3 in /opt/conda/lib/python3.11/site-packages (from torch>=1.6.0->FlagEmbedding) (2.19.3)\n", "Requirement already satisfied: nvidia-nvtx-cu12==12.1.105 in /opt/conda/lib/python3.11/site-packages (from torch>=1.6.0->FlagEmbedding) (12.1.105)\n", "Requirement already 
satisfied: triton==2.2.0 in /opt/conda/lib/python3.11/site-packages (from torch>=1.6.0->FlagEmbedding) (2.2.0)\n", "Requirement already satisfied: nvidia-nvjitlink-cu12 in /opt/conda/lib/python3.11/site-packages (from nvidia-cusolver-cu12==11.4.5.107->torch>=1.6.0->FlagEmbedding) (12.4.127)\n", "Requirement already satisfied: beautifulsoup4>=4.4.1 in /opt/conda/lib/python3.11/site-packages (from ir-datasets->FlagEmbedding) (4.12.3)\n", "Requirement already satisfied: inscriptis>=2.2.0 in /opt/conda/lib/python3.11/site-packages (from ir-datasets->FlagEmbedding) (2.5.0)\n", "Requirement already satisfied: lxml>=4.5.2 in /opt/conda/lib/python3.11/site-packages (from ir-datasets->FlagEmbedding) (5.3.0)\n", "Requirement already satisfied: trec-car-tools>=2.5.4 in /opt/conda/lib/python3.11/site-packages (from ir-datasets->FlagEmbedding) (2.6)\n", "Requirement already satisfied: lz4>=3.1.10 in /opt/conda/lib/python3.11/site-packages (from ir-datasets->FlagEmbedding) (4.3.3)\n", "Requirement already satisfied: warc3-wet>=0.2.3 in /opt/conda/lib/python3.11/site-packages (from ir-datasets->FlagEmbedding) (0.2.5)\n", "Requirement already satisfied: warc3-wet-clueweb09>=0.2.5 in /opt/conda/lib/python3.11/site-packages (from ir-datasets->FlagEmbedding) (0.2.5)\n", "Requirement already satisfied: zlib-state>=0.1.3 in /opt/conda/lib/python3.11/site-packages (from ir-datasets->FlagEmbedding) (0.1.9)\n", "Requirement already satisfied: ijson>=3.1.3 in /opt/conda/lib/python3.11/site-packages (from ir-datasets->FlagEmbedding) (3.3.0)\n", "Requirement already satisfied: unlzw3>=0.2.1 in /opt/conda/lib/python3.11/site-packages (from ir-datasets->FlagEmbedding) (0.2.2)\n", "Requirement already satisfied: scikit-learn in /opt/conda/lib/python3.11/site-packages (from sentence-transformers->FlagEmbedding) (1.4.2)\n", "Requirement already satisfied: scipy in /opt/conda/lib/python3.11/site-packages (from sentence-transformers->FlagEmbedding) (1.13.0)\n", "Requirement already satisfied: 
Pillow in /opt/conda/lib/python3.11/site-packages (from sentence-transformers->FlagEmbedding) (10.3.0)\n", "Requirement already satisfied: soupsieve>1.2 in /opt/conda/lib/python3.11/site-packages (from beautifulsoup4>=4.4.1->ir-datasets->FlagEmbedding) (2.5)\n", "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /opt/conda/lib/python3.11/site-packages (from aiohttp->datasets==2.19.0->FlagEmbedding) (2.4.4)\n", "Requirement already satisfied: aiosignal>=1.1.2 in /opt/conda/lib/python3.11/site-packages (from aiohttp->datasets==2.19.0->FlagEmbedding) (1.3.2)\n", "Requirement already satisfied: attrs>=17.3.0 in /opt/conda/lib/python3.11/site-packages (from aiohttp->datasets==2.19.0->FlagEmbedding) (23.2.0)\n", "Requirement already satisfied: frozenlist>=1.1.1 in /opt/conda/lib/python3.11/site-packages (from aiohttp->datasets==2.19.0->FlagEmbedding) (1.5.0)\n", "Requirement already satisfied: multidict<7.0,>=4.5 in /opt/conda/lib/python3.11/site-packages (from aiohttp->datasets==2.19.0->FlagEmbedding) (6.1.0)\n", "Requirement already satisfied: propcache>=0.2.0 in /opt/conda/lib/python3.11/site-packages (from aiohttp->datasets==2.19.0->FlagEmbedding) (0.2.1)\n", "Requirement already satisfied: yarl<2.0,>=1.17.0 in /opt/conda/lib/python3.11/site-packages (from aiohttp->datasets==2.19.0->FlagEmbedding) (1.18.3)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.11/site-packages (from requests>=2.19.0->datasets==2.19.0->FlagEmbedding) (3.3.2)\n", "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.11/site-packages (from requests>=2.19.0->datasets==2.19.0->FlagEmbedding) (3.7)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.11/site-packages (from requests>=2.19.0->datasets==2.19.0->FlagEmbedding) (2.2.1)\n", "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.11/site-packages (from requests>=2.19.0->datasets==2.19.0->FlagEmbedding) (2024.2.2)\n", 
"Requirement already satisfied: cbor>=1.0.0 in /opt/conda/lib/python3.11/site-packages (from trec-car-tools>=2.5.4->ir-datasets->FlagEmbedding) (1.0.0)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /opt/conda/lib/python3.11/site-packages (from jinja2->torch>=1.6.0->FlagEmbedding) (2.1.5)\n", "Requirement already satisfied: python-dateutil>=2.8.2 in /opt/conda/lib/python3.11/site-packages (from pandas->datasets==2.19.0->FlagEmbedding) (2.9.0)\n", "Requirement already satisfied: pytz>=2020.1 in /opt/conda/lib/python3.11/site-packages (from pandas->datasets==2.19.0->FlagEmbedding) (2024.1)\n", "Requirement already satisfied: tzdata>=2022.7 in /opt/conda/lib/python3.11/site-packages (from pandas->datasets==2.19.0->FlagEmbedding) (2024.1)\n", "Requirement already satisfied: joblib>=1.2.0 in /opt/conda/lib/python3.11/site-packages (from scikit-learn->sentence-transformers->FlagEmbedding) (1.4.0)\n", "Requirement already satisfied: threadpoolctl>=2.0.0 in /opt/conda/lib/python3.11/site-packages (from scikit-learn->sentence-transformers->FlagEmbedding) (3.4.0)\n", "Requirement already satisfied: mpmath>=0.19 in /opt/conda/lib/python3.11/site-packages (from sympy->torch>=1.6.0->FlagEmbedding) (1.3.0)\n", "Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.11/site-packages (from python-dateutil>=2.8.2->pandas->datasets==2.19.0->FlagEmbedding) (1.16.0)\n", "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", "\u001b[0m" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "a24dee20be054f138b75c100ab2e6a36", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Fetching 30 files: 0%| | 0/30 [00:00