Shing Yee
committed on
feat: add files
Browse files- .gitignore +2 -0
- README.md +48 -1
- config.json +11 -0
- inference_onnx.py +82 -0
- inference_safetensors.py +121 -0
- models/off-topic-cross-encoder-stsb-roberta-base-CrossEncoder.onnx +3 -0
- govtech-stsb-roberta-base-off-topic → models/off-topic-cross-encoder-stsb-roberta-base-CrossEncoder.safetensors +0 -0
- requirements.txt +6 -0
.gitignore
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
.venv/
|
2 |
+
.DS_store
|
README.md
CHANGED
@@ -2,4 +2,51 @@
|
|
2 |
license: other
|
3 |
license_name: govtech-singapore
|
4 |
license_link: LICENSE
|
5 |
-
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
license: other
|
3 |
license_name: govtech-singapore
|
4 |
license_link: LICENSE
|
5 |
+
---
|
6 |
+
|
7 |
+
# Off-Topic Classification Model
|
9 |
+
|
10 |
+
This model leverages a fine-tuned **Cross Encoder STSB Roberta Base** to perform binary classification, determining whether a user prompt is off-topic in relation to the system's intended purpose as defined by the system prompt.
|
11 |
+
|
12 |
+
## Model Highlights
|
13 |
+
|
14 |
+
- **Base Model**: [`stsb-roberta-base`](https://huggingface.co/sentence-transformers/stsb-roberta-base)
|
15 |
+
- **Maximum Context Length**: 512 tokens (per `max_length` in `config.json`)
|
16 |
+
- **Task**: Binary classification (on-topic/off-topic)
|
17 |
+
|
18 |
+
## Performance
|
19 |
+
|
20 |
+
| Approach | Model | ROC-AUC | F1 | Precision | Recall |
|
21 |
+
|---------------------------------------|--------------------------------|---------|------|-----------|--------|
|
22 |
+
| Fine-tuned cross-encoder classifier | stsb-roberta-base | 0.99 | 0.99 | 0.99 | 0.99 |
|
23 |
+
| Pre-trained cross-encoder | stsb-roberta-base | 0.73 | 0.68 | 0.53 | 0.93 |
|
24 |
+
|
25 |
+
## Usage
|
26 |
+
1. Clone this repository and install the required dependencies:
|
27 |
+
|
28 |
+
```bash
|
29 |
+
pip install -r requirements.txt
|
30 |
+
```
|
31 |
+
|
32 |
+
2. You can run the model using two options:
|
33 |
+
|
34 |
+
**Option 1**: Using `inference_onnx.py` with the ONNX Model.
|
35 |
+
|
36 |
+
```
|
37 |
+
python inference_onnx.py '[
|
38 |
+
["System prompt example 1", "User prompt example 1"],
|
39 |
+
["System prompt example 2", "System prompt example 2]
|
40 |
+
]'
|
41 |
+
```
|
42 |
+
|
43 |
+
**Option 2**: Using `inference_safetensors.py` with PyTorch and SafeTensors.
|
44 |
+
|
45 |
+
```
|
46 |
+
python inference_safetensors.py '[
|
47 |
+
["System prompt example 1", "User prompt example 1"],
|
48 |
+
["System prompt example 2", "System prompt example 2]
|
49 |
+
]'
|
50 |
+
```
|
51 |
+
|
52 |
+
Read more about this model in our [technical report](https://arxiv.org/abs/2411.12946).
|
config.json
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"description": "Off-Topic classifier designed to block user prompts that do not align with the intended purpose of the system, as determined by the system prompt.",
|
3 |
+
"classifier": {
|
4 |
+
"embedding": {
|
5 |
+
"model_name": "cross-encoder/stsb-roberta-base",
|
6 |
+
"max_length": 512,
|
7 |
+
"model_weights_fp": "models/off-topic-cross-encoder-stsb-roberta-base-CrossEncoder.safetensors",
|
8 |
+
"model_fp": "models/off-topic-cross-encoder-stsb-roberta-base-CrossEncoder.onnx"
|
9 |
+
}
|
10 |
+
}
|
11 |
+
}
|
inference_onnx.py
ADDED
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
inference_onnx.py
|
3 |
+
|
4 |
+
This script leverages ONNX runtime to perform inference with a pre-trained model.
|
5 |
+
"""
|
6 |
+
import json
|
7 |
+
import torch
|
8 |
+
import sys
|
9 |
+
import numpy as np
|
10 |
+
import onnxruntime as rt
|
11 |
+
|
12 |
+
from huggingface_hub import hf_hub_download
|
13 |
+
from transformers import AutoTokenizer
|
14 |
+
|
15 |
+
repo_path = "govtech/stsb-roberta-base-off-topic"
|
16 |
+
config_path = hf_hub_download(repo_id=repo_path, filename="config.json")
|
17 |
+
|
18 |
+
config_path = "config.json"
|
19 |
+
|
20 |
+
with open(config_path, 'r') as f:
|
21 |
+
config = json.load(f)
|
22 |
+
|
23 |
+
def predict(sentence1, sentence2):
    """
    Classify a (system prompt, user prompt) pair with the ONNX off-topic model.

    Args:
    - sentence1 (str): The first input sentence (system prompt).
    - sentence2 (str): The second input sentence (user prompt).

    Returns:
        tuple:
        - predicted_label (int): The predicted class index (argmax over classes).
        - probabilities (numpy.ndarray): Per-class probabilities, shape (1, num_classes).
    """
    # Configuration
    model_name = config['classifier']['embedding']['model_name']
    max_length = config['classifier']['embedding']['max_length']
    model_fp = config['classifier']['embedding']['model_fp']

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Tokenize the two sentences as a single cross-encoder pair.
    # ONNX Runtime only consumes CPU NumPy arrays, so there is no need to
    # move the tensors to a CUDA device and back (the original code did).
    encoding = tokenizer(
        sentence1, sentence2,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_token_type_ids=False
    )
    input_ids = encoding["input_ids"]
    attention_mask = encoding["attention_mask"]

    # Download the classifier from the Hugging Face Hub (cached locally by
    # huggingface_hub after the first call).
    local_model_fp = hf_hub_download(repo_id=repo_path, filename=model_fp)

    # Run inference
    session = rt.InferenceSession(local_model_fp)  # Load the ONNX model
    onnx_inputs = {
        session.get_inputs()[0].name: input_ids.numpy(),
        session.get_inputs()[1].name: attention_mask.numpy()
    }
    outputs = session.run(None, onnx_inputs)

    # Convert logits to probabilities and pick the argmax class.
    probabilities = torch.softmax(torch.tensor(outputs[0]), dim=1)
    predicted_label = torch.argmax(probabilities, dim=1).item()

    return predicted_label, probabilities.numpy()
|
61 |
+
|
62 |
+
if __name__ == "__main__":
    # The single CLI argument is a JSON-encoded list of [sentence1, sentence2] pairs.
    sentence_pairs = json.loads(sys.argv[1])

    # Reject anything that is not a pair of strings.
    if not all(isinstance(p[0], str) and isinstance(p[1], str) for p in sentence_pairs):
        raise ValueError("Each pair must contain two strings.")

    for i, (s1, s2) in enumerate(sentence_pairs, start=1):
        # Classify this pair and report the outcome.
        label, probs = predict(s1, s2)

        print(f"Pair {i}:")
        print(f" Sentence 1: {s1}")
        print(f" Sentence 2: {s2}")
        print(f" Predicted Label: {label}")
        print(f" Probabilities: {probs}")
        print('-' * 50)
|
inference_safetensors.py
ADDED
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
inference_safetensors.py
|
3 |
+
|
4 |
+
Defines the architecture of the fine-tuned embedding model used for Off-Topic classification.
|
5 |
+
"""
|
6 |
+
import json
|
7 |
+
import torch
|
8 |
+
import sys
|
9 |
+
import torch.nn as nn
|
10 |
+
|
11 |
+
from huggingface_hub import hf_hub_download
|
12 |
+
from safetensors.torch import load_file
|
13 |
+
from transformers import AutoTokenizer, AutoModel
|
14 |
+
|
15 |
+
class CrossEncoderWithMLP(nn.Module):
    """Cross-encoder backbone followed by a shrinking MLP and a classifier head."""

    def __init__(self, base_model, num_labels=2):
        super().__init__()

        # Pre-trained cross-encoder that embeds the sentence pair.
        self.base_model = base_model
        dim = base_model.config.hidden_size
        # Funnel the pooled embedding through two progressively smaller layers.
        self.mlp = nn.Sequential(
            nn.Linear(dim, dim // 2),
            nn.ReLU(),
            nn.Linear(dim // 2, dim // 4),
            nn.ReLU(),
        )
        # Final projection onto the label logits.
        self.classifier = nn.Linear(dim // 4, num_labels)

    def forward(self, input_ids, attention_mask):
        # Encode both sentences in a single pass and take the pooled output.
        pooled = self.base_model(input_ids, attention_mask).pooler_output
        # MLP then classifier head produce the class logits.
        return self.classifier(self.mlp(pooled))
|
42 |
+
|
43 |
+
# Load configuration file.
# The repo id must match the model this script ships with (stsb-roberta-base,
# see config.json and the onnx script); it previously pointed at the
# jina-embeddings-v2-small-en repository by mistake.
repo_path = "govtech/stsb-roberta-base-off-topic"
config_path = "config.json"

with open(config_path, 'r') as f:
    config = json.load(f)
|
50 |
+
|
51 |
+
# Cache for the tokenizer and fully-assembled model: rebuilding them from
# scratch for every sentence pair dominated the runtime of the original code.
_ARTIFACTS = {}


def predict(sentence1, sentence2):
    """
    Predicts the label for a pair of sentences using a fine-tuned model with SafeTensors weights.

    Args:
    - sentence1 (str): The first input sentence.
    - sentence2 (str): The second input sentence.

    Returns:
        tuple:
        - predicted_label (int): The predicted label (e.g., 0 or 1).
        - probabilities (numpy.ndarray): The probabilities for each class.
    """
    # Load model configuration
    model_name = config['classifier']['embedding']['model_name']
    max_length = config['classifier']['embedding']['max_length']
    model_weights_fp = config['classifier']['embedding']['model_weights_fp']

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Build the tokenizer and classifier once; reuse them on later calls.
    if "model" not in _ARTIFACTS:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        base_model = AutoModel.from_pretrained(model_name)
        model = CrossEncoderWithMLP(base_model, num_labels=2)

        # Load the fine-tuned weights into the model.
        weights = load_file(model_weights_fp)
        model.load_state_dict(weights)
        model.to(device)
        model.eval()

        _ARTIFACTS["tokenizer"] = tokenizer
        _ARTIFACTS["model"] = model
    tokenizer = _ARTIFACTS["tokenizer"]
    model = _ARTIFACTS["model"]

    # Tokenize the two sentences as a single cross-encoder pair.
    encoding = tokenizer(
        sentence1, sentence2,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_token_type_ids=False
    )
    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)

    # Forward pass without gradient tracking.
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        probabilities = torch.softmax(outputs, dim=1)
        predicted_label = torch.argmax(probabilities, dim=1).item()

    return predicted_label, probabilities.cpu().numpy()
|
100 |
+
|
101 |
+
if __name__ == "__main__":
    # The single CLI argument is a JSON-encoded list of [sentence1, sentence2] pairs.
    sentence_pairs = json.loads(sys.argv[1])

    # Reject anything that is not a pair of strings.
    if not all(isinstance(p[0], str) and isinstance(p[1], str) for p in sentence_pairs):
        raise ValueError("Each pair must contain two strings.")

    for i, (s1, s2) in enumerate(sentence_pairs, start=1):
        # Classify this pair and report the outcome.
        label, probs = predict(s1, s2)

        print(f"Pair {i}:")
        print(f" Sentence 1: {s1}")
        print(f" Sentence 2: {s2}")
        print(f" Predicted Label: {label}")
        print(f" Probabilities: {probs}")
        print('-' * 50)
|
models/off-topic-cross-encoder-stsb-roberta-base-CrossEncoder.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4c311c4a80aae3477b3688d52f3c5dfc9e2e761242f7884d2e713f028e3aa21c
|
3 |
+
size 500394446
|
govtech-stsb-roberta-base-off-topic → models/off-topic-cross-encoder-stsb-roberta-base-CrossEncoder.safetensors
RENAMED
File without changes
|
requirements.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
huggingface_hub==0.26.2
|
2 |
+
numpy==2.1.3
|
3 |
+
onnxruntime==1.20.0
|
4 |
+
safetensors==0.4.5
|
5 |
+
torch==2.5.1
|
6 |
+
transformers==4.46.3
|