Spaces:

govtech
/

off-topic-demo

Running

App Files Files Community

Shing Yee committed on Sep 24, 2024

Commit

3c2639a

unverified ·

1 Parent(s): 2aec41c

Add application

Browse files

Files changed (8) hide show

.gitattributes +1 -0
.gitignore +160 -0
app.py +55 -0
models/cross-encoder-ms-marco-MiniLM-L-6-v2-CrossEncoder-OffTopic-Classifier-20240918-090615.safetensors +3 -0
models/cross-encoder-stsb-roberta-base-CrossEncoder-OffTopic-Classifier-20240920-174009.safetensors +3 -0
models/jinaai-jina-embeddings-v2-small-en-TwinEncoder-OffTopic-Classifier-20240915-151858.safetensors +3 -0
requirements.txt +66 -0
utils.py +202 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+models/*.safetensors filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,160 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/

app.py ADDED Viewed

	@@ -0,0 +1,55 @@

+import gradio as gr
+from utils import (
+    device,
+    jina_tokenizer,
+    jina_model,
+    embeddings_predict_relevance,
+    stsb_model,
+    stsb_tokenizer,
+    ms_model,
+    ms_tokenizer,
+    cross_encoder_predict_relevance
+)
+def predict(system_prompt, user_prompt, selected_model):
+    """Score how likely ``user_prompt`` is off-topic for ``system_prompt``.
+
+    Args:
+        system_prompt: Text of the system prompt that defines the topic.
+        user_prompt: Text of the user prompt to check against it.
+        selected_model: Model name chosen in the dropdown. May be ``None``
+            (nothing selected) or any name not handled below.
+
+    Returns:
+        A display string with the off-topic probability as a percentage, or
+        a short instruction when no supported model was selected.
+    """
+    if selected_model == "jinaai/jina-embeddings-v2-small-en":
+        _, probabilities = embeddings_predict_relevance(system_prompt, user_prompt, jina_model, jina_tokenizer, device)
+    elif selected_model == "cross-encoder/stsb-roberta-base":
+        _, probabilities = cross_encoder_predict_relevance(system_prompt, user_prompt, stsb_model, stsb_tokenizer, device)
+    elif selected_model == "cross-encoder/ms-marco-MiniLM-L-6-v2":
+        _, probabilities = cross_encoder_predict_relevance(system_prompt, user_prompt, ms_model, ms_tokenizer, device)
+    else:
+        # Without this branch `probabilities` is never bound and the lookup
+        # below fails with UnboundLocalError when no model has been picked.
+        return "Please select a model before checking the content."
+    # Column 1 of the first (only) row is the value reported as "off-topic".
+    probability_off_topic = probabilities[0][1] * 100
+    return f'{probability_off_topic:.3f}% chance this is off-topic'
+# Model names offered in the dropdown; `predict` matches on these exact strings.
+model_choices = [
+    "jinaai/jina-embeddings-v2-small-en",
+    "cross-encoder/stsb-roberta-base",
+    "cross-encoder/ms-marco-MiniLM-L-6-v2",
+]
+with gr.Blocks(theme=gr.themes.Soft(), fill_height=True) as app:
+    gr.Markdown("# Off-Topic Classification using Fine-tuned Embeddings and Cross-Encoder Models")
+    # First row: the two prompts, side by side.
+    with gr.Row():
+        system_prompt = gr.Textbox(label="System Prompt")
+        user_prompt = gr.Textbox(label="User Prompt")
+    # Second row: the model picker.
+    with gr.Row():
+        selected_model = gr.Dropdown(choices=model_choices, label="Select a model")
+    # Trigger button and the textbox that receives the string from `predict`.
+    get_classfication = gr.Button("Check Content")
+    output_result = gr.Textbox(label="Classification and Probabilities", lines=5)
+    get_classfication.click(
+        predict,
+        [system_prompt, user_prompt, selected_model],
+        output_result,
+    )
+# Start the Gradio server only when this file is run as a script.
+if __name__ == "__main__":
+    app.launch()

models/cross-encoder-ms-marco-MiniLM-L-6-v2-CrossEncoder-OffTopic-Classifier-20240918-090615.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:78a99fac3bc5b4729fee844d2154ea625aa9ceac2928cd648984ee1da5b8a203
+size 91236352

models/cross-encoder-stsb-roberta-base-CrossEncoder-OffTopic-Classifier-20240920-174009.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1e90752828e92bc2f8ec567b85b3de5a0c8c5ddc331c1907d4dfa950624f71ce
+size 500085976

models/jinaai-jina-embeddings-v2-small-en-TwinEncoder-OffTopic-Classifier-20240915-151858.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:223687abc28cf0fa198d326d2786374000396d841e66d684c022941da2ca9628
+size 144076480

requirements.txt ADDED Viewed

	@@ -0,0 +1,66 @@

+aiofiles==23.2.1
+annotated-types==0.7.0
+anyio==4.6.0
+certifi==2024.8.30
+charset-normalizer==3.3.2
+click==8.1.7
+contourpy==1.3.0
+cycler==0.12.1
+fastapi==0.115.0
+ffmpy==0.4.0
+filelock==3.16.1
+fonttools==4.54.0
+fsspec==2024.9.0
+gradio==4.44.0
+gradio_client==1.3.0
+h11==0.14.0
+httpcore==1.0.5
+httpx==0.27.2
+huggingface-hub==0.25.1
+idna==3.10
+importlib_resources==6.4.5
+Jinja2==3.1.4
+kiwisolver==1.4.7
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
+matplotlib==3.9.2
+mdurl==0.1.2
+mpmath==1.3.0
+networkx==3.3
+numpy==2.1.1
+orjson==3.10.7
+packaging==24.1
+pandas==2.2.3
+pillow==10.4.0
+pydantic==2.9.2
+pydantic_core==2.23.4
+pydub==0.25.1
+Pygments==2.18.0
+pyparsing==3.1.4
+python-dateutil==2.9.0.post0
+python-multipart==0.0.10
+pytz==2024.2
+PyYAML==6.0.2
+regex==2024.9.11
+requests==2.32.3
+rich==13.8.1
+ruff==0.6.7
+safetensors==0.4.5
+semantic-version==2.10.0
+setuptools==75.1.0
+shellingham==1.5.4
+six==1.16.0
+sniffio==1.3.1
+starlette==0.38.6
+sympy==1.13.3
+tokenizers==0.19.1
+tomlkit==0.12.0
+torch==2.4.1
+tqdm==4.66.5
+transformers==4.44.2
+typer==0.12.5
+typing_extensions==4.12.2
+tzdata==2024.2
+urllib3==2.2.3
+uvicorn==0.30.6
+websockets==12.0

utils.py ADDED Viewed

	@@ -0,0 +1,202 @@

+import torch
+from torch import nn
+from safetensors.torch import load_file
+from transformers import AutoModel, AutoTokenizer
+# Use the GPU when PyTorch reports CUDA as available; otherwise run on the CPU.
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+def load_model_safetensors(model, load_path="model.safetensors"):
+    """Fill ``model`` with the weights stored in a safetensors file.
+
+    Args:
+        model: Module whose ``load_state_dict`` receives the loaded tensors.
+        load_path: Path of the ``.safetensors`` file to read.
+
+    Returns:
+        The same ``model`` object that was passed in.
+    """
+    model.load_state_dict(load_file(load_path))
+    return model
+##########################
+# JINA EMBEDDINGS
+##########################
+# Jina Configs
+# NOTE(review): this constant is not referenced in the code shown here;
+# embeddings_predict_relevance uses the literal 1024 — keep the two in sync.
+JINA_CONTEXT_LEN = 1024
+class Adapter(nn.Module):
+    """Residual bottleneck applied to embeddings.
+
+    The input is projected down to ``hidden_size // 2``, passed through a
+    ReLU, projected back up to ``hidden_size`` and added to the input.
+    """
+    def __init__(self, hidden_size):
+        super().__init__()
+        bottleneck = hidden_size // 2
+        self.down_project = nn.Linear(hidden_size, bottleneck)
+        self.activation = nn.ReLU()
+        self.up_project = nn.Linear(bottleneck, hidden_size)
+    def forward(self, x):
+        """Return the bottleneck transformation of ``x`` plus ``x`` itself."""
+        transformed = self.up_project(self.activation(self.down_project(x)))
+        # Skip connection: the adapter output is added onto the input.
+        return transformed + x
+class AttentionPooling(nn.Module):
+    """Pool over the leading dimension using a learned scoring vector.
+
+    Each position is scored by a dot product with ``attention_weights``; the
+    scores are softmax-normalised over dim 0 and used as weights for a sum
+    over that same dimension.
+    """
+    def __init__(self, hidden_size):
+        super().__init__()
+        self.attention_weights = nn.Parameter(torch.randn(hidden_size))
+    def forward(self, hidden_states):
+        """Return the weighted sum of ``hidden_states`` over dim 0."""
+        # The caller in this file passes [seq_len, batch_size, hidden_size].
+        scores = hidden_states @ self.attention_weights
+        weights = scores.softmax(dim=0).unsqueeze(-1)
+        return (weights * hidden_states).sum(dim=0)
+# Twin-encoder classifier: both sentences go through one shared encoder and
+# interact through cross-attention before an MLP classification head.
+class CrossEncoderWithSharedBase(nn.Module):
+    """Sentence-pair classifier built on a single shared encoder.
+
+    Each sentence is encoded separately by ``base_model``, passed through its
+    own ``Adapter``, attended against the other sentence in both directions,
+    pooled with ``AttentionPooling``, concatenated, projected and classified.
+
+    Args:
+        base_model: Encoder exposing ``config.hidden_size`` and returning an
+            object with a ``last_hidden_state`` attribute.
+        num_labels: Width of the final linear layer (number of logits).
+        num_heads: Head count passed to both ``nn.MultiheadAttention`` layers.
+    """
+    def __init__(self, base_model, num_labels=2, num_heads=8):
+        super(CrossEncoderWithSharedBase, self).__init__()
+        # Shared pre-trained model (used for both sentences)
+        self.shared_encoder = base_model
+        hidden_size = self.shared_encoder.config.hidden_size
+        # Sentence-specific adapters
+        self.adapter1 = Adapter(hidden_size)
+        self.adapter2 = Adapter(hidden_size)
+        # Cross-attention layers, one per direction
+        self.cross_attention_1_to_2 = nn.MultiheadAttention(hidden_size, num_heads)
+        self.cross_attention_2_to_1 = nn.MultiheadAttention(hidden_size, num_heads)
+        # Attention pooling layers, one per direction
+        self.attn_pooling_1_to_2 = AttentionPooling(hidden_size)
+        self.attn_pooling_2_to_1 = AttentionPooling(hidden_size)
+        # Projection layer with non-linearity: 2 * hidden_size -> hidden_size
+        self.projection_layer = nn.Sequential(
+            nn.Linear(hidden_size * 2, hidden_size),
+            nn.ReLU()
+        )
+        # Classifier: three linear layers (two hidden layers plus the output
+        # layer), with ReLU and dropout after each hidden layer
+        self.classifier = nn.Sequential(
+            nn.Linear(hidden_size, hidden_size // 2),
+            nn.ReLU(),
+            nn.Dropout(0.1),
+            nn.Linear(hidden_size // 2, hidden_size // 4),
+            nn.ReLU(),
+            nn.Dropout(0.1),
+            nn.Linear(hidden_size // 4, num_labels)
+        )
+    def forward(self, input_ids1, attention_mask1, input_ids2, attention_mask2):
+        """Return classification logits for a pair of tokenized sentences.
+
+        Args:
+            input_ids1: Token ids of the first sentence.
+            attention_mask1: Attention mask of the first sentence.
+            input_ids2: Token ids of the second sentence.
+            attention_mask2: Attention mask of the second sentence.
+
+        Returns:
+            The output of ``self.classifier``; its last dimension has
+            ``num_labels`` entries.
+        """
+        # Encode sentences with the shared encoder
+        outputs1 = self.shared_encoder(input_ids1, attention_mask=attention_mask1)
+        outputs2 = self.shared_encoder(input_ids2, attention_mask=attention_mask2)
+        # Apply sentence-specific adapters
+        embeds1 = self.adapter1(outputs1.last_hidden_state)
+        embeds2 = self.adapter2(outputs2.last_hidden_state)
+        # Swap the first two dims for the attention layers, which are built
+        # without batch_first (presumably [batch, seq, hidden] ->
+        # [seq, batch, hidden] — TODO confirm the encoder's output layout)
+        embeds1 = embeds1.transpose(0, 1)
+        embeds2 = embeds2.transpose(0, 1)
+        # Cross-attention: each sentence queries the other (key == value)
+        # NOTE(review): no key_padding_mask is passed here and the pooling
+        # below softmaxes over every position, so padded tokens take part —
+        # confirm this matches how the checkpoint was trained before changing.
+        cross_attn_1_to_2, _ = self.cross_attention_1_to_2(embeds1, embeds2, embeds2)
+        cross_attn_2_to_1, _ = self.cross_attention_2_to_1(embeds2, embeds1, embeds1)
+        # Attention pooling collapses dim 0 of each attended sequence
+        pooled_1_to_2 = self.attn_pooling_1_to_2(cross_attn_1_to_2)
+        pooled_2_to_1 = self.attn_pooling_2_to_1(cross_attn_2_to_1)
+        # Concatenate both directions along dim 1 and project
+        combined = torch.cat((pooled_1_to_2, pooled_2_to_1), dim=1)
+        projected = self.projection_layer(combined)
+        # Classification
+        logits = self.classifier(projected)
+        return logits
+def embeddings_predict_relevance(sentence1, sentence2, model, tokenizer, device, max_length=1024):
+    """Classify a sentence pair with the twin-encoder model.
+
+    Args:
+        sentence1: First text (the system prompt in this app).
+        sentence2: Second text (the user prompt in this app).
+        model: Module called with ``input_ids1``, ``attention_mask1``,
+            ``input_ids2`` and ``attention_mask2``; must return logits.
+        tokenizer: Callable returning ``input_ids`` and ``attention_mask``
+            tensors for a single text.
+        device: Torch device on which inference runs.
+        max_length: Length each text is truncated and padded to. Defaults to
+            the previously hard-coded 1024.
+
+    Returns:
+        Tuple ``(predicted_label, probabilities)``: the argmax class index as
+        an int and the softmax probabilities as a NumPy array.
+    """
+    # The inputs are moved to `device` below, so the model must be there too;
+    # otherwise a CUDA host fails with a device-mismatch error. This is a
+    # no-op when the model is already on `device`.
+    model.to(device)
+    model.eval()
+    tokenize_kwargs = dict(return_tensors="pt", truncation=True, padding="max_length", max_length=max_length)
+    inputs1 = tokenizer(sentence1, **tokenize_kwargs)
+    inputs2 = tokenizer(sentence2, **tokenize_kwargs)
+    input_ids1 = inputs1['input_ids'].to(device)
+    attention_mask1 = inputs1['attention_mask'].to(device)
+    input_ids2 = inputs2['input_ids'].to(device)
+    attention_mask2 = inputs2['attention_mask'].to(device)
+    with torch.no_grad():
+        logits = model(input_ids1=input_ids1, attention_mask1=attention_mask1,
+                       input_ids2=input_ids2, attention_mask2=attention_mask2)
+        probabilities = torch.softmax(logits, dim=1)
+        # .item() requires exactly one row, i.e. one sentence pair per call.
+        predicted_label = torch.argmax(probabilities, dim=1).item()
+    return predicted_label, probabilities.cpu().numpy()
+# Jina model: tokenizer, pretrained encoder, then the fine-tuned classifier weights
+JINA_MODEL_NAME = "jinaai/jina-embeddings-v2-small-en"
+JINA_WEIGHTS_PATH = "models/jinaai-jina-embeddings-v2-small-en-TwinEncoder-OffTopic-Classifier-20240915-151858.safetensors"
+jina_tokenizer = AutoTokenizer.from_pretrained(JINA_MODEL_NAME)
+jina_base_model = AutoModel.from_pretrained(JINA_MODEL_NAME)
+# Wrap the encoder in the twin-encoder head and load the saved state dict into it.
+jina_model = load_model_safetensors(
+    CrossEncoderWithSharedBase(jina_base_model, num_labels=2),
+    load_path=JINA_WEIGHTS_PATH,
+)
+##########################
+# CROSS-ENCODER
+##########################
+# STSB Configs
+STSB_CONTEXT_LEN = 512
+# ms-marco Configs
+MS_CONTEXT_LEN = 512
+class CrossEncoderWithMLP(nn.Module):
+    """Cross-encoder backbone followed by an MLP and a linear classifier.
+
+    Args:
+        base_model: Encoder exposing ``config.hidden_size`` and returning an
+            object with a ``pooler_output`` attribute.
+        num_labels: Number of logits produced by the classifier head.
+    """
+    def __init__(self, base_model, num_labels=2):
+        super().__init__()
+        # Pre-trained cross-encoder used as the backbone
+        self.base_model = base_model
+        width = base_model.config.hidden_size
+        half, quarter = width // 2, width // 4
+        # Two-layer MLP that shrinks the pooled pair representation
+        self.mlp = nn.Sequential(
+            nn.Linear(width, half),
+            nn.ReLU(),
+            nn.Linear(half, quarter),
+            nn.ReLU(),
+        )
+        # Classifier head
+        self.classifier = nn.Linear(quarter, num_labels)
+    def forward(self, input_ids, attention_mask):
+        """Return logits for an already-tokenized sentence pair."""
+        # Both sentences are encoded together in one pass of the backbone
+        pooled = self.base_model(input_ids, attention_mask).pooler_output
+        return self.classifier(self.mlp(pooled))
+def cross_encoder_predict_relevance(sentence1, sentence2, model, tokenizer, device, max_length=512):
+    """Classify a sentence pair with a cross-encoder model.
+
+    Args:
+        sentence1: First text (the system prompt in this app).
+        sentence2: Second text (the user prompt in this app).
+        model: Module called with ``input_ids`` and ``attention_mask``; must
+            return logits.
+        tokenizer: Callable that encodes the two texts together and returns
+            ``input_ids`` and ``attention_mask`` tensors.
+        device: Torch device on which inference runs.
+        max_length: Length the encoded pair is truncated and padded to.
+            Defaults to the previously hard-coded 512.
+
+    Returns:
+        Tuple ``(predicted_label, probabilities)``: the argmax class index as
+        an int and the softmax probabilities as a NumPy array.
+    """
+    # The inputs are moved to `device` below, so the model must be there too;
+    # otherwise a CUDA host fails with a device-mismatch error. This is a
+    # no-op when the model is already on `device`.
+    model.to(device)
+    model.eval()
+    # Tokenize the two sentences together as one pair
+    encoding = tokenizer(
+        sentence1, sentence2,
+        return_tensors="pt",
+        truncation=True,
+        padding="max_length",
+        max_length=max_length,
+        return_token_type_ids=False,
+    )
+    input_ids = encoding["input_ids"].to(device)
+    attention_mask = encoding["attention_mask"].to(device)
+    with torch.no_grad():
+        logits = model(input_ids=input_ids, attention_mask=attention_mask)
+        # Convert raw logits into per-class probabilities
+        probabilities = torch.softmax(logits, dim=1)
+        # .item() requires exactly one row, i.e. one sentence pair per call.
+        predicted_label = torch.argmax(probabilities, dim=1).item()
+    return predicted_label, probabilities.cpu().numpy()
+# STSB model: tokenizer, pretrained backbone, then the fine-tuned classifier weights
+STSB_MODEL_NAME = "cross-encoder/stsb-roberta-base"
+STSB_WEIGHTS_PATH = "models/cross-encoder-stsb-roberta-base-CrossEncoder-OffTopic-Classifier-20240920-174009.safetensors"
+stsb_tokenizer = AutoTokenizer.from_pretrained(STSB_MODEL_NAME)
+stsb_base_model = AutoModel.from_pretrained(STSB_MODEL_NAME)
+stsb_model = load_model_safetensors(
+    CrossEncoderWithMLP(stsb_base_model, num_labels=2),
+    load_path=STSB_WEIGHTS_PATH,
+)
+# MS model: same steps for the ms-marco MiniLM backbone
+MS_MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"
+MS_WEIGHTS_PATH = "models/cross-encoder-ms-marco-MiniLM-L-6-v2-CrossEncoder-OffTopic-Classifier-20240918-090615.safetensors"
+ms_tokenizer = AutoTokenizer.from_pretrained(MS_MODEL_NAME)
+ms_base_model = AutoModel.from_pretrained(MS_MODEL_NAME)
+ms_model = load_model_safetensors(
+    CrossEncoderWithMLP(ms_base_model, num_labels=2),
+    load_path=MS_WEIGHTS_PATH,
+)