Usage

import re
import urllib.parse

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import nltk.tokenize
import torch

# Token pattern, similar to nltk's wordpunct_tokenize. Each alternative matches
# one maximal run of a single character class:
#   [^\W_0-9]+  word characters other than "_" and the ASCII digits 0-9
#   [^\w\s]+    characters that are neither word characters nor whitespace
#   _+          underscores
#   \s+         whitespace
#   [0-9]+      ASCII digits
preprocess_tokenizer_regex = r'[^\W_0-9]+|[^\w\s]+|_+|\s+|[0-9]+'
_regexp_tokenizer = nltk.tokenize.RegexpTokenizer(preprocess_tokenizer_regex)
# Callable taking a string and returning its list of tokens.
preprocess_tokenizer = _regexp_tokenizer.tokenize

def preprocess_url(url):
    """Normalize a URL into a space-separated token string.

    Steps, in order:
      1. Strip trailing "/" characters and drop everything up to and
         including the first "://" (if present).
      2. Percent-decode the remainder; bytes that are not valid UTF-8 are
         rendered as backslash escapes instead of raising.
      3. Collapse every whitespace run to a single space and trim both ends.
      4. Split with ``preprocess_tokenizer`` and join the tokens with a
         single space.

    Args:
        url: The URL string to preprocess.

    Returns:
        The preprocessed URL as a single string.
    """
    # The scheme offset is located on the untouched input; the slice is then
    # applied to the right-stripped string.
    scheme_pos = url.find("://")
    start = 0 if scheme_pos == -1 else scheme_pos + 3
    without_scheme = url.rstrip('/')[start:]

    decoded = urllib.parse.unquote(without_scheme, errors="backslashreplace")

    # Remove blanks: collapse whitespace runs first, then trim the ends.
    collapsed = re.sub(r'\s+', ' ', decoded)
    trimmed = re.sub(r'^\s+|\s+$', '', collapsed)

    # Tokenize
    tokens = preprocess_tokenizer(trimmed)
    return ' '.join(tokens)

# Hub identifier shared by the tokenizer and the classifier, kept in one place
# so the two cannot drift apart.
MODEL_ID = "Transducens/xlm-roberta-base-parallel-urls-classifier"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)

# prepare input: both URLs get the same preprocessing and are joined by the
# tokenizer's separator token into a single input string
url1 = preprocess_url("https://web.ua.es/en/culture.html")
url2 = preprocess_url("https://web.ua.es/es/cultura.html")
urls = f"{url1}{tokenizer.sep_token}{url2}"
encoded_input = tokenizer(
    urls,
    add_special_tokens=True,
    truncation=True,
    padding="longest",
    return_attention_mask=True,
    return_tensors="pt",
    max_length=256,
)

# forward pass: this is inference only, so disable gradient tracking to avoid
# building an autograd graph that is never used
with torch.no_grad():
    output = model(
        input_ids=encoded_input["input_ids"],
        attention_mask=encoded_input["attention_mask"],
    )

# obtain probability: sigmoid maps the logit into (0, 1); .item() requires the
# squeezed tensor to hold exactly one element.
# NOTE(review): going by the model name, this is presumably the probability
# that the two URLs point to parallel (translated) pages - confirm against the
# model card.
probability = torch.sigmoid(output["logits"]).cpu().squeeze().item()

print(probability)
Downloads last month: 7
Inference Examples
This model does not have enough activity to be deployed to Inference API (serverless) yet. Increase its social visibility and check back later, or deploy to Inference Endpoints (dedicated) instead.