cgr71ii committed on
Commit
b71fe3e
·
verified ·
1 Parent(s): 53c1d69

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +46 -0
README.md CHANGED
@@ -1,3 +1,49 @@
1
  ---
2
  license: cc-by-sa-4.0
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  license: cc-by-sa-4.0
3
  ---
4
+ # Usage
5
+
6
+ ```python
7
+ import re
8
+ import urllib.parse
9
+
10
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
11
+ import nltk.tokenize
12
+ import torch
13
+
14
# Token pattern similar to nltk's wordpunct_tokenize: letter runs, punctuation
# runs, underscore runs, whitespace runs and digit runs are separate tokens.
preprocess_tokenizer_regex = r'[^\W_0-9]+|[^\w\s]+|_+|\s+|[0-9]+' # Similar to wordpunct_tokenize
preprocess_tokenizer = nltk.tokenize.RegexpTokenizer(preprocess_tokenizer_regex).tokenize

def preprocess_url(url):
    """Normalize a URL the way the classifier expects.

    Drops the protocol prefix (if any), strips trailing slashes,
    percent-decodes the remainder, collapses whitespace and returns the
    space-joined tokens produced by ``preprocess_tokenizer``.
    """
    scheme_sep = url.find("://")
    # Skip past "://" when a protocol is present; otherwise start at 0.
    start = 0 if scheme_sep == -1 else scheme_sep + 3
    stripped = url.rstrip('/')[start:]
    decoded = urllib.parse.unquote(stripped, errors="backslashreplace")

    # Remove blanks: collapse whitespace runs, then trim both ends
    collapsed = re.sub(r'\s+', ' ', decoded)
    collapsed = re.sub(r'^\s+|\s+$', '', collapsed)

    # Tokenize and re-join with single spaces
    return ' '.join(preprocess_tokenizer(collapsed))
31
+
32
# Load the fine-tuned classifier and its tokenizer from the Hugging Face Hub
# (network access is required on first use; weights are cached afterwards).
tokenizer = AutoTokenizer.from_pretrained("Transducens/xlm-roberta-base-parallel-urls-classifier")
model = AutoModelForSequenceClassification.from_pretrained("Transducens/xlm-roberta-base-parallel-urls-classifier")

# prepare input: preprocess the two candidate parallel URLs and join them into
# one sequence, separated by the tokenizer's SEP token.
url1 = preprocess_url("https://web.ua.es/en/culture.html")
url2 = preprocess_url("https://web.ua.es/es/cultura.html")
urls = f"{url1}{tokenizer.sep_token}{url2}"
# Truncate/pad to at most 256 tokens and return PyTorch tensors.
encoded_input = tokenizer(urls, add_special_tokens=True, truncation=True, padding="longest",
                          return_attention_mask=True, return_tensors="pt", max_length=256)

# forward pass (positional args: input_ids, attention_mask)
output = model(encoded_input["input_ids"], encoded_input["attention_mask"])

# obtain probability: single-logit head, so sigmoid yields the probability
# that the two URLs are parallel (translations of each other).
probability = torch.sigmoid(output["logits"]).cpu().squeeze().item()

print(probability)
49
+ ```