Spaces:
Runtime error
Runtime error
Update rpc.py
Browse files
rpc.py
CHANGED
@@ -5,6 +5,7 @@ import keras_nlp
|
|
5 |
|
6 |
import math
|
7 |
import json
|
|
|
8 |
from transformers import AutoTokenizer
|
9 |
from tokenizers import AddedToken
|
10 |
|
@@ -23,6 +24,23 @@ print("vocab_size:", vocab_size)
|
|
23 |
print("pad token id:", tokenizer.pad_token)
|
24 |
|
25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
# Masked Accuracy Metric
|
27 |
def masked_accuracy(y_true, y_pred, padding_token=tokenizer.pad_token_id):
|
28 |
y_true = tf.cast(y_true, tf.int32)
|
|
|
5 |
|
6 |
import math
|
7 |
import json
|
8 |
+
import spacy
|
9 |
from transformers import AutoTokenizer
|
10 |
from tokenizers import AddedToken
|
11 |
|
|
|
24 |
print("pad token id:", tokenizer.pad_token)
|
25 |
|
26 |
|
27 |
+
# Build `carry_toks`: the set of vocabulary ids whose text spaCy tags as a
# number (NUM) or as a proper noun (PROPN) that starts with an uppercase
# letter.
nlp = spacy.load("en_core_web_lg")
# The whole vocabulary is parsed as a single document, so the pipeline's
# character limit is raised explicitly.
nlp.max_length = 2000000
selected = {'NUM', 'PROPN'}
# (token_text, token_id) pairs ordered by token id.
alltoks = sorted(tokenizer.get_vocab().items(), key=lambda x: x[1])
# One vocabulary entry per line, with the "▁" marker stripped so spaCy sees
# plain text.
all_toks_text = "\n".join([t[0].replace("▁", "") for t in alltoks])
doc = nlp(all_toks_text)
carry_toks = set()
# `i` is a cursor into `alltoks`. spaCy may split one vocabulary entry into
# several tokens, so the cursor only advances when the current spaCy token is
# not a substring of the current entry (e.g. on the newline separator).
i = 0
for token in doc:
    token_text = str(token)
    if token_text not in alltoks[i][0]:
        i += 1
        if i >= len(alltoks):
            # The heuristic alignment ran past the vocabulary; stop instead
            # of raising IndexError on the next lookup.
            break
    vocab_text = alltoks[i][0]
    # NOTE(review): `i > 100` skips the first 101 entries -- presumably
    # special/reserved tokens; confirm against the tokenizer.
    if token_text in vocab_text and token.pos_ in selected and i > 100:
        # Proper nouns are kept only when the entry itself is capitalised;
        # NUM tokens are kept unconditionally.
        if token.pos_ != "PROPN" or vocab_text.replace("▁", "")[0].isupper():
            carry_toks.add(alltoks[i][1])
print(len(carry_toks))
|
42 |
+
|
43 |
+
|
44 |
# Masked Accuracy Metric
|
45 |
def masked_accuracy(y_true, y_pred, padding_token=tokenizer.pad_token_id):
|
46 |
y_true = tf.cast(y_true, tf.int32)
|