pedrocas15 committed on
Commit
02480fa
·
verified ·
1 Parent(s): bce4284

Update rpc.py

Browse files
Files changed (1) hide show
  1. rpc.py +18 -0
rpc.py CHANGED
@@ -5,6 +5,7 @@ import keras_nlp
5
 
6
  import math
7
  import json
 
8
  from transformers import AutoTokenizer
9
  from tokenizers import AddedToken
10
 
@@ -23,6 +24,23 @@ print("vocab_size:", vocab_size)
23
  print("pad token id:", tokenizer.pad_token)
24
 
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  # Masked Accuracy Metric
27
  def masked_accuracy(y_true, y_pred, padding_token=tokenizer.pad_token_id):
28
  y_true = tf.cast(y_true, tf.int32)
 
5
 
6
  import math
7
  import json
8
+ import spacy
9
  from transformers import AutoTokenizer
10
  from tokenizers import AddedToken
11
 
 
24
  print("pad token id:", tokenizer.pad_token)
25
 
26
 
27
# Collect the ids of vocabulary entries that spaCy tags as numerals or proper
# nouns. The whole vocabulary is tagged in one pass: every entry (with the
# "▁" word-boundary marker removed) becomes one line of a single document.
nlp = spacy.load("en_core_web_lg")
# Raise spaCy's input-length cap so the joined vocabulary fits in one document.
nlp.max_length = 2000000
selected = {'NUM', 'PROPN'}

# (piece, id) pairs ordered by id.
alltoks = sorted(tokenizer.get_vocab().items(), key=lambda x: x[1])
stripped_toks = [piece.replace("▁", "") for piece, _ in alltoks]
all_toks_text = "\n".join(stripped_toks)
doc = nlp(all_toks_text)

# line_starts[k] is the character offset at which entry k begins inside
# all_toks_text. Computed from the piece lengths (not by re-splitting the
# text) so an entry that itself contains a newline cannot shift the mapping.
line_starts = []
offset = 0
for text in stripped_toks:
    line_starts.append(offset)
    offset += len(text) + 1  # +1 for the "\n" separator

carry_toks = set()
i = 0
for token in doc:
    # Advance the vocabulary cursor to the entry containing this token's
    # character offset. Tokens arrive in document order, so the cursor only
    # moves forward and is bounded by the vocabulary size (the previous
    # substring-based advance could drift and index past the end).
    while i + 1 < len(line_starts) and line_starts[i + 1] <= token.idx:
        i += 1
    # The first 101 entries are skipped, as in the original condition
    # `i > 100` -- presumably special/reserved tokens; TODO confirm.
    if i <= 100 or token.pos_ not in selected:
        continue
    piece, tok_id = alltoks[i]
    # Sanity check kept from the original: the spaCy token must literally
    # occur inside the raw vocabulary piece.
    if str(token) not in piece:
        continue
    # Proper nouns only count when the entry starts with an uppercase letter;
    # the [:1] slice keeps this safe for an empty entry.
    if token.pos_ != "PROPN" or stripped_toks[i][:1].isupper():
        carry_toks.add(tok_id)
print(len(carry_toks))
42
+
43
+
44
  # Masked Accuracy Metric
45
  def masked_accuracy(y_true, y_pred, padding_token=tokenizer.pad_token_id):
46
  y_true = tf.cast(y_true, tf.int32)