ierhon committed on
Commit
d12bf65
·
1 Parent(s): d3cb051

sleepy hon forgot how to use a tokenizer

Browse files
Files changed (1) hide show
  1. test.py +5 -1
test.py CHANGED
@@ -7,13 +7,17 @@ from vecs import *
7
  with open("dataset.json", "r") as f:
8
  dset = json.load(f)
9
 
 
 
 
10
  model = load_model("chatbot.keras", custom_objects={"SeqSelfAttention": SeqSelfAttention})
11
 
12
  def find_line_number(array):
13
  return sorted(zip(list(array), [x for x in range(len(array))]), key=lambda x:x[0], reverse=True)[0][1] # yeah, one big line, find the biggest value and return the number of the line
14
 
15
  def generate(text):
16
-
 
17
 
18
  if __name__ == "__main__": # if this code is not being imported, open the chat
19
  while True:
 
7
  with open("dataset.json", "r") as f:
8
  dset = json.load(f)
9
 
10
+ tokenizer = Tokenizer() # a tokenizer is a thing to split text into words, it might have some other stuff like making all the letters lowercase, etc.
11
+ tokenizer.fit_on_texts(list(dset.keys()))
12
+
13
  model = load_model("chatbot.keras", custom_objects={"SeqSelfAttention": SeqSelfAttention})
14
 
15
  def find_line_number(array):
16
  return sorted(zip(list(array), [x for x in range(len(array))]), key=lambda x:x[0], reverse=True)[0][1] # yeah, one big line, find the biggest value and return the number of the line
17
 
18
  def generate(text):
19
+ tokens = list(tokenizer.texts_to_sequences([text,])[0]) # text into tokens (almost words)
20
+ tokens =
21
 
22
  if __name__ == "__main__": # if this code is not being imported, open the chat
23
  while True: