ierhon committed
Commit 23f7f3b · 1 Parent(s): 6d37d8c

Use new dataset.json

Files changed (1)
  1. train.py +6 -4
train.py CHANGED
@@ -8,10 +8,12 @@ from keras_self_attention import SeqSelfAttention, SeqWeightedAttention
 from model_settings import *
 
 
-with open("dataset.json", "r") as f: # TODO: move the outputs into a separate file, so it would be "key": 0, "key2": 1 etc
+with open("dataset.json", "r") as f:
     dset = json.load(f)
 
-dset_size = len(dset)
+with open("responses.txt", "r") as f:
+    dset_size = len(f.readlines())
+
 tokenizer = Tokenizer() # a tokenizer is a thing to split text into words, it might have some other stuff like making all the letters lowercase, etc.
 tokenizer.fit_on_texts(list(dset.keys()))
 
@@ -31,11 +33,11 @@ model.add(Dense(dset_size, activation="linear")) # TBH it doesn't matter that mu
 X = [] # we're loading the training data into input X
 y = [] # and output y
 
-for line, key in enumerate(dset):
+for key in dset:
     tokens = tokenizer.texts_to_sequences([key,])[0]
     X.append(np.array((list(tokens)+[0,]*inp_len)[:inp_len])) # refusing to use pad_sequences for an unspecified reason and creating the worst line of code
     output_array = np.zeros(dset_size)
-    output_array[line] = 1 # 0 0 0 1 0 0 0 0 0, the neuron of each line activates for the correct response
+    output_array[dset[key]] = 1 # 0 0 0 1 0 0 0 0 0, the neuron of each line activates for the correct response
     y.append(output_array)
 
 X = np.array(X) # normal lists are way slower than numpy arrays (remember, a list and an array is not the same thing, an array is far more limited)
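
For readers of this commit: the new code treats each value in dataset.json as an index into responses.txt, so dset_size is now the number of response lines rather than the number of input phrases. Below is a minimal sketch of that assumed layout; the file names come from the diff, but the sample phrases and responses are invented.

# Hypothetical example data (assumed layout, contents made up):
#   dataset.json   {"hello": 0, "hi": 0, "how are you": 1}
#                  maps an input phrase to the index of its response line
#   responses.txt  Hi there!
#                  I'm fine, thanks.

import json
import numpy as np

with open("dataset.json", "r") as f:
    dset = json.load(f)                # phrase -> response index

with open("responses.txt", "r") as f:
    dset_size = len(f.readlines())     # one response per line

y = []
for key in dset:
    one_hot = np.zeros(dset_size)      # one output neuron per response line
    one_hot[dset[key]] = 1             # activate the neuron for this phrase's response
    y.append(one_hot)
y = np.array(y)

With several phrases mapping to the same index (as "hello" and "hi" do above), multiple inputs can share one response, which the old enumerate(dset) indexing could not express.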
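
The X.append line keeps the manual pad-and-truncate that its own comment calls "the worst line of code". For reference, here is a sketch of the same padding done with Keras' pad_sequences; it assumes the tokenizer, dset and inp_len names from train.py and is not part of this commit.

import json
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

inp_len = 10  # assumed value; train.py takes this from model_settings

with open("dataset.json", "r") as f:
    dset = json.load(f)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(list(dset.keys()))

sequences = tokenizer.texts_to_sequences(list(dset.keys()))
# padding and truncating at the end reproduces (list(tokens)+[0,]*inp_len)[:inp_len]
X = pad_sequences(sequences, maxlen=inp_len, padding="post", truncating="post")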