reichenbach committed on
Commit 1c58e99 · 1 Parent(s): 2cb298b

Upload app.py

Files changed (1)
  1. app.py +135 -0
app.py ADDED
@@ -0,0 +1,135 @@
import os

# TensorFlow is installed at runtime rather than pinned in requirements.txt.
os.system('pip install tensorflow')

import numpy as np
import gradio as gr
from tensorflow import keras
from huggingface_hub.keras_mixin import from_pretrained_keras

num_samples = 10000
data_path = 'fra.txt'  # tab-separated English-French sentence pairs

input_texts = []
target_texts = []
input_characters = set()
target_characters = set()

# Rebuild the character vocabularies exactly as at training time, so the
# token indices line up with the pretrained model's weights.
with open(data_path, "r", encoding="utf-8") as f:
    lines = f.read().split("\n")
for line in lines[: min(num_samples, len(lines) - 1)]:
    input_text, target_text, _ = line.split("\t")
    # We use "tab" as the "start sequence" character for the targets,
    # and "\n" as the "end sequence" character.
    target_text = "\t" + target_text + "\n"
    input_texts.append(input_text)
    target_texts.append(target_text)
    for char in input_text:
        input_characters.add(char)
    for char in target_text:
        target_characters.add(char)

input_characters = sorted(input_characters)
target_characters = sorted(target_characters)

input_token_index = dict((char, i) for i, char in enumerate(input_characters))
target_token_index = dict((char, i) for i, char in enumerate(target_characters))

num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
max_encoder_seq_length = max(len(txt) for txt in input_texts)
max_decoder_seq_length = max(len(txt) for txt in target_texts)

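# The model is character-level: the encoder consumes one-hot tensors of shape
# (batch, max_encoder_seq_length, num_encoder_tokens); the decoder emits a
# distribution over num_decoder_tokens characters at each step.
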
model = from_pretrained_keras('keras-io/cl_s2s')
latent_dim = 256  # dimensionality of the LSTM hidden state

# Encoder inference model: maps an input sequence to the final LSTM states.
encoder_inputs = model.input[0]  # input_1
encoder_outputs, state_h_enc, state_c_enc = model.layers[2].output  # lstm_1
encoder_states = [state_h_enc, state_c_enc]
encoder_model = keras.Model(encoder_inputs, encoder_states)

# Decoder inference model: consumes one target token plus the previous
# states and returns the next-token distribution plus updated states.
decoder_inputs = model.input[1]  # input_2
decoder_state_input_h = keras.Input(shape=(latent_dim,))
decoder_state_input_c = keras.Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_lstm = model.layers[3]
decoder_outputs, state_h_dec, state_c_dec = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs
)
decoder_states = [state_h_dec, state_c_dec]
decoder_dense = model.layers[4]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = keras.Model(
    [decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states
)

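# The training graph runs encoder and decoder jointly with teacher forcing;
# for inference they are split so the decoder can be stepped one character
# at a time, feeding each prediction (and the LSTM states) back in.
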
# Reverse-lookup token indices to decode sequences back to readable text.
reverse_input_char_index = dict((i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict((i, char) for char, i in target_token_index.items())


def decode_sequence(input_seq):
    # One-hot encode the input text as a batch of one. Over-long inputs are
    # truncated and characters outside the training vocabulary are skipped,
    # so arbitrary user input cannot raise an IndexError or KeyError.
    infer_input_data = np.zeros(
        (1, max_encoder_seq_length, num_encoder_tokens), dtype="float32"
    )
    n = min(len(input_seq), max_encoder_seq_length)
    for t, char in enumerate(input_seq[:n]):
        if char in input_token_index:
            infer_input_data[0, t, input_token_index[char]] = 1.0
    infer_input_data[0, n:, input_token_index[" "]] = 1.0  # pad with spaces

    # Encode the input as state vectors.
    states_value = encoder_model.predict(infer_input_data)

    # Generate an empty target sequence of length 1 and populate it with
    # the start-of-sequence character, "\t".
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, target_token_index["\t"]] = 1.0

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ""
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Greedily sample the most likely next character.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Exit condition: either hit max length or find the stop character.
        if sampled_char == "\n" or len(decoded_sentence) > max_decoder_seq_length:
            stop_condition = True

        # Update the target sequence (of length 1) with the sampled character.
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.0

        # Carry the LSTM states forward to the next step.
        states_value = [h, c]

    return decoded_sentence

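# Example: decode_sequence("Hug me.") runs greedy decoding and returns the
# model's French prediction, ending with "\n" or at max_decoder_seq_length.
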
input_1 = gr.inputs.Textbox(lines=2)
output_1 = gr.outputs.Textbox()

iface = gr.Interface(decode_sequence,
                     inputs=input_1,
                     outputs=output_1,  # decode_sequence returns plain text
                     examples=[["Be kind."],
                               ["Hug me."]],
                     title="Character Level Recurrent Seq2Seq Model",
                     article='Author: <a href="https://huggingface.co/reichenbach">Rishav Chandra Varma</a>')

iface.launch(debug=True)
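
The app assumes a fra.txt file of tab-separated English-French sentence pairs in the working directory; the file is not part of this commit. The keras.io character-level seq2seq example that this Space is based on downloads it from the manythings.org Anki corpus, so a minimal local-setup sketch might look like this (the URL and User-Agent header are assumptions taken from that example, not from this commit):

# Hypothetical setup: fetch the sentence pairs app.py expects.
import io
import urllib.request
import zipfile

url = "http://www.manythings.org/anki/fra-eng.zip"  # assumed source
req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
with urllib.request.urlopen(req) as resp:
    zipfile.ZipFile(io.BytesIO(resp.read())).extract("fra.txt")

With fra.txt in place, running "python app.py" starts the Gradio demo locally.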