Update pages/15_Plus_Detokenizer.py
pages/15_Plus_Detokenizer.py  (+15 -13)
@@ -122,20 +122,7 @@ components.html(html_content, height=700, scrolling=True)
 # Load the tokenizer
 tokenizer = AutoTokenizer.from_pretrained('gpt2')
 
-# Tokenization section
-st.header("Tokenization")
-sentence = st.text_input("Enter a sentence to tokenize:", "cr8 lg cnvs html js hlds 9 wbs")
 
-def format_token_ids(token_ids):
-    formatted_ids = [str(token_id).zfill(5) for token_id in token_ids]
-    return ''.join(formatted_ids)
-
-if st.button("Tokenize"):
-    input_ids = tokenizer(sentence, return_tensors='pt').input_ids
-    token_ids_list = input_ids[0].tolist()
-    formatted_token_ids = format_token_ids(token_ids_list)
-    st.write("Tokenized input IDs (formatted):")
-    st.write(formatted_token_ids)
 
 # Detokenization section
 st.header("Detokenization")
@@ -158,6 +145,21 @@ if st.button("Detokenize"):
     st.write("Detokenized sentence:")
     st.write(detokenized_sentence)
 
+# Tokenization section
+st.header("Tokenization")
+sentence = st.text_input("Enter a sentence to tokenize:", "cr8 lg")
+
+def format_token_ids(token_ids):
+    formatted_ids = [str(token_id).zfill(5) for token_id in token_ids]
+    return ''.join(formatted_ids)
+
+if st.button("Tokenize"):
+    input_ids = tokenizer(sentence, return_tensors='pt').input_ids
+    token_ids_list = input_ids[0].tolist()
+    formatted_token_ids = format_token_ids(token_ids_list)
+    st.write("Tokenized input IDs (formatted):")
+    st.write(formatted_token_ids)
+
 # Load the model
 gpt2 = AutoModelForCausalLM.from_pretrained('gpt2')
 
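For context, a minimal standalone sketch of what the relocated Tokenization block does: it encodes the input with the gpt2 tokenizer, zero-pads each token ID to five digits, and concatenates them into one fixed-width string. The final decode call is illustrative only and is not part of this diff; the page's actual Detokenization handler sits outside the hunks shown above.

# Sketch of the tokenize-and-format flow from the diff above.
# Assumes the 'gpt2' tokenizer from Hugging Face transformers; the decode
# call is only an illustration of reversing the IDs, not the app's code.
from transformers import AutoTokenizer

def format_token_ids(token_ids):
    # Pad every ID to 5 digits and join, so each token occupies
    # exactly five characters of the output string.
    formatted_ids = [str(token_id).zfill(5) for token_id in token_ids]
    return ''.join(formatted_ids)

tokenizer = AutoTokenizer.from_pretrained('gpt2')
token_ids = tokenizer("cr8 lg").input_ids      # plain list of ints
print(format_token_ids(token_ids))             # one 5-digit block per token
print(tokenizer.decode(token_ids))             # decodes the IDs back to text

The fixed width matters for the round trip: GPT-2 token IDs never exceed five digits, so the concatenated string can be split back into individual IDs in chunks of five characters before decoding.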