eaglelandsonce committed on
Commit
4c13885
·
verified ·
1 Parent(s): 774c14a

Update pages/15_Plus_Detokenizer.py

Browse files
Files changed (1) hide show
  1. pages/15_Plus_Detokenizer.py +15 -13
pages/15_Plus_Detokenizer.py CHANGED
@@ -122,20 +122,7 @@ components.html(html_content, height=700, scrolling=True)
122
  # Load the tokenizer
123
  tokenizer = AutoTokenizer.from_pretrained('gpt2')
124
 
125
- # Tokenization section
126
- st.header("Tokenization")
127
- sentence = st.text_input("Enter a sentence to tokenize:", "cr8 lg cnvs html js hlds 9 wbs")
128
 
129
- def format_token_ids(token_ids):
130
- formatted_ids = [str(token_id).zfill(5) for token_id in token_ids]
131
- return ''.join(formatted_ids)
132
-
133
- if st.button("Tokenize"):
134
- input_ids = tokenizer(sentence, return_tensors='pt').input_ids
135
- token_ids_list = input_ids[0].tolist()
136
- formatted_token_ids = format_token_ids(token_ids_list)
137
- st.write("Tokenized input IDs (formatted):")
138
- st.write(formatted_token_ids)
139
 
140
  # Detokenization section
141
  st.header("Detokenization")
@@ -158,6 +145,21 @@ if st.button("Detokenize"):
158
  st.write("Detokenized sentence:")
159
  st.write(detokenized_sentence)
160
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
  # Load the model
162
  gpt2 = AutoModelForCausalLM.from_pretrained('gpt2')
163
 
 
122
  # Load the tokenizer
123
  tokenizer = AutoTokenizer.from_pretrained('gpt2')
124
 
 
 
 
125
 
 
 
 
 
 
 
 
 
 
 
126
 
127
  # Detokenization section
128
  st.header("Detokenization")
 
145
  st.write("Detokenized sentence:")
146
  st.write(detokenized_sentence)
147
 
148
+ # Tokenization section
149
+ st.header("Tokenization")
150
+ sentence = st.text_input("Enter a sentence to tokenize:", "cr8 lg")
151
+
152
+ def format_token_ids(token_ids):
153
+ formatted_ids = [str(token_id).zfill(5) for token_id in token_ids]
154
+ return ''.join(formatted_ids)
155
+
156
+ if st.button("Tokenize"):
157
+ input_ids = tokenizer(sentence, return_tensors='pt').input_ids
158
+ token_ids_list = input_ids[0].tolist()
159
+ formatted_token_ids = format_token_ids(token_ids_list)
160
+ st.write("Tokenized input IDs (formatted):")
161
+ st.write(formatted_token_ids)
162
+
163
  # Load the model
164
  gpt2 = AutoModelForCausalLM.from_pretrained('gpt2')
165