eaglelandsonce committed on
Commit
774c14a
·
verified ·
1 Parent(s): 03fa735

Create 15_Plus_Detokenizer.py

Browse files
Files changed (1) hide show
  1. pages/15_Plus_Detokenizer.py +181 -0
pages/15_Plus_Detokenizer.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import streamlit.components.v1 as components
3
+ from transformers import AutoTokenizer, AutoModelForCausalLM
4
+
5
# Page setup: browser-tab title and wide layout, followed by the on-page heading.
st.set_page_config(
    layout="wide",
    page_title="Interactive Base 50256 Grid with Tokenizer",
)
st.title(body="Interactive Base 50256 Grid with GPT-2 Tokenizer/Detokenizer")
10
+
11
# Self-contained HTML/JS page for the clickable coordinate grid.
#
# The canvas maps pointer positions to integer coordinates in
# [0, gridSizeX] x [0, gridSizeY] with the origin at the bottom-left corner.
# A click posts a 'clickedCoordinate' message to the parent window whose value
# is the X and Y coordinates, each zero-padded to 5 digits and concatenated
# (10 digits in total), i.e. the same fixed-width layout the detokenizer reads.
html_content = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Interactive Base 50256 Grid</title>
    <style>
        body {
            font-family: Arial, sans-serif;
            display: flex;
            justify-content: center;
            align-items: center;
            height: 100vh;
            margin: 0;
            background-color: #f0f0f0;
        }
        .container {
            text-align: center;
        }
        #grid {
            max-width: 80vmin;
            max-height: 80vmin;
            border: 1px solid #ccc;
        }
        .output {
            margin-top: 20px;
            font-size: 18px;
            font-weight: bold;
        }
    </style>
</head>
<body>
    <div class="container">
        <canvas id="grid" width="1000" height="1000"></canvas>
        <div id="clickedOutput" class="output">Click on the grid to select a coordinate</div>
        <div id="hoverOutput">Hover Coordinate: (X: 0, Y: 0)</div>
    </div>
    <script>
        const canvas = document.getElementById('grid');
        const ctx = canvas.getContext('2d');
        const clickedOutput = document.getElementById('clickedOutput');
        const hoverOutput = document.getElementById('hoverOutput');

        const gridSizeX = 50255;
        const gridSizeY = 50255;
        const cellSizeX = canvas.width / 16;
        const cellSizeY = canvas.height / 16;

        function drawGrid() {
            ctx.fillStyle = 'white';
            ctx.fillRect(0, 0, canvas.width, canvas.height);

            ctx.strokeStyle = '#ccc';
            ctx.lineWidth = 1;

            // Vertical guide lines.
            for (let i = cellSizeX; i < canvas.width; i += cellSizeX) {
                ctx.beginPath();
                ctx.moveTo(i, 0);
                ctx.lineTo(i, canvas.height);
                ctx.stroke();
            }

            // Horizontal guide lines.
            for (let i = cellSizeY; i < canvas.height; i += cellSizeY) {
                ctx.beginPath();
                ctx.moveTo(0, i);
                ctx.lineTo(canvas.width, i);
                ctx.stroke();
            }

            // Corner labels showing the coordinate range of each axis.
            ctx.fillStyle = 'black';
            ctx.font = '16px Arial';
            ctx.fillText('0,0', 5, canvas.height - 5);
            ctx.fillText(`${gridSizeX},0`, canvas.width - 60, canvas.height - 5);
            ctx.fillText(`0,${gridSizeY}`, 5, 20);
            ctx.fillText(`${gridSizeX},${gridSizeY}`, canvas.width - 100, 20);
        }

        // Convert a pointer offset inside the canvas to a fraction in [0, 1].
        // Dividing by (extent - 1) lets the last pixel reach exactly 1, and the
        // clamp guards against sub-pixel offsets that fall just outside the
        // canvas rectangle (which would otherwise give negative coordinates).
        function toFraction(offset, extent) {
            const span = Math.max(extent - 1, 1);
            return Math.min(Math.max(offset / span, 0), 1);
        }

        // Map a mouse event to grid coordinates; Y is flipped so that the
        // origin sits at the bottom-left corner, matching the corner labels.
        function getCoordinates(event) {
            const rect = canvas.getBoundingClientRect();
            const fx = toFraction(event.clientX - rect.left, rect.width);
            const fy = toFraction(event.clientY - rect.top, rect.height);
            const x = Math.round(fx * gridSizeX);
            const y = Math.round((1 - fy) * gridSizeY);
            return { x, y };
        }

        canvas.addEventListener('mousemove', (event) => {
            const { x, y } = getCoordinates(event);
            hoverOutput.textContent = `Hover Coordinate: (X: ${x}, Y: ${y})`;
        });

        canvas.addEventListener('click', (event) => {
            const { x, y } = getCoordinates(event);
            // Two 5-digit, zero-padded fields: XXXXXYYYYY.
            const combined = String(x).padStart(5, '0') + String(y).padStart(5, '0');
            clickedOutput.textContent = `Clicked Coordinate: ${combined}`;
            window.parent.postMessage({type: 'clickedCoordinate', value: combined}, '*');
        });

        canvas.addEventListener('mouseleave', () => {
            hoverOutput.textContent = 'Hover Coordinate: (X: 0, Y: 0)';
        });

        drawGrid();
    </script>
</body>
</html>
"""
118
+
119
# Embed the grid page in an iframe on the Streamlit page.
components.html(html_content, height=700, scrolling=True)


@st.cache_resource
def _load_tokenizer(name="gpt2"):
    """Return the tokenizer for *name*, created once and reused.

    Streamlit re-executes this script on every widget interaction; caching
    the tokenizer avoids re-reading its vocabulary files on each rerun.
    """
    return AutoTokenizer.from_pretrained(name)


# Load the tokenizer
tokenizer = _load_tokenizer()
124
+
125
# Tokenization section: heading plus the free-text box holding the sentence
# that the "Tokenize" button encodes.
st.header(body="Tokenization")
sentence = st.text_input(
    label="Enter a sentence to tokenize:",
    value="cr8 lg cnvs html js hlds 9 wbs",
)
128
+
129
def format_token_ids(token_ids, width=5):
    """Concatenate token IDs into one string of fixed-width decimal fields.

    Each ID is rendered in decimal and left-padded with zeros to *width*
    characters, so the result can later be cut back into IDs by slicing it
    every *width* characters.

    Args:
        token_ids: Iterable of non-negative integer token IDs.
        width: Number of characters per field (default 5).

    Returns:
        The concatenated string; empty when *token_ids* is empty.

    Raises:
        ValueError: If an ID is not a non-negative decimal integer or needs
            more than *width* digits. Such an ID would break the fixed-width
            layout and make the string impossible to split back correctly.
    """
    fields = []
    for token_id in token_ids:
        digits = str(token_id)
        if not (digits.isascii() and digits.isdigit()) or len(digits) > width:
            raise ValueError(
                f"token ID {token_id!r} does not fit in a {width}-digit field"
            )
        fields.append(digits.zfill(width))
    return "".join(fields)
132
+
133
# Encode the sentence and show its token IDs as one fixed-width digit string.
if st.button("Tokenize"):
    # encode() already returns a plain list of ints, so there is no need to
    # build a PyTorch tensor only to convert it straight back to a list.
    token_ids_list = tokenizer.encode(sentence)
    formatted_token_ids = format_token_ids(token_ids_list)
    st.write("Tokenized input IDs (formatted):")
    st.write(formatted_token_ids)
139
+
140
# Detokenization section: heading plus the text box for the concatenated IDs.
# The label text is also used by the page's JavaScript bridge to locate this
# input (via its aria-label), so it must stay exactly as written.
st.header(body="Detokenization")
token_ids = st.text_input(
    label="Enter token IDs (concatenated without spaces):",
    value="",
)
143
+
144
def split_token_ids(concatenated_ids, length=5):
    """Cut a string into consecutive groups of *length* characters.

    Args:
        concatenated_ids: String of fixed-width token ID fields.
        length: Width of each field (default 5); must be positive.

    Returns:
        List of substrings in order. The final group is shorter than
        *length* when the input length is not a multiple of it; an empty
        input yields an empty list.

    Raises:
        ValueError: If *length* is not positive. A negative step would
            otherwise silently produce an empty list and drop all input.
    """
    if length <= 0:
        raise ValueError("length must be a positive integer")
    return [
        concatenated_ids[start:start + length]
        for start in range(0, len(concatenated_ids), length)
    ]
146
+
147
def remove_leading_zeros(grouped_ids):
    """Strip zero padding from each ID string, keeping a lone "0" for zero.

    Args:
        grouped_ids: Iterable of zero-padded ID strings.

    Returns:
        List of the same strings without leading zeros. A group made only of
        zeros becomes "0" rather than an empty string, so token ID 0 is
        preserved; an empty group stays empty.
    """
    cleaned = []
    for group in grouped_ids:
        stripped = group.lstrip("0")
        if not stripped and group:
            # The whole group was zeros: that is the number 0, not "nothing".
            stripped = "0"
        cleaned.append(stripped)
    return cleaned
149
+
150
# Decode the concatenated fixed-width IDs back into text.
if st.button("Detokenize"):
    # Whitespace (e.g. from a copy/paste) is not part of any ID field.
    raw_ids = "".join(token_ids.split())

    if not raw_ids:
        st.warning("Enter at least one token ID to detokenize.")
    elif not (raw_ids.isascii() and raw_ids.isdigit()):
        # Previously non-digit groups were dropped silently, which decoded a
        # different sequence than the one the user typed.
        st.error("Token IDs must contain digits only.")
    else:
        split_ids = split_token_ids(raw_ids)
        cleaned_ids = remove_leading_zeros(split_ids)
        cleaned_token_ids_str = ' '.join(cleaned_ids)
        # Parse the padded groups directly: int("00000") == 0, so token ID 0
        # is kept instead of being filtered out as an empty string.
        token_id_list = [int(group) for group in split_ids]

        st.write("Grouped and cleaned token IDs:")
        st.write(cleaned_token_ids_str)

        # IDs at or beyond the vocabulary size have no token to decode to.
        vocab_size = len(tokenizer)
        unknown_ids = [tid for tid in token_id_list if tid >= vocab_size]
        if unknown_ids:
            st.error(
                "Token IDs outside the vocabulary "
                f"(0-{vocab_size - 1}): {', '.join(map(str, unknown_ids))}"
            )
        else:
            detokenized_sentence = tokenizer.decode(token_id_list)
            st.write("Detokenized sentence:")
            st.write(detokenized_sentence)
160
+
161
@st.cache_resource
def _load_gpt2_model(name="gpt2"):
    """Return the causal language model for *name*, loaded once and reused.

    Streamlit re-executes this script on every widget interaction; without
    caching the model weights would be loaded again on each rerun.
    """
    return AutoModelForCausalLM.from_pretrained(name)


# Display help for the GPT-2 model
if st.checkbox("Show GPT-2 Model Help"):
    # The model is only needed for this help view, so load it lazily here
    # instead of unconditionally at script start.
    gpt2 = _load_gpt2_model()
    st.write("Help GPT2")
    st.help(gpt2)
168
+
169
# JavaScript bridge: forwards a grid click into the detokenizer widgets.
#
# The grid iframe posts its message to window.parent (the Streamlit page), so
# the listener must be registered on that window and must search that window's
# document; listening on this helper iframe's own window never fires.
# NOTE(review): this depends on the component iframe being same-origin with
# the page and on Streamlit's rendered DOM (aria-label on the input, button
# text) - verify against the deployed Streamlit version.
components.html(
    """
<script>
(function () {
    const host = window.parent;
    const INPUT_LABEL = 'Enter token IDs (concatenated without spaces):';
    const BUTTON_TEXT = 'Detokenize';

    function onMessage(event) {
        const data = event.data;
        if (!data || data.type !== 'clickedCoordinate') {
            return;
        }
        // Only accept the digit string the grid is expected to send.
        const value = String(data.value);
        if (!/^[0-9]+$/.test(value)) {
            return;
        }

        const doc = host.document;
        const input = doc.querySelector('input[aria-label="' + INPUT_LABEL + '"]');
        if (!input) {
            return;
        }

        // The input is controlled by React, which ignores a plain
        // `.value = ...` write; use the native setter and fire an input event.
        const nativeSetter = Object.getOwnPropertyDescriptor(
            host.HTMLInputElement.prototype, 'value').set;
        nativeSetter.call(input, value);
        input.dispatchEvent(new host.Event('input', { bubbles: true }));
        // Blurring presumably commits the pending value to Streamlit.
        input.focus();
        input.blur();

        // Pick the button by its caption: a generic button[kind="secondary"]
        // selector would match the first such button (Tokenize) instead.
        const button = Array.from(doc.querySelectorAll('button')).find(
            (candidate) => candidate.textContent.trim() === BUTTON_TEXT);
        if (button) {
            // Give the value change a moment to propagate before clicking.
            host.setTimeout(() => button.click(), 100);
        }
    }

    // Replace any listener left by an earlier render so handlers do not pile up.
    if (host.__tokenGridBridge) {
        host.removeEventListener('message', host.__tokenGridBridge, false);
    }
    host.__tokenGridBridge = onMessage;
    host.addEventListener('message', onMessage, false);
})();
</script>
""",
    height=0,
)