Create 15_Plus_Detokenizer.py
Browse files- pages/15_Plus_Detokenizer.py +181 -0
pages/15_Plus_Detokenizer.py
ADDED
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import streamlit.components.v1 as components
|
3 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
4 |
+
|
5 |
+
# Set page configuration
|
6 |
+
st.set_page_config(page_title="Interactive Base 50256 Grid with Tokenizer", layout="wide")
|
7 |
+
|
8 |
+
# Title
|
9 |
+
st.title("Interactive Base 50256 Grid with GPT-2 Tokenizer/Detokenizer")
|
10 |
+
|
11 |
+
# HTML/JS for the interactive coordinate grid (embedded below via components.html)
|
12 |
+
html_content = """
|
13 |
+
<!DOCTYPE html>
|
14 |
+
<html lang="en">
|
15 |
+
<head>
|
16 |
+
<meta charset="UTF-8">
|
17 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
18 |
+
<title>Interactive Base 50256 Grid</title>
|
19 |
+
<style>
|
20 |
+
body {
|
21 |
+
font-family: Arial, sans-serif;
|
22 |
+
display: flex;
|
23 |
+
justify-content: center;
|
24 |
+
align-items: center;
|
25 |
+
height: 100vh;
|
26 |
+
margin: 0;
|
27 |
+
background-color: #f0f0f0;
|
28 |
+
}
|
29 |
+
.container {
|
30 |
+
text-align: center;
|
31 |
+
}
|
32 |
+
#grid {
|
33 |
+
max-width: 80vmin;
|
34 |
+
max-height: 80vmin;
|
35 |
+
border: 1px solid #ccc;
|
36 |
+
}
|
37 |
+
.output {
|
38 |
+
margin-top: 20px;
|
39 |
+
font-size: 18px;
|
40 |
+
font-weight: bold;
|
41 |
+
}
|
42 |
+
</style>
|
43 |
+
</head>
|
44 |
+
<body>
|
45 |
+
<div class="container">
|
46 |
+
<canvas id="grid" width="1000" height="1000"></canvas>
|
47 |
+
<div id="clickedOutput" class="output">Click on the grid to select a coordinate</div>
|
48 |
+
<div id="hoverOutput">Hover Coordinate: (X: 0, Y: 0)</div>
|
49 |
+
</div>
|
50 |
+
<script>
|
51 |
+
const canvas = document.getElementById('grid');
|
52 |
+
const ctx = canvas.getContext('2d');
|
53 |
+
const clickedOutput = document.getElementById('clickedOutput');
|
54 |
+
const hoverOutput = document.getElementById('hoverOutput');
|
55 |
+
|
56 |
+
const gridSizeX = 50255;
|
57 |
+
const gridSizeY = 50255;
|
58 |
+
const cellSizeX = canvas.width / 16;
|
59 |
+
const cellSizeY = canvas.height / 16;
|
60 |
+
|
61 |
+
function drawGrid() {
|
62 |
+
ctx.fillStyle = 'white';
|
63 |
+
ctx.fillRect(0, 0, canvas.width, canvas.height);
|
64 |
+
|
65 |
+
ctx.strokeStyle = '#ccc';
|
66 |
+
ctx.lineWidth = 1;
|
67 |
+
|
68 |
+
for (let i = cellSizeX; i < canvas.width; i += cellSizeX) {
|
69 |
+
ctx.beginPath();
|
70 |
+
ctx.moveTo(i, 0);
|
71 |
+
ctx.lineTo(i, canvas.height);
|
72 |
+
ctx.stroke();
|
73 |
+
}
|
74 |
+
|
75 |
+
for (let i = cellSizeY; i < canvas.height; i += cellSizeY) {
|
76 |
+
ctx.beginPath();
|
77 |
+
ctx.moveTo(0, i);
|
78 |
+
ctx.lineTo(canvas.width, i);
|
79 |
+
ctx.stroke();
|
80 |
+
}
|
81 |
+
|
82 |
+
ctx.fillStyle = 'black';
|
83 |
+
ctx.font = '16px Arial';
|
84 |
+
ctx.fillText('0,0', 5, canvas.height - 5);
|
85 |
+
ctx.fillText(`${gridSizeX},0`, canvas.width - 60, canvas.height - 5);
|
86 |
+
ctx.fillText(`0,${gridSizeY}`, 5, 20);
|
87 |
+
ctx.fillText(`${gridSizeX},${gridSizeY}`, canvas.width - 100, 20);
|
88 |
+
}
|
89 |
+
|
90 |
+
function getCoordinates(event) {
|
91 |
+
const rect = canvas.getBoundingClientRect();
|
92 |
+
const x = Math.min(Math.floor((event.clientX - rect.left) / rect.width * gridSizeX), gridSizeX);
|
93 |
+
const y = Math.min(gridSizeY - Math.floor((event.clientY - rect.top) / rect.height * gridSizeY), gridSizeY);
|
94 |
+
return { x, y };
|
95 |
+
}
|
96 |
+
|
97 |
+
canvas.addEventListener('mousemove', (event) => {
|
98 |
+
const { x, y } = getCoordinates(event);
|
99 |
+
hoverOutput.textContent = `Hover Coordinate: (X: ${x}, Y: ${y})`;
|
100 |
+
});
|
101 |
+
|
102 |
+
canvas.addEventListener('click', (event) => {
|
103 |
+
const { x, y } = getCoordinates(event);
|
104 |
+
const combinedCoord = x * 100000 + y;
|
105 |
+
clickedOutput.textContent = `Clicked Coordinate: ${combinedCoord.toString().padStart(10, '0')}`;
|
106 |
+
window.parent.postMessage({type: 'clickedCoordinate', value: combinedCoord.toString().padStart(10, '0')}, '*');
|
107 |
+
});
|
108 |
+
|
109 |
+
canvas.addEventListener('mouseleave', () => {
|
110 |
+
hoverOutput.textContent = 'Hover Coordinate: (X: 0, Y: 0)';
|
111 |
+
});
|
112 |
+
|
113 |
+
drawGrid();
|
114 |
+
</script>
|
115 |
+
</body>
|
116 |
+
</html>
|
117 |
+
"""
|
118 |
+
|
119 |
+
# Embed the HTML content
|
120 |
+
components.html(html_content, height=700, scrolling=True)
|
121 |
+
|
122 |
+
# Load the tokenizer
|
123 |
+
@st.cache_resource
def _load_gpt2_tokenizer():
    """Return the GPT-2 tokenizer, constructing it only once.

    Streamlit re-executes this script on every widget interaction; caching
    the tokenizer as a resource avoids rebuilding it on each rerun.
    """
    return AutoTokenizer.from_pretrained('gpt2')


tokenizer = _load_gpt2_tokenizer()
|
124 |
+
|
125 |
+
# Tokenization section
|
126 |
+
st.header("Tokenization")
|
127 |
+
sentence = st.text_input("Enter a sentence to tokenize:", "cr8 lg cnvs html js hlds 9 wbs")
|
128 |
+
|
129 |
+
def format_token_ids(token_ids, width=5):
    """Concatenate token IDs into one string of zero-padded, fixed-width fields.

    Args:
        token_ids: Iterable of integer token IDs.
        width: Minimum number of characters per ID; shorter IDs are
            left-padded with zeros. Defaults to 5, the same default used
            by ``split_token_ids`` for its group length.

    Returns:
        The padded IDs joined with no separator, or an empty string when
        ``token_ids`` is empty. IDs with more than ``width`` digits are
        emitted unpadded (not truncated).
    """
    return ''.join(str(token_id).zfill(width) for token_id in token_ids)
|
132 |
+
|
133 |
+
# Runs only on the rerun triggered by pressing the "Tokenize" button.
if st.button("Tokenize"):
    encoded = tokenizer(sentence, return_tensors='pt')
    # input_ids carries a batch dimension; take the single row as a plain list.
    sentence_ids = encoded.input_ids[0].tolist()
    st.write("Tokenized input IDs (formatted):")
    st.write(format_token_ids(sentence_ids))
|
139 |
+
|
140 |
+
# Detokenization section
|
141 |
+
st.header("Detokenization")
|
142 |
+
token_ids = st.text_input("Enter token IDs (concatenated without spaces):", "")
|
143 |
+
|
144 |
+
def split_token_ids(concatenated_ids, length=5):
    """Split a concatenated ID string into consecutive fixed-length groups.

    Args:
        concatenated_ids: String of token IDs written back to back.
        length: Number of characters per group (default 5).

    Returns:
        List of substrings in order. The final group is shorter than
        ``length`` when the input length is not a multiple of it; an empty
        input yields an empty list.

    Raises:
        ValueError: If ``length`` is less than 1.
    """
    # A non-positive step would either make range() fail with an unrelated
    # message (0) or silently yield no groups at all (negative).
    if length < 1:
        raise ValueError(f"length must be a positive integer, got {length!r}")
    return [
        concatenated_ids[start:start + length]
        for start in range(0, len(concatenated_ids), length)
    ]
|
146 |
+
|
147 |
+
def remove_leading_zeros(grouped_ids):
    """Strip the zero padding from each ID group.

    Args:
        grouped_ids: Iterable of ID strings, possibly left-padded with zeros.

    Returns:
        List of the same strings without leading zeros. A group made up
        entirely of zeros becomes ``"0"`` rather than an empty string, so
        token ID 0 survives; an empty group stays empty.
    """
    # lstrip('0') turns an all-zero group into ""; fall back to the group's
    # last character ("0" for all-zero groups, "" for an empty group).
    return [group.lstrip('0') or group[-1:] for group in grouped_ids]
|
149 |
+
|
150 |
+
# Runs only on the rerun triggered by pressing the "Detokenize" button.
if st.button("Detokenize"):
    split_ids = split_token_ids(token_ids)
    cleaned_ids = remove_leading_zeros(split_ids)
    cleaned_token_ids_str = ' '.join(cleaned_ids)
    # Parse the zero-padded groups directly: int() already ignores leading
    # zeros, whereas an all-zero group such as "00000" can strip down to ""
    # and then fail the digit check, silently dropping token ID 0.
    # isdecimal() (not isdigit()) so that characters like superscript digits,
    # which int() rejects, are skipped instead of raising ValueError.
    token_id_list = [int(group) for group in split_ids if group.isdecimal()]
    detokenized_sentence = tokenizer.decode(token_id_list)
    st.write("Grouped and cleaned token IDs:")
    st.write(cleaned_token_ids_str)
    st.write("Detokenized sentence:")
    st.write(detokenized_sentence)
|
160 |
+
|
161 |
+
# Load the model
|
162 |
+
@st.cache_resource
def _load_gpt2_model():
    """Return the GPT-2 causal language model, instantiating it only once.

    Streamlit re-executes this script on every widget interaction; caching
    the model as a resource avoids reloading it on each rerun.
    """
    return AutoModelForCausalLM.from_pretrained('gpt2')


gpt2 = _load_gpt2_model()

# Display help for the GPT-2 model
if st.checkbox("Show GPT-2 Model Help"):
    st.write("Help GPT2")
    st.help(gpt2)
|
168 |
+
|
169 |
+
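# JavaScript bridge intended to copy a clicked grid coordinate into the token-ID input.
# NOTE(review): the grid script posts to window.parent, but this listener is attached to
# this component's own window and queries its own document - confirm the message can
# actually arrive here. The 'button[kind="secondary"]' selector is also not specific to
# the "Detokenize" button; verify which button it matches.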
|
170 |
+
components.html(
|
171 |
+
"""
|
172 |
+
<script>
|
173 |
+
window.addEventListener('message', function(event) {
|
174 |
+
if (event.data.type === 'clickedCoordinate') {
|
175 |
+
document.querySelector('input[aria-label="Enter token IDs (concatenated without spaces):"]').value = event.data.value;
|
176 |
+
document.querySelector('button[kind="secondary"]').click();
|
177 |
+
}
|
178 |
+
}, false);
|
179 |
+
</script>
|
180 |
+
"""
|
181 |
+
)
|