Yang Gu committed on
Commit
fa2d4e3
·
1 Parent(s): 37cf2c3

Support ort-phi3 and polish ort-phi2

Browse files
demo/ort-phi2/index.html CHANGED
@@ -7,8 +7,8 @@
7
 
8
  <body>
9
  <script src="../../util.js"></script>
10
- <script src="https://cdn.jsdelivr.net/npm/onnxruntime-web@dev/dist/ort.webgpu.min.js"></script>
11
- <!-- <script src="https://wp-27.sh.intel.com/workspace/project/onnxruntime/js/web/dist/ort.webgpu.min.js"> </script> -->
12
 
13
  <script type="module">
14
  import { AutoTokenizer, env } from '../../transformers/transformers.js';
@@ -88,7 +88,7 @@
88
  const model_config = JSON.parse(textDecoder.decode(json_bytes));
89
 
90
  const model_bytes = await fetchAndCache(modelPath + "/phi2-int4.onnx");
91
- const externaldata = (model.externaldata) ? await fetchAndCache(modelPath + '/phi2-int4.onnx.data') : false;
92
  let modelSize = model_bytes.byteLength;
93
  if (externaldata) {
94
  modelSize += externaldata.byteLength;
 
7
 
8
  <body>
9
  <script src="../../util.js"></script>
10
+ <!-- <script src="https://cdn.jsdelivr.net/npm/onnxruntime-web@dev/dist/ort.webgpu.min.js"></script> -->
11
+ <script src="https://wp-27.sh.intel.com/workspace/project/onnxruntime/js/web/dist/ort.webgpu.min.js"></script>
12
 
13
  <script type="module">
14
  import { AutoTokenizer, env } from '../../transformers/transformers.js';
 
88
  const model_config = JSON.parse(textDecoder.decode(json_bytes));
89
 
90
  const model_bytes = await fetchAndCache(modelPath + "/phi2-int4.onnx");
91
+ const externaldata = (model.externaldata) ? await fetchAndCache(modelPath + '/phi2-int4.data') : false;
92
  let modelSize = model_bytes.byteLength;
93
  if (externaldata) {
94
  modelSize += externaldata.byteLength;
demo/ort-phi2/models/phi2-int4/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
demo/ort-phi2/models/phi2-int4/tokenizer_config.json ADDED
@@ -0,0 +1,323 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "50256": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "50257": {
13
+ "content": " ",
14
+ "lstrip": false,
15
+ "normalized": true,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": false
19
+ },
20
+ "50258": {
21
+ "content": " ",
22
+ "lstrip": false,
23
+ "normalized": true,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": false
27
+ },
28
+ "50259": {
29
+ "content": " ",
30
+ "lstrip": false,
31
+ "normalized": true,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": false
35
+ },
36
+ "50260": {
37
+ "content": " ",
38
+ "lstrip": false,
39
+ "normalized": true,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": false
43
+ },
44
+ "50261": {
45
+ "content": " ",
46
+ "lstrip": false,
47
+ "normalized": true,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": false
51
+ },
52
+ "50262": {
53
+ "content": " ",
54
+ "lstrip": false,
55
+ "normalized": true,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": false
59
+ },
60
+ "50263": {
61
+ "content": " ",
62
+ "lstrip": false,
63
+ "normalized": true,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": false
67
+ },
68
+ "50264": {
69
+ "content": " ",
70
+ "lstrip": false,
71
+ "normalized": true,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": false
75
+ },
76
+ "50265": {
77
+ "content": " ",
78
+ "lstrip": false,
79
+ "normalized": true,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": false
83
+ },
84
+ "50266": {
85
+ "content": " ",
86
+ "lstrip": false,
87
+ "normalized": true,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": false
91
+ },
92
+ "50267": {
93
+ "content": " ",
94
+ "lstrip": false,
95
+ "normalized": true,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": false
99
+ },
100
+ "50268": {
101
+ "content": " ",
102
+ "lstrip": false,
103
+ "normalized": true,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": false
107
+ },
108
+ "50269": {
109
+ "content": " ",
110
+ "lstrip": false,
111
+ "normalized": true,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": false
115
+ },
116
+ "50270": {
117
+ "content": " ",
118
+ "lstrip": false,
119
+ "normalized": true,
120
+ "rstrip": false,
121
+ "single_word": false,
122
+ "special": false
123
+ },
124
+ "50271": {
125
+ "content": " ",
126
+ "lstrip": false,
127
+ "normalized": true,
128
+ "rstrip": false,
129
+ "single_word": false,
130
+ "special": false
131
+ },
132
+ "50272": {
133
+ "content": " ",
134
+ "lstrip": false,
135
+ "normalized": true,
136
+ "rstrip": false,
137
+ "single_word": false,
138
+ "special": false
139
+ },
140
+ "50273": {
141
+ "content": " ",
142
+ "lstrip": false,
143
+ "normalized": true,
144
+ "rstrip": false,
145
+ "single_word": false,
146
+ "special": false
147
+ },
148
+ "50274": {
149
+ "content": " ",
150
+ "lstrip": false,
151
+ "normalized": true,
152
+ "rstrip": false,
153
+ "single_word": false,
154
+ "special": false
155
+ },
156
+ "50275": {
157
+ "content": " ",
158
+ "lstrip": false,
159
+ "normalized": true,
160
+ "rstrip": false,
161
+ "single_word": false,
162
+ "special": false
163
+ },
164
+ "50276": {
165
+ "content": " ",
166
+ "lstrip": false,
167
+ "normalized": true,
168
+ "rstrip": false,
169
+ "single_word": false,
170
+ "special": false
171
+ },
172
+ "50277": {
173
+ "content": " ",
174
+ "lstrip": false,
175
+ "normalized": true,
176
+ "rstrip": false,
177
+ "single_word": false,
178
+ "special": false
179
+ },
180
+ "50278": {
181
+ "content": " ",
182
+ "lstrip": false,
183
+ "normalized": true,
184
+ "rstrip": false,
185
+ "single_word": false,
186
+ "special": false
187
+ },
188
+ "50279": {
189
+ "content": " ",
190
+ "lstrip": false,
191
+ "normalized": true,
192
+ "rstrip": false,
193
+ "single_word": false,
194
+ "special": false
195
+ },
196
+ "50280": {
197
+ "content": " ",
198
+ "lstrip": false,
199
+ "normalized": true,
200
+ "rstrip": false,
201
+ "single_word": false,
202
+ "special": false
203
+ },
204
+ "50281": {
205
+ "content": " ",
206
+ "lstrip": false,
207
+ "normalized": true,
208
+ "rstrip": false,
209
+ "single_word": false,
210
+ "special": false
211
+ },
212
+ "50282": {
213
+ "content": " ",
214
+ "lstrip": false,
215
+ "normalized": true,
216
+ "rstrip": false,
217
+ "single_word": false,
218
+ "special": false
219
+ },
220
+ "50283": {
221
+ "content": " ",
222
+ "lstrip": false,
223
+ "normalized": true,
224
+ "rstrip": false,
225
+ "single_word": false,
226
+ "special": false
227
+ },
228
+ "50284": {
229
+ "content": " ",
230
+ "lstrip": false,
231
+ "normalized": true,
232
+ "rstrip": false,
233
+ "single_word": false,
234
+ "special": false
235
+ },
236
+ "50285": {
237
+ "content": " ",
238
+ "lstrip": false,
239
+ "normalized": true,
240
+ "rstrip": false,
241
+ "single_word": false,
242
+ "special": false
243
+ },
244
+ "50286": {
245
+ "content": " ",
246
+ "lstrip": false,
247
+ "normalized": true,
248
+ "rstrip": false,
249
+ "single_word": false,
250
+ "special": false
251
+ },
252
+ "50287": {
253
+ "content": "\t\t\t\t\t\t\t\t\t",
254
+ "lstrip": false,
255
+ "normalized": true,
256
+ "rstrip": false,
257
+ "single_word": false,
258
+ "special": false
259
+ },
260
+ "50288": {
261
+ "content": "\t\t\t\t\t\t\t\t",
262
+ "lstrip": false,
263
+ "normalized": true,
264
+ "rstrip": false,
265
+ "single_word": false,
266
+ "special": false
267
+ },
268
+ "50289": {
269
+ "content": "\t\t\t\t\t\t\t",
270
+ "lstrip": false,
271
+ "normalized": true,
272
+ "rstrip": false,
273
+ "single_word": false,
274
+ "special": false
275
+ },
276
+ "50290": {
277
+ "content": "\t\t\t\t\t\t",
278
+ "lstrip": false,
279
+ "normalized": true,
280
+ "rstrip": false,
281
+ "single_word": false,
282
+ "special": false
283
+ },
284
+ "50291": {
285
+ "content": "\t\t\t\t\t",
286
+ "lstrip": false,
287
+ "normalized": true,
288
+ "rstrip": false,
289
+ "single_word": false,
290
+ "special": false
291
+ },
292
+ "50292": {
293
+ "content": "\t\t\t\t",
294
+ "lstrip": false,
295
+ "normalized": true,
296
+ "rstrip": false,
297
+ "single_word": false,
298
+ "special": false
299
+ },
300
+ "50293": {
301
+ "content": "\t\t\t",
302
+ "lstrip": false,
303
+ "normalized": true,
304
+ "rstrip": false,
305
+ "single_word": false,
306
+ "special": false
307
+ },
308
+ "50294": {
309
+ "content": "\t\t",
310
+ "lstrip": false,
311
+ "normalized": true,
312
+ "rstrip": false,
313
+ "single_word": false,
314
+ "special": false
315
+ }
316
+ },
317
+ "bos_token": "<|endoftext|>",
318
+ "clean_up_tokenization_spaces": true,
319
+ "eos_token": "<|endoftext|>",
320
+ "model_max_length": 2048,
321
+ "tokenizer_class": "CodeGenTokenizer",
322
+ "unk_token": "<|endoftext|>"
323
+ }
demo/ort-phi3/index.html ADDED
@@ -0,0 +1,375 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html>
3
+
4
+ <head>
5
+ <title>Example</title>
6
+ </head>
7
+
8
+ <body>
9
+ <script src="../../util.js"></script>
10
+ <!-- <script src="https://cdn.jsdelivr.net/npm/onnxruntime-web@dev/dist/ort.webgpu.min.js"></script> -->
11
+ <script src="https://wp-27.sh.intel.com/workspace/project/onnxruntime/js/web/dist/ort.webgpu.min.js"></script>
12
+
13
+ <script type="module">
14
+ import { AutoTokenizer, env } from '../../transformers/transformers.js';
15
+
16
+ function log(i) { console.log(i); document.getElementById('status').innerText += `\n${i}`; }
17
+
18
+ const MODELS = {
19
+ "phi3": { name: "phi3", path: "phi3-int4", externaldata: true },
20
+ }
21
+
22
+ function getConfig() {
23
+ const query = window.location.search.substring(1);
24
+ var config = {
25
+ model: "phi3",
26
+ provider: "webgpu",
27
+ profiler: 0,
28
+ verbose: 0,
29
+ threads: 1,
30
+ trace: 0,
31
+ csv: 0,
32
+ max_tokens: 256,
33
+ local: 1,
34
+ }
35
+ let vars = query.split("&");
36
+ for (var i = 0; i < vars.length; i++) {
37
+ let pair = vars[i].split("=");
38
+ if (pair[0] in config) {
39
+ const key = pair[0];
40
+ const value = decodeURIComponent(pair[1]);
41
+ if (typeof config[key] == "number") {
42
+ config[key] = parseInt(value);
43
+ }
44
+ else {
45
+ config[key] = value;
46
+ }
47
+ } else if (pair[0].length > 0) {
48
+ throw new Error("unknown argument: " + pair[0]);
49
+ }
50
+ }
51
+ if (MODELS[config.model] !== undefined) {
52
+ config.model = MODELS[config.model];
53
+ }
54
+ return config;
55
+ }
56
+
57
+ class LLM {
58
+ sess = undefined;
59
+ profiler = false;
60
+ trace = false;
61
+ feed = {};
62
+ output_tokens = [];
63
+ eos = 2;
64
+ need_position_ids = true;
65
+ stop = false;
66
+ kv_dims = [];
67
+ dtype = "float16";
68
+
69
+ constructor() {
70
+ }
71
+
72
+ async load(model, options) {
73
+ const provider = options.provider || "webgpu";
74
+ const verbose = options.verbose;
75
+ const local = options.local;
76
+ this.profiler = options.profiler;
77
+ this.trace = options.trace;
78
+
79
+ const modelPath = getModelsPath() + model.path;
80
+
81
+ log(`loading... ${model.name}, ${provider}`);
82
+ const json_bytes = await fetchAndCache(modelPath + "/config.json");
83
+ let textDecoder = new TextDecoder();
84
+ const model_config = JSON.parse(textDecoder.decode(json_bytes));
85
+
86
+ const model_bytes = await fetchAndCache(modelPath + "/phi3-int4.onnx");
87
+ const externaldata = (model.externaldata) ? await fetchAndCache(modelPath + '/phi3-int4.data') : false;
88
+ let modelSize = model_bytes.byteLength;
89
+ if (externaldata) {
90
+ modelSize += externaldata.byteLength;
91
+ }
92
+
93
+ log(`model size ${Math.round(modelSize / 1024 / 1024)} MB`);
94
+
95
+ const opt = {
96
+ executionProviders: [provider],
97
+ preferredOutputLocation: {},
98
+ };
99
+
100
+ switch (provider) {
101
+ case "webgpu":
102
+ if (!("gpu" in navigator)) {
103
+ throw new Error("webgpu is NOT supported");
104
+ }
105
+ for (let i = 0; i < model_config.num_hidden_layers; ++i) {
106
+ opt.preferredOutputLocation[`present.${i}.key`] = 'gpu-buffer';
107
+ opt.preferredOutputLocation[`present.${i}.value`] = 'gpu-buffer';
108
+ }
109
+ break;
110
+ case "webnn":
111
+ if (!("ml" in navigator)) {
112
+ throw new Error("webnn is NOT supported");
113
+ }
114
+ break;
115
+ }
116
+
117
+ if (externaldata !== undefined) {
118
+ opt.externalData = [
119
+ {
120
+ data: externaldata,
121
+ path: 'decoder_model_merged.onnx.data'
122
+ },
123
+ ]
124
+ }
125
+ if (verbose) {
126
+ opt.logSeverityLevel = 0;
127
+ opt.logVerbosityLevel = 0;
128
+ ort.env.logLevel = "verbose";
129
+ ort.env.debug = true;
130
+ }
131
+
132
+ ort.env.webgpu.profiling = {};
133
+ if (this.profiler) {
134
+ opt.enableProfiling = true;
135
+ ort.env.webgpu.profilingMode = 'default';
136
+ ort.env.webgpu.profiling.mode = 'default';
137
+ }
138
+
139
+ this.sess = await ort.InferenceSession.create(model_bytes, opt);
140
+
141
+ if (this.trace) {
142
+ ort.env.trace = true;
143
+ ort.env.webgpu.profiling.ondata = (version, inputsMetadata, outputsMetadata, kernelId, kernelType,
144
+ kernelName, programName, startTime, endTime) => { };
145
+ }
146
+
147
+ this.eos = model_config.eos_token_id;
148
+ this.kv_dims = [1, model_config.num_key_value_heads, 0, model_config.hidden_size / model_config.num_attention_heads];
149
+ this.dtype = config.model.dtype || "float16";
150
+ this.num_layers = model_config.num_hidden_layers;
151
+ this.initilize_feed();
152
+ }
153
+
154
+ initilize_feed() {
155
+ this.feed = {};
156
+ const empty = (this.dtype === "float16") ? new Uint16Array() : [];
157
+ for (let i = 0; i < this.num_layers; ++i) {
158
+ this.feed[`past_key_values.${i}.key`] = new ort.Tensor(this.dtype, empty, this.kv_dims)
159
+ this.feed[`past_key_values.${i}.value`] = new ort.Tensor(this.dtype, empty, this.kv_dims)
160
+ }
161
+ this.output_tokens = [];
162
+ }
163
+
164
+
165
+ argmax(t) {
166
+ const arr = t.data;
167
+ const start = t.dims[2] * (t.dims[1] - 1);
168
+ let max = arr[start];
169
+ let maxidx = 0;
170
+
171
+ for (let i = 0; i < t.dims[2]; i++) {
172
+ const val = arr[i + start];
173
+ if (!isFinite(val)) {
174
+ throw new Error("found infinitive in logits");
175
+ }
176
+ if (val > max) {
177
+ max = arr[i + start];
178
+ maxidx = i;
179
+ }
180
+ }
181
+ return maxidx;
182
+ }
183
+
184
+ update_kv_cache(feed, outputs) {
185
+ for (const name in outputs) {
186
+ if (name.startsWith('present')) {
187
+ let newName = name.replace('present', 'past_key_values');
188
+ // free old gpu buffer
189
+ const t = feed[newName];
190
+ if (t.location === 'gpu-buffer') {
191
+ t.dispose();
192
+ }
193
+ feed[newName] = outputs[name];
194
+ }
195
+ }
196
+ }
197
+
198
+ abort() {
199
+ this.stop = true;
200
+ }
201
+
202
+ async generate(tokens, callback, options) {
203
+ const keep_cache = options.keep_cache;
204
+ const max_tokens = options.max_tokens || 256;
205
+ const feed = this.feed;
206
+ const input_ids = new ort.Tensor('int64', BigInt64Array.from(tokens.map(BigInt)), [1, tokens.length]);
207
+ feed['input_ids'] = input_ids;
208
+ this.stop = false;
209
+
210
+ if (keep_cache) {
211
+ this.output_tokens.push(...input_ids)
212
+ } else {
213
+ this.initilize_feed();
214
+ this.output_tokens = Array.from(feed['input_ids'].data);
215
+ }
216
+
217
+ let last_token = 0n;
218
+ let seqlen = this.output_tokens.length;
219
+ if (this.need_position_ids) {
220
+ if (keep_cache) {
221
+ feed['position_ids'] = new ort.Tensor('int64', BigInt64Array.from({ length: seqlen }, (_, i) => BigInt(i)), [1, input_ids.length]);
222
+ } else {
223
+ feed['position_ids'] = new ort.Tensor('int64', BigInt64Array.from({ length: seqlen }, (_, i) => BigInt(i)), [1, seqlen]);
224
+ }
225
+ }
226
+
227
+ while (last_token != this.eos && seqlen < max_tokens && !this.stop) {
228
+ seqlen = this.output_tokens.length;
229
+ feed['attention_mask'] = new ort.Tensor('int64', BigInt64Array.from({ length: seqlen }, () => 1n), [1, seqlen]);
230
+ let outputs;
231
+ if (this.trace) {
232
+ console.timeStamp("RUN-BEGIN");
233
+ outputs = await this.sess.run(feed);
234
+ console.timeStamp("RUN-END");
235
+ } else {
236
+ outputs = await this.sess.run(feed);
237
+ }
238
+ last_token = BigInt(this.argmax(outputs.logits));
239
+ this.output_tokens.push(last_token);
240
+ if (callback && !this.profiler) {
241
+ callback(this.output_tokens);
242
+ }
243
+ this.update_kv_cache(feed, outputs);
244
+ feed['input_ids'] = new ort.Tensor('int64', BigInt64Array.from([last_token]), [1, 1]);
245
+ if (this.need_position_ids) {
246
+ feed['position_ids'] = new ort.Tensor('int64', BigInt64Array.from([BigInt(seqlen)]), [1, 1]);
247
+ }
248
+ }
249
+ if (this.profiler) {
250
+ this.sess.endProfiling();
251
+ }
252
+ return this.output_tokens;
253
+ }
254
+ }
255
+
256
+ const config = getConfig();
257
+ env.localModelPath = 'models';
258
+ env.allowRemoteModels = config.local == 0;
259
+ env.allowLocalModels = config.local == 1;
260
+ ort.env.wasm.numThreads = config.threads;
261
+ ort.env.wasm.simd = true;
262
+
263
+ const cons_log = [];
264
+
265
+ if (config.profiler === 2) {
266
+ console.log = function (message) {
267
+ if (!message.includes('_fence_')) {
268
+ cons_log.push(message);
269
+ }
270
+ };
271
+ }
272
+
273
+ const tokenizer = await AutoTokenizer.from_pretrained(config.model.path);
274
+
275
+ function create_download_link(cons_log) {
276
+ if (cons_log.length > 0) {
277
+ let link = document.getElementById('download').childNodes[0];
278
+ if (link === undefined) {
279
+ link = document.createElement("a", "download-link");
280
+ link.download = "profiler.log";
281
+ link.innerText = "Download";
282
+ document.getElementById('download').appendChild(link);
283
+ }
284
+ const base64 = btoa(cons_log.join('\n'));
285
+ link.href = `data:application/json;base64,${base64}`;
286
+ }
287
+ }
288
+
289
+ async function fetchAndCache(url) {
290
+ try {
291
+ const cache = await caches.open("onnx");
292
+ let cachedResponse = await cache.match(url);
293
+ if (cachedResponse == undefined) {
294
+ await cache.add(url);
295
+ cachedResponse = await cache.match(url);
296
+ log(`${url} (network)`);
297
+ } else {
298
+ log(`${url} (cached)`);
299
+ }
300
+ const data = await cachedResponse.arrayBuffer();
301
+ return data;
302
+ } catch (error) {
303
+ log(`${url} (network)`);
304
+ return await fetch(url).then(response => response.arrayBuffer());
305
+ }
306
+ }
307
+
308
+ function token_to_text(tokenizer, tokens, startidx) {
309
+ const txt = tokenizer.decode(tokens.slice(startidx), { skip_special_tokens: true, });
310
+ return txt;
311
+ }
312
+
313
+ const llm = new LLM();
314
+
315
+ async function main() {
316
+
317
+ const model = config.model;
318
+
319
+ await llm.load(model, {
320
+ provider: config.provider,
321
+ verbose: config.verbose,
322
+ profiler: config.profiler,
323
+ trace: config.trace,
324
+ local: config.local,
325
+ });
326
+
327
+
328
+ document.getElementById('status').innerText = "";
329
+ const query = "Tell me about Constantinople.";
330
+ let prompt;
331
+
332
+ if (model.name.includes('phi3')) {
333
+ prompt = `User:${query}\nAssistant:`;
334
+ } else {
335
+ prompt = `"<|system|>\nYou are a friendly assistant.</s>\n<|user|>\n${query}</s>\n<|assistant|>\n`;
336
+ }
337
+ const { input_ids } = await tokenizer(prompt, { return_tensor: false, padding: true, truncation: true });
338
+
339
+ const start_timer = performance.now();
340
+ const output_tokens = await llm.generate(input_ids, (output_tokens) => {
341
+ document.getElementById('result').innerText = token_to_text(tokenizer, output_tokens, input_ids.length);
342
+ }, {});
343
+ const took = (performance.now() - start_timer) / 1000;
344
+ const txt = token_to_text(tokenizer, output_tokens, input_ids.length);
345
+ const seqlen = output_tokens.length;
346
+ document.getElementById('result').innerText = txt;
347
+ const perf = `${seqlen} tokens in ${took.toFixed(1)}sec, ${(seqlen / took).toFixed(2)} tokens/sec`;
348
+ console.log(perf + " @@1");
349
+ document.getElementById('perf').innerText = perf;
350
+ if (config.csv) {
351
+ log(`${model.name},${took.toFixed(2)},${(seqlen / took).toFixed(3)},${seqlen},@@2`);
352
+ }
353
+ }
354
+ try {
355
+ await main();
356
+ } catch (error) {
357
+ console.error(error);
358
+ document.getElementById('result').innerText = error.message;
359
+ } finally {
360
+ create_download_link(cons_log);
361
+ }
362
+ </script>
363
+
364
+ <div id="status"></div>
365
+ <br />
366
+ <div id="result"></div>
367
+ <br />
368
+ <div id="perf"></div>
369
+ <br />
370
+ <div id="download"></div>
371
+ <br />
372
+
373
+ </body>
374
+
375
+ </html>
demo/ort-phi3/models/phi3-int4/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
demo/ort-phi3/models/phi3-int4/tokenizer_config.json ADDED
@@ -0,0 +1,349 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<unk>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<s>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "</s>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": true,
26
+ "single_word": false,
27
+ "special": false
28
+ },
29
+ "32000": {
30
+ "content": "<|endoftext|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "32001": {
38
+ "content": "<|assistant|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": true,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "32002": {
46
+ "content": "<|step|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": true,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "32003": {
54
+ "content": "<|function_output|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": true,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "32004": {
62
+ "content": "<|tag|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": true,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "32005": {
70
+ "content": "<|function_call|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": true,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "32006": {
78
+ "content": "<|system|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": true,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "32007": {
86
+ "content": "<|end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": true,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "32008": {
94
+ "content": "<|raw|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": true,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "32009": {
102
+ "content": "<|continue|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": true,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "32010": {
110
+ "content": "<|user|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": true,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "32011": {
118
+ "content": "<|function_list|>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": true,
122
+ "single_word": false,
123
+ "special": true
124
+ },
125
+ "32012": {
126
+ "content": "<|calc|>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": true,
130
+ "single_word": false,
131
+ "special": true
132
+ },
133
+ "32013": {
134
+ "content": "<|code|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": true,
138
+ "single_word": false,
139
+ "special": true
140
+ },
141
+ "32014": {
142
+ "content": "<|/code|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": true,
146
+ "single_word": false,
147
+ "special": true
148
+ },
149
+ "32015": {
150
+ "content": "<|summary|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": true,
154
+ "single_word": false,
155
+ "special": true
156
+ },
157
+ "32016": {
158
+ "content": "<|resource|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": true,
162
+ "single_word": false,
163
+ "special": true
164
+ },
165
+ "32017": {
166
+ "content": "<|assistant_mask|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": true,
170
+ "single_word": false,
171
+ "special": true
172
+ },
173
+ "32018": {
174
+ "content": "<|start|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": true,
178
+ "single_word": false,
179
+ "special": true
180
+ },
181
+ "32019": {
182
+ "content": "<|message|>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": true,
186
+ "single_word": false,
187
+ "special": true
188
+ },
189
+ "32020": {
190
+ "content": "<|fim_prefix|>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": true,
194
+ "single_word": false,
195
+ "special": true
196
+ },
197
+ "32021": {
198
+ "content": "<|fim_middle|>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": true,
202
+ "single_word": false,
203
+ "special": true
204
+ },
205
+ "32022": {
206
+ "content": "<|fim_suffix|>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": true,
210
+ "single_word": false,
211
+ "special": true
212
+ },
213
+ "32023": {
214
+ "content": "<|meta_start|>",
215
+ "lstrip": false,
216
+ "normalized": false,
217
+ "rstrip": true,
218
+ "single_word": false,
219
+ "special": true
220
+ },
221
+ "32024": {
222
+ "content": "<|ipynb_marker|>",
223
+ "lstrip": false,
224
+ "normalized": false,
225
+ "rstrip": true,
226
+ "single_word": false,
227
+ "special": true
228
+ },
229
+ "32025": {
230
+ "content": "<|diff_marker|>",
231
+ "lstrip": false,
232
+ "normalized": false,
233
+ "rstrip": true,
234
+ "single_word": false,
235
+ "special": true
236
+ },
237
+ "32026": {
238
+ "content": "<|ghissue|>",
239
+ "lstrip": false,
240
+ "normalized": false,
241
+ "rstrip": true,
242
+ "single_word": false,
243
+ "special": true
244
+ },
245
+ "32027": {
246
+ "content": "<|ghreview|>",
247
+ "lstrip": false,
248
+ "normalized": false,
249
+ "rstrip": true,
250
+ "single_word": false,
251
+ "special": true
252
+ },
253
+ "32028": {
254
+ "content": "<|disc_start|>",
255
+ "lstrip": false,
256
+ "normalized": false,
257
+ "rstrip": true,
258
+ "single_word": false,
259
+ "special": true
260
+ },
261
+ "32029": {
262
+ "content": "<|disc_sep|>",
263
+ "lstrip": false,
264
+ "normalized": false,
265
+ "rstrip": true,
266
+ "single_word": false,
267
+ "special": true
268
+ },
269
+ "32030": {
270
+ "content": "<|disc_thread|><|query|>",
271
+ "lstrip": false,
272
+ "normalized": false,
273
+ "rstrip": true,
274
+ "single_word": false,
275
+ "special": true
276
+ },
277
+ "32031": {
278
+ "content": "<|/query|>",
279
+ "lstrip": false,
280
+ "normalized": false,
281
+ "rstrip": true,
282
+ "single_word": false,
283
+ "special": true
284
+ },
285
+ "32032": {
286
+ "content": "<|data|>",
287
+ "lstrip": false,
288
+ "normalized": false,
289
+ "rstrip": true,
290
+ "single_word": false,
291
+ "special": true
292
+ },
293
+ "32033": {
294
+ "content": "<|/data|>",
295
+ "lstrip": false,
296
+ "normalized": false,
297
+ "rstrip": true,
298
+ "single_word": false,
299
+ "special": true
300
+ },
301
+ "32034": {
302
+ "content": "<|sys|>",
303
+ "lstrip": false,
304
+ "normalized": false,
305
+ "rstrip": true,
306
+ "single_word": false,
307
+ "special": true
308
+ },
309
+ "32035": {
310
+ "content": "<|/sys|>",
311
+ "lstrip": false,
312
+ "normalized": false,
313
+ "rstrip": true,
314
+ "single_word": false,
315
+ "special": true
316
+ },
317
+ "32036": {
318
+ "content": "<|inst|>",
319
+ "lstrip": false,
320
+ "normalized": false,
321
+ "rstrip": true,
322
+ "single_word": false,
323
+ "special": true
324
+ },
325
+ "32037": {
326
+ "content": "<|/inst|>",
327
+ "lstrip": false,
328
+ "normalized": false,
329
+ "rstrip": true,
330
+ "single_word": false,
331
+ "special": true
332
+ }
333
+ },
334
+ "additional_special_tokens": [
335
+ "<|/inst|>"
336
+ ],
337
+ "bos_token": "<s>",
338
+ "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
339
+ "clean_up_tokenization_spaces": false,
340
+ "eos_token": "<|endoftext|>",
341
+ "legacy": false,
342
+ "model_max_length": 4096,
343
+ "pad_token": "<|endoftext|>",
344
+ "padding_side": "left",
345
+ "sp_model_kwargs": {},
346
+ "tokenizer_class": "LlamaTokenizer",
347
+ "unk_token": "<unk>",
348
+ "use_default_system_prompt": false
349
+ }
main.js CHANGED
@@ -49,11 +49,9 @@ function createElem(tag, attrs = {}, children = []) {
49
  return elem;
50
  }
51
 
52
- // todo: Musicgen
53
-
54
- const pageCategories = [
55
  {
56
- title: `ONNX Runtime`,
57
  description: `ONNX Runtime`,
58
  demos: {
59
  'ort-phi2': {
@@ -61,6 +59,11 @@ const pageCategories = [
61
  description: `phi2 from Microsoft`,
62
  filename: "ort-phi2",
63
  },
 
 
 
 
 
64
  'ort-sam': {
65
  name: 'Segment Anything',
66
  description: `Segment Anything from https://github.com/guschmue/ort-webgpu/tree/master/segment-anything`,
@@ -84,7 +87,7 @@ const pageCategories = [
84
  },
85
  },
86
  {
87
- title: `TFLite`,
88
  description: `TFLite`,
89
  demos: {
90
  'tflite-gemma': {
@@ -95,7 +98,7 @@ const pageCategories = [
95
  },
96
  },
97
  {
98
- title: 'Transformers.js',
99
  description: 'Transformers.js',
100
  demos: {
101
  benchmark: {
@@ -131,7 +134,7 @@ const pageCategories = [
131
  },
132
  },
133
  {
134
- title: 'TVM',
135
  description: 'TVM',
136
  demos: {
137
  sd: {
@@ -32475,7 +32478,7 @@ function setSampleIFrameURL(e, demoInfo) {
32475
  // from those keys to each demo.
32476
  const samplesByKey = new Map();
32477
  // Generate the list of demos
32478
- for (const { title, description, demos } of pageCategories) {
32479
  for (const [key, demoInfo] of Object.entries(demos)) {
32480
  samplesByKey.set(key, demoInfo);
32481
  }
@@ -32484,7 +32487,7 @@ for (const { title, description, demos } of pageCategories) {
32484
  createElem('div', { className: 'sampleCategory' }, [
32485
  createElem('h3', {
32486
  style: { 'margin-top': '5px' },
32487
- textContent: title,
32488
  dataset: { tooltip: description },
32489
  }),
32490
  ]),
 
49
  return elem;
50
  }
51
 
52
+ const demoCategories = [
 
 
53
  {
54
+ name: `ONNX Runtime`,
55
  description: `ONNX Runtime`,
56
  demos: {
57
  'ort-phi2': {
 
59
  description: `phi2 from Microsoft`,
60
  filename: "ort-phi2",
61
  },
62
+ 'ort-phi3': {
63
+ name: 'phi3',
64
+ description: `phi3 from Microsoft`,
65
+ filename: "ort-phi3",
66
+ },
67
  'ort-sam': {
68
  name: 'Segment Anything',
69
  description: `Segment Anything from https://github.com/guschmue/ort-webgpu/tree/master/segment-anything`,
 
87
  },
88
  },
89
  {
90
+ name: `TFLite`,
91
  description: `TFLite`,
92
  demos: {
93
  'tflite-gemma': {
 
98
  },
99
  },
100
  {
101
+ name: 'Transformers.js',
102
  description: 'Transformers.js',
103
  demos: {
104
  benchmark: {
 
134
  },
135
  },
136
  {
137
+ name: 'TVM',
138
  description: 'TVM',
139
  demos: {
140
  sd: {
 
32478
  // from those keys to each demo.
32479
  const samplesByKey = new Map();
32480
  // Generate the list of demos
32481
+ for (const { name, description, demos } of demoCategories) {
32482
  for (const [key, demoInfo] of Object.entries(demos)) {
32483
  samplesByKey.set(key, demoInfo);
32484
  }
 
32487
  createElem('div', { className: 'sampleCategory' }, [
32488
  createElem('h3', {
32489
  style: { 'margin-top': '5px' },
32490
+ textContent: name,
32491
  dataset: { tooltip: description },
32492
  }),
32493
  ]),