Spaces:

webai-community
/

demos

Running

File size: 16,227 Bytes

eccf3fa

import { env, AutoTokenizer } from '../../transformers/transformers.js';
import * as ort from './dist/esm/ort.webgpu.min.js'
//await loadOrt();

// Inline SVG markup for a clipboard glyph (16x16, drawn in `currentColor`).
// copyTextToClipboard() assigns it as the innerHTML of the copy button it creates.
const clipboardIcon = `<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-clipboard" viewBox="0 0 16 16">
<path d="M4 1.5H3a2 2 0 0 0-2 2V14a2 2 0 0 0 2 2h10a2 2 0 0 0 2-2V3.5a2 2 0 0 0-2-2h-1v1h1a1 1 0 0 1 1 1V14a1 1 0 0 1-1 1H3a1 1 0 0 1-1-1V3.5a1 1 0 0 1 1-1h1v-1z"/>
<path d="M9.5 1a.5.5 0 0 1 .5.5v1a.5.5 0 0 1-.5.5h-3a.5.5 0 0 1-.5-.5v-1a.5.5 0 0 1 .5-.5h3zm-3-1A1.5 1.5 0 0 0 5 1.5v1A1.5 1.5 0 0 0 6.5 4h3A1.5 1.5 0 0 0 11 2.5v-1A1.5 1.5 0 0 0 9.5 0h-3z"/>
</svg>`

// Configure the global `marked` markdown renderer once, before any response is rendered:
// its `mangle` and `headerIds` options are both switched off.
// NOTE(review): presumably this silences marked's deprecation warnings for these two
// options — confirm against the marked version loaded by the page.
marked.use({
  mangle: false,
  headerIds: false
});

/**
 * Reports a message on the console and appends it, on a new line, to the
 * text of the `#status` element.
 * @param {*} i - Message to report; it is string-interpolated for the status text.
 */
function log(i) {
  console.log(i);
  const statusElement = document.getElementById('status');
  statusElement.innerText += `\n${i}`;
}

// The send button. submitRequest() sets its label to "Stop" while a response is being
// generated and back to "Send" afterwards, and reads the label to decide whether a
// click should abort the running generation.
const sendButton = document.getElementById('send-button');

/**
 * Sets the bottom padding of `#scroll-wrapper` to the current height of
 * `#input-area` plus a 15px gap.
 */
function adjustPadding() {
  const { offsetHeight } = document.getElementById('input-area');
  const wrapper = document.getElementById('scroll-wrapper');
  wrapper.style.paddingBottom = `${offsetHeight + 15}px`;
}

// Re-run adjustPadding() every time the input area reports a size change.
const autoResizePadding = new ResizeObserver(() => adjustPadding());
autoResizePadding.observe(document.getElementById('input-area'));

// Auto-scroll state shared by all requests: one ResizeObserver and one
// isAutoScrollOn flag are created here, instead of a fresh pair on every
// submitRequest() call.
const scrollWrapper = document.getElementById('scroll-wrapper');
let isAutoScrollOn = true;
// when an observed element changes size (e.g. a new line is added), bring the
// end of the scroll wrapper back into view
const autoScroller = new ResizeObserver(() => {
  if (!isAutoScrollOn) {
    return;
  }
  scrollWrapper.scrollIntoView({ behavior: "smooth", block: "end" });
});

// Scroll tracking: scrolling up pauses auto-scroll; scrolling down to within
// 30px of the bottom of the page turns it back on.
let lastKnownScrollPosition = 0;
let ticking = false;
document.addEventListener("scroll", () => {
  // flips isAutoScrollOn on the next animation frame; `ticking` blocks further
  // changes until that frame has run
  const scheduleAutoScroll = (enabled) => {
    ticking = true;
    window.requestAnimationFrame(() => {
      isAutoScrollOn = enabled;
      ticking = false;
    });
  };

  if (!ticking) {
    const currentY = window.scrollY;
    if (isAutoScrollOn) {
      // user scrolled up while auto-scroll was active
      if (currentY < lastKnownScrollPosition) {
        scheduleAutoScroll(false);
      }
    } else if (
      currentY > lastKnownScrollPosition && // scroll direction must be down
      // 30px of slack: most of the way down is enough
      currentY >= document.documentElement.scrollHeight - window.innerHeight - 30
    ) {
      scheduleAutoScroll(true);
    }
  }
  lastKnownScrollPosition = window.scrollY;
});


/**
 * Makes the text of a message copyable.
 *
 * Without `with_button` the message element itself gets the click handler.
 * With `with_button` a button showing `clipboardIcon` is created, given the
 * click handler and appended to the message element.
 *
 * The copied text is `responseDiv.hidden_text` when that is truthy, otherwise
 * `responseDiv.innerText` (both read at click time).
 *
 * @param {HTMLElement} responseDiv - Message element whose text is copied.
 * @param {boolean} [with_button] - Attach a dedicated copy button instead of
 *   making the whole element clickable.
 */
function copyTextToClipboard(responseDiv, with_button) {
  const copyToClipboard = () => {
    const text = responseDiv.hidden_text || responseDiv.innerText;
    navigator.clipboard.writeText(text)
      .then(() => {
        console.log('Text copied to clipboard');
      })
      .catch((err) => {
        console.error('Failed to copy text:', err);
      });
  };

  if (!with_button) {
    responseDiv.onclick = copyToClipboard;
    return;
  }

  const copyButton = document.createElement('button');
  copyButton.className = 'btn btn-secondary copy-button';
  copyButton.innerHTML = clipboardIcon;
  copyButton.onclick = copyToClipboard;
  responseDiv.appendChild(copyButton);
}

/**
 * Handles a send/stop request from the UI.
 *
 * - If the send button currently reads "Stop", the running generation is
 *   aborted and nothing else happens.
 * - If the input box is empty, the chat history is cleared.
 * - Otherwise the input is added to the chat history as a user message, a
 *   response container with a spinner is added, and Query() is started; each
 *   partial result is rendered as sanitized markdown into the response
 *   container. The input box is cleared right after the query is started.
 */
async function submitRequest() {
  if (sendButton.innerHTML == "Stop") {
    llm.abort();
    return;
  }

  document.getElementById('chat-container').style.display = 'block';

  const userInput = document.getElementById('user-input');
  const chatHistory = document.getElementById('chat-history');
  const input = userInput.value;

  // empty input: reset the stored context and drop every message
  if (input.length == 0) {
    chatHistory.context = "";
    for (let child = chatHistory.firstChild; child; child = chatHistory.firstChild) {
      child.remove();
    }
    return;
  }

  // user message; clicking it copies its text
  const userMessageDiv = document.createElement('div');
  userMessageDiv.className = 'mb-2 user-message';
  userMessageDiv.innerText = input;
  chatHistory.appendChild(userMessageDiv);
  copyTextToClipboard(userMessageDiv);

  // response container, initially showing only a spinner
  const spinner = document.createElement('div');
  spinner.className = 'spinner-border text-light';
  spinner.setAttribute('role', 'status');

  const responseDiv = document.createElement('div');
  responseDiv.className = 'response-message mb-2 text-start';
  // keeps the container from collapsing if the request is cancelled before any text arrives
  responseDiv.style.minHeight = '3em';
  responseDiv.appendChild(spinner);
  chatHistory.appendChild(responseDiv);

  // the send button doubles as the stop button while generating
  sendButton.innerHTML = "Stop";

  // let the auto-scroller follow the new response container as it grows
  autoScroller.observe(responseDiv);

  // replaces the container content with the markdown rendering of the text so far
  const renderPartial = (word) => {
    responseDiv.innerHTML = DOMPurify.sanitize(marked.parse(word));
  };
  const finishRequest = () => {
    sendButton.innerHTML = "Send";
    spinner.remove();
  };

  Query(input, renderPartial)
    .then(() => {
      chatHistory.context = responseDiv.innerHTML;
      copyTextToClipboard(responseDiv, true);
      finishRequest();
    })
    .catch((error) => {
      if (error !== 'Stop button pressed') {
        console.error(error);
      }
      finishRequest();
    });

  // the query has been started; clear the input box for the next prompt
  userInput.value = '';
}

// Ready-made prompts for quick testing. The property names are matched against
// `KeyboardEvent.key` by the keydown handler on `#user-input`: with Ctrl held,
// pressing a key listed here fills the input box with the prompt and submits it.
const preCannedQueries = {
  "1": "Tell me about the lighthouse of Alexandria.",
  "2": "Did the lighthouse of Alexandria existed at the same time the library of Alexandria existed?",
  "3": "How did the Pharos lighthouse impact ancient maritime trade?",
  "4": "Tell me about Constantinople?",
};

// Keyboard shortcuts on the prompt box:
//   Ctrl+Enter or Cmd+Enter -> submit the current input
//   Ctrl+<key>              -> load the pre-canned query registered under <key> and submit it
document.getElementById('user-input').addEventListener('keydown', function (e) {
  if (e.key === 'Enter') {
    // metaKey is how the Cmd key is reported on macOS; checking only ctrlKey
    // meant the documented Cmd+Enter shortcut never fired
    if (e.ctrlKey || e.metaKey) {
      submitRequest();
    }
    return;
  }

  if (!e.ctrlKey) {
    return;
  }
  const query = preCannedQueries[e.key];
  if (query) {
    document.getElementById('user-input').value = query;
    submitRequest();
  }
});

// Models selectable through the `model` query parameter (see getConfig()), keyed by
// the value used in the URL.
//   name         - label written to the status log by LLM.load(); Query() also checks it
//                  to choose the prompt format
//   path         - repository path; LLM.load() appends it to "https://huggingface.co/"
//                  (or to "models/" when running with local=1)
//   externaldata - when true, LLM.load() additionally downloads
//                  "onnx/decoder_model_merged.onnx.data" and passes it to the session
const MODELS = {
  "tinyllama": { name: "tinyllama", path: "schmuell/TinyLlama-1.1B-Chat-v1.0-int4" },
  "tinyllama_fp16": { name: "tinyllama-fp16", path: "schmuell/TinyLlama-1.1B-Chat-v1.0-fp16", externaldata: true },
  "phi2": { name: "phi2", path: "schmuell/phi2-int4" },
  "phi3": { name: "phi3", path: "schmuell/phi3-int4", externaldata: true },
  "stablelm": { name: "stablelm", path: "schmuell/stablelm-2-zephyr-1_6b-int4" },
}

/**
 * Builds the runtime configuration from the page's query string.
 *
 * Every `key=value` pair must name one of the known options below. Options
 * whose default is a number are parsed as base-10 integers; all others are
 * kept as (URI-decoded) strings. If the resulting `model` is a key of
 * `MODELS`, it is replaced by the corresponding model descriptor; otherwise
 * the string is left as given.
 *
 * @returns {object} The configuration object.
 * @throws {Error} On an unknown option, an option without a value, or a
 *   numeric option whose value is not an integer.
 */
function getConfig() {
  const query = window.location.search.substring(1);
  const config = {
    model: "phi3",
    provider: "webgpu",
    profiler: 0,
    verbose: 0,
    threads: 1,
    csv: 0,
    max_tokens: 512,
    local: 0,
  };

  for (const part of query.split("&")) {
    // split on the FIRST "=" only, so values may themselves contain "="
    const separator = part.indexOf("=");
    const key = separator === -1 ? part : part.slice(0, separator);
    if (key.length === 0) {
      continue;
    }
    // own-property check: `in` would also accept inherited names such as "toString"
    if (!Object.hasOwn(config, key)) {
      throw new Error("unknown argument: " + key);
    }
    if (separator === -1) {
      throw new Error("missing value for argument: " + key);
    }

    const value = decodeURIComponent(part.slice(separator + 1));
    if (typeof config[key] === "number") {
      const parsed = Number.parseInt(value, 10);
      // a NaN here would silently disable or break the option further down the line
      if (Number.isNaN(parsed)) {
        throw new Error(`invalid number for argument ${key}: ${value}`);
      }
      config[key] = parsed;
    } else {
      config[key] = value;
    }
  }

  if (Object.hasOwn(MODELS, config.model)) {
    config.model = MODELS[config.model];
  }
  return config;
}

/**
 * Downloads `url` as an ArrayBuffer, going through the "onnx" Cache Storage
 * bucket so that repeated page loads do not download the file again.
 *
 * If anything in the cache path fails, the file is fetched directly without
 * caching. Each attempt is reported through log() as "(cached)" or "(network)".
 *
 * @param {string} url - Resource to download.
 * @returns {Promise<ArrayBuffer>} The response body.
 * @throws {Error} When the uncached fallback request returns a non-2xx status.
 */
async function fetchAndCache(url) {
  try {
    const cache = await caches.open("onnx");
    let cachedResponse = await cache.match(url);
    if (cachedResponse == null) {
      await cache.add(url);
      cachedResponse = await cache.match(url);
      log(`${url} (network)`);
    } else {
      log(`${url} (cached)`);
    }
    return await cachedResponse.arrayBuffer();
  } catch (error) {
    // best-effort fallback: the cache is unavailable or could not store the response
    log(`${url} (network)`);
    const response = await fetch(url);
    // fetch() resolves for HTTP errors too; without this check an error page would
    // be handed to the caller as if it were the requested file
    if (!response.ok) {
      throw new Error(`failed to fetch ${url}: ${response.status} ${response.statusText}`);
    }
    return await response.arrayBuffer();
  }
}

/**
 * Greedy text generation on top of an onnxruntime-web decoder session that
 * exposes `past_key_values.*` inputs and `present.*` outputs (a key/value cache).
 *
 * Usage: `await llm.load(model, options)` once, then `await llm.generate(...)`
 * per prompt; `llm.abort()` stops a running generation after its current step.
 */
class LLM {
  sess = undefined;          // inference session, set by load()
  profiler = false;          // truthy: profiling on, per-token callbacks suppressed
  feed = {};                 // session inputs, reused and updated across steps
  output_tokens = [];        // prompt tokens followed by generated tokens (BigInt)
  eos = 2;                   // end-of-sequence token id, overwritten from the model config
  need_position_ids = true;  // whether a `position_ids` input is fed
  stop = false;              // set by abort(), checked once per generated token
  kv_dims = [];              // dims of an empty key/value cache tensor
  dtype = "float16";         // element type of the key/value cache tensors
  max_tokens = 256;
  num_layers = 0;            // number of decoder layers, set by load()

  constructor() {
  }

  /**
   * Downloads the model files, creates the inference session and prepares an
   * empty key/value cache.
   *
   * @param {object} model - Model descriptor: `name`, `path`, and optionally
   *   `externaldata` (truthy if the weights live in a separate `.onnx.data` file)
   *   and `dtype`.
   * @param {object} options - `provider` (default "webgpu"), `verbose`,
   *   `profiler`, `local` (truthy: load from "models/" instead of huggingface.co).
   * @throws {Error} If the "webgpu" provider is requested but `navigator.gpu` is missing.
   */
  async load(model, options) {
    const provider = options.provider || "webgpu";
    const verbose = options.verbose;
    const local = options.local;
    this.profiler = options.profiler;

    const model_path = (local) ? "models/" + model.path : "https://huggingface.co/" + model.path + "/resolve/main";

    log(`loading... ${model.name},  ${provider}`);
    const json_bytes = await fetchAndCache(model_path + "/config.json");
    const model_config = JSON.parse(new TextDecoder().decode(json_bytes));

    const model_bytes = await fetchAndCache(model_path + "/onnx/decoder_model_merged.onnx");
    // undefined (not false) when the model has no external data file, so the
    // checks below can tell "absent" apart from a downloaded buffer
    const externaldata = (model.externaldata)
      ? await fetchAndCache(model_path + '/onnx/decoder_model_merged.onnx.data')
      : undefined;
    let modelSize = model_bytes.byteLength;
    if (externaldata !== undefined) {
      modelSize += externaldata.byteLength;
    }
    log(`model size ${Math.round(modelSize / 1024 / 1024)} MB`);

    const opt = {
      executionProviders: [provider],
      preferredOutputLocation: {},
    };

    if (provider === "webgpu") {
      if (!("gpu" in navigator)) {
        throw new Error("webgpu is NOT supported");
      }
      // ask the session to leave the cache outputs in GPU buffers
      for (let i = 0; i < model_config.num_hidden_layers; ++i) {
        opt.preferredOutputLocation[`present.${i}.key`] = 'gpu-buffer';
        opt.preferredOutputLocation[`present.${i}.value`] = 'gpu-buffer';
      }
    }

    if (externaldata !== undefined) {
      opt.externalData = [
        {
          data: externaldata,
          path: 'decoder_model_merged.onnx.data'
        },
      ];
    }
    if (verbose) {
      opt.logSeverityLevel = 0;
      opt.logVerbosityLevel = 0;
      ort.env.logLevel = "verbose";
    }

    ort.env.webgpu.profiling = {};
    if (this.profiler) {
      opt.enableProfiling = true;
      ort.env.webgpu.profilingMode = 'default';
      ort.env.webgpu.profiling.mode = 'default';
    }

    this.sess = await ort.InferenceSession.create(model_bytes, opt);
    this.eos = model_config.eos_token_id;
    this.kv_dims = [1, model_config.num_key_value_heads, 0, model_config.hidden_size / model_config.num_attention_heads];
    // use the descriptor passed in, not a module-level global
    this.dtype = model.dtype || "float16";
    this.num_layers = model_config.num_hidden_layers;
    this.initilize_feed();
  }

  /**
   * Resets generation state: releases GPU-resident cache tensors held by the
   * current feed, then installs a fresh feed with empty key/value cache
   * tensors and clears `output_tokens`.
   */
  initilize_feed() {
    // the feed owns the cache tensors of the previous generation; free them
    // before dropping the last reference
    for (const [name, tensor] of Object.entries(this.feed)) {
      if (name.startsWith('past_key_values.') && tensor?.location === 'gpu-buffer') {
        tensor.dispose();
      }
    }

    this.feed = {};
    const empty = (this.dtype === "float16") ? new Uint16Array() : [];
    for (let i = 0; i < this.num_layers; ++i) {
      this.feed[`past_key_values.${i}.key`] = new ort.Tensor(this.dtype, empty, this.kv_dims);
      this.feed[`past_key_values.${i}.value`] = new ort.Tensor(this.dtype, empty, this.kv_dims);
    }
    this.output_tokens = [];
  }

  /**
   * Index of the largest logit for the last sequence position.
   *
   * @param {object} t - Logits tensor; `dims[1]` is used as the sequence length
   *   and `dims[2]` as the number of candidates per position.
   * @returns {number} Index (within the last position) of the maximum value.
   * @throws {Error} If a logit of the last position is not finite.
   */
  argmax(t) {
    const arr = t.data;
    const start = t.dims[2] * (t.dims[1] - 1);
    let max = arr[start];
    let maxidx = 0;

    for (let i = 0; i < t.dims[2]; i++) {
      const val = arr[i + start];
      if (!Number.isFinite(val)) {
        throw new Error("found infinitive in logits");
      }
      if (val > max) {
        max = val;
        maxidx = i;
      }
    }
    return maxidx;
  }

  /**
   * Moves every `present*` output into the matching `past_key_values*` feed
   * entry, disposing the GPU buffer of the tensor it replaces.
   *
   * @param {object} feed - Session inputs to update in place.
   * @param {object} outputs - Outputs of the last session run.
   */
  update_kv_cache(feed, outputs) {
    for (const name in outputs) {
      if (name.startsWith('present')) {
        const newName = name.replace('present', 'past_key_values');
        // free old gpu buffer
        const previous = feed[newName];
        if (previous?.location === 'gpu-buffer') {
          previous.dispose();
        }
        feed[newName] = outputs[name];
      }
    }
  }

  /** Requests that a running generate() stop after its current step. */
  abort() {
    this.stop = true;
  }

  /**
   * Generates tokens greedily until the end-of-sequence token is produced,
   * the total sequence length reaches `max_tokens`, or abort() is called.
   *
   * @param {Array<number|bigint>} tokens - Prompt token ids.
   * @param {function(Array<bigint>): void} [callback] - Called after every
   *   generated token with the full token list (not called while profiling).
   * @param {object} [options] - `max_tokens` (default 256) and `keep_cache`
   *   (truthy: continue from the existing cache and token list instead of
   *   starting over).
   * @returns {Promise<Array<bigint>>} Prompt tokens followed by generated tokens.
   */
  async generate(tokens, callback, options = {}) {
    const keep_cache = options.keep_cache;
    const max_tokens = options.max_tokens || 256;
    this.stop = false;

    if (!keep_cache) {
      this.initilize_feed();
    }
    // take the reference only AFTER a possible reset: initilize_feed() replaces this.feed
    const feed = this.feed;

    const input_ids = new ort.Tensor('int64', BigInt64Array.from(tokens.map(BigInt)), [1, tokens.length]);
    feed['input_ids'] = input_ids;
    // the tensor itself is not iterable; its data is
    for (const token of input_ids.data) {
      this.output_tokens.push(token);
    }

    let last_token = 0n;
    let seqlen = this.output_tokens.length;
    if (this.need_position_ids) {
      // positions of the new tokens continue after whatever is already in output_tokens
      // NOTE(review): the last token generated by a previous call is recorded in
      // output_tokens but was never run through the session, so with keep_cache the
      // cache may be one entry shorter than these positions assume — verify before
      // relying on keep_cache.
      const input_len = tokens.length;
      const first_position = seqlen - input_len;
      feed['position_ids'] = new ort.Tensor(
        'int64',
        BigInt64Array.from({ length: input_len }, (_, i) => BigInt(first_position + i)),
        [1, input_len],
      );
    }

    // loose comparison on purpose: last_token is a BigInt, eos comes from JSON as a number
    while (last_token != this.eos && seqlen < max_tokens && !this.stop) {
      seqlen = this.output_tokens.length;
      feed['attention_mask'] = new ort.Tensor('int64', BigInt64Array.from({ length: seqlen }, () => 1n), [1, seqlen]);
      const outputs = await this.sess.run(feed);
      last_token = BigInt(this.argmax(outputs.logits));
      this.output_tokens.push(last_token);
      if (callback && !this.profiler) {
        callback(this.output_tokens);
      }
      this.update_kv_cache(feed, outputs);
      // from now on only the newest token is fed; earlier ones live in the cache
      feed['input_ids'] = new ort.Tensor('int64', BigInt64Array.from([last_token]), [1, 1]);
      if (this.need_position_ids) {
        feed['position_ids'] = new ort.Tensor('int64', BigInt64Array.from([BigInt(seqlen)]), [1, 1]);
      }
    }
    if (this.profiler) {
      this.sess.endProfiling();
    }
    return this.output_tokens;
  }
}


// Module-level setup; the statements below run once, in this order, when the module loads.

// Runtime options parsed from the page URL.
const config = getConfig();
// Assigned by LoadModel(); stays undefined until the tokenizer has been loaded.
let tokenizer;

// transformers.js: with local=1 only local files under 'models' are allowed,
// with local=0 only remote models are allowed.
env.localModelPath = 'models';
env.allowRemoteModels = config.local == 0;
env.allowLocalModels = config.local == 1;
// onnxruntime-web wasm settings: thread count from the URL, SIMD on, and the wasm
// files looked up in the 'dist/' folder next to the page.
ort.env.wasm.numThreads = config.threads;
ort.env.wasm.simd = true;
ort.env.wasm.wasmPaths = document.location.pathname.replace('index.html', '') + 'dist/';

// The single model instance shared by LoadModel(), Query() and submitRequest().
const llm = new LLM();

/**
 * Decodes the tokens from `startidx` onwards into text, skipping special tokens.
 *
 * @param {object} tokenizer - Object providing `decode(tokens, options)`.
 * @param {Array} tokens - Full token list.
 * @param {number} startidx - Index of the first token to decode.
 * @returns {string} Whatever `tokenizer.decode` returns for that slice.
 */
function token_to_text(tokenizer, tokens, startidx) {
  const remaining = tokens.slice(startidx);
  return tokenizer.decode(remaining, { skip_special_tokens: true });
}

/**
 * Runs one prompt through the model and streams the decoded answer.
 *
 * The user text is wrapped in a prompt format chosen by `config.model.name`,
 * tokenized, and passed to `llm.generate()`. `cb` receives the decoded text
 * generated so far after every token, and once more with the final text.
 * Throughput is written to the console.
 *
 * @param {string} query - The user's message.
 * @param {function(string): void} cb - Receives the answer text decoded so far.
 * @returns {Promise<void>}
 */
async function Query(query, cb) {
  let prompt;

  if (config.model.name === 'phi2') {
    prompt = `User:${query}\nAssistant:`;
  } else if (config.model.name === 'phix') {
    prompt = query;
  } else {
    // no leading quote character: a stray `"` at the start of this template
    // used to be sent to the model as part of the prompt
    prompt = `<|system|>\nYou are a friendly assistant.</s>\n<|user|>\n${query}</s>\n<|assistant|>\n`;
  }

  const { input_ids } = await tokenizer(prompt, { return_tensor: false, padding: true, truncation: true });

  const start_timer = performance.now();
  // decode only what follows the prompt tokens
  const output_tokens = await llm.generate(input_ids, (tokens_so_far) => {
    cb(token_to_text(tokenizer, tokens_so_far, input_ids.length));
  }, { max_tokens: config.max_tokens });

  const took = (performance.now() - start_timer) / 1000;
  const txt = token_to_text(tokenizer, output_tokens, input_ids.length);
  cb(txt);
  // seqlen counts prompt tokens as well as generated ones
  const seqlen = output_tokens.length;
  const perf = `${seqlen} tokens in ${took.toFixed(1)}sec, ${(seqlen / took).toFixed(2)} tokens/sec`;
  console.log(perf);
}


/**
 * Loads the tokenizer and then the model selected in `config`, reporting
 * progress through log(). Any failure is passed to log() and not rethrown,
 * so the returned promise always resolves.
 *
 * @returns {Promise<void>}
 */
async function LoadModel() {
  try {
    const { model, provider, profiler, verbose, local, max_tokens } = config;

    tokenizer = await AutoTokenizer.from_pretrained(model.path);

    log("Loading model...");
    const loadOptions = { provider, profiler, verbose, local, max_tokens };
    await llm.load(model, loadOptions);
    log("Ready.");
  } catch (error) {
    log(error);
  }
}

/**
 * Checks whether the WebGPU adapter reports the 'shader-f16' feature.
 *
 * @returns {Promise<boolean>} The adapter's answer, or false if the adapter
 *   cannot be obtained or queried for any reason.
 */
async function hasFp16() {
  try {
    const { features } = await navigator.gpu.requestAdapter();
    return features.has('shader-f16');
  } catch {
    return false;
  }
}

// Page start-up: check for fp16 shader support, load the model, then enable the chat UI.
window.onload = () => {
  hasFp16().then((fp16) => {
    if (!fp16) {
      log("Your GPU or Browser doesn't support webgpu/f16");
      return undefined;
    }
    return LoadModel().then(() => {
      // LoadModel() writes failures to the status area and resolves anyway. Without a
      // session there is nothing to chat with, so leave the status (and the error in
      // it) visible instead of hiding it and enabling the send button.
      if (llm.sess === undefined) {
        return;
      }
      adjustPadding();
      sendButton.addEventListener('click', submitRequest);
      const userInput = document.getElementById('user-input');
      document.getElementById("status").style.display = "none";
      userInput.focus();
    });
  }).catch((error) => {
    // unexpected failure while wiring up the UI: surface it rather than losing it
    log(error);
  });
}