Spaces:
Build error
Build error
File size: 3,755 Bytes
70023bd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 |
import encoder from "./encoder.js";
import bpe_file from "./vocab.bpe.js";
const range = (x, y) => {
const res = Array.from(Array(y).keys()).slice(x)
return res
}
const ord = x => {
return x.charCodeAt(0)
}
const chr = x => {
return String.fromCharCode(x)
}
const textEncoder = new TextEncoder("utf-8")
const encodeStr = str => {
return Array.from(textEncoder.encode(str)).map(x => x.toString())
}
const textDecoder = new TextDecoder("utf-8")
const decodeStr = arr => {
return textDecoder.decode(new Uint8Array(arr));
}
const dictZip = (x, y) => {
const result = {}
x.map((_, i) => { result[x[i]] = y[i] })
return result
}
function bytes_to_unicode() {
const bs = range(ord('!'), ord('~') + 1).concat(range(ord('¡'), ord('¬') + 1), range(ord('®'), ord('ÿ') + 1))
let cs = bs.slice()
let n = 0
for (let b = 0; b < 2 ** 8; b++) {
if (!bs.includes(b)) {
bs.push(b)
cs.push(2 ** 8 + n)
n = n + 1
}
}
cs = cs.map(x => chr(x))
const result = {}
bs.map((_, i) => { result[bs[i]] = cs[i] })
return result
}
function get_pairs(word) {
const pairs = new Set()
let prev_char = word[0]
for (let i = 1; i < word.length; i++) {
const char = word[i]
pairs.add([prev_char, char])
prev_char = char
}
return pairs
}
const pat = /'s|'t|'re|'ve|'m|'l l|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+/gu
const decoder = {}
Object.keys(encoder).map(x => { decoder[encoder[x]] = x })
const lines = bpe_file.split('\n')
// bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split("\n")[1:-1]]
const bpe_merges = lines.slice(1, lines.length - 1).map(x => {
return x.split(/(\s+)/).filter(function(e) { return e.trim().length > 0 })
})
const byte_encoder = bytes_to_unicode()
const byte_decoder = {}
Object.keys(byte_encoder).map(x => { byte_decoder[byte_encoder[x]] = x })
const bpe_ranks = dictZip(bpe_merges, range(0, bpe_merges.length))
const cache = {}
function bpe(token) {
if (Object.hasOwn(cache, token)) {
return cache[token]
}
let word = token.split('')
let pairs = get_pairs(word)
if (!pairs) {
return token
}
while (true) {
const minPairs = {}
Array.from(pairs).map(pair => {
const rank = bpe_ranks[pair]
minPairs[(isNaN(rank) ? 10e10 : rank)] = pair
})
const bigram = minPairs[Math.min(...Object.keys(minPairs).map(x => {
return parseInt(x)
}
))]
if (!(Object.hasOwn(bpe_ranks, bigram))) {
break
}
const first = bigram[0]
const second = bigram[1]
let new_word = []
let i = 0
while (i < word.length) {
const j = word.indexOf(first, i)
if (j === -1) {
new_word = new_word.concat(word.slice(i))
break
}
new_word = new_word.concat(word.slice(i, j))
i = j
if (word[i] === first && i < word.length - 1 && word[i + 1] === second) {
new_word.push(first + second)
i = i + 2
} else {
new_word.push(word[i])
i = i + 1
}
}
word = new_word
if (word.length === 1) {
break
} else {
pairs = get_pairs(word)
}
}
word = word.join(' ')
cache[token] = word
return word
}
export function encode(text) {
let bpe_tokens = []
const matches = Array.from(text.matchAll(pat)).map(x => x[0])
for (let token of matches) {
token = encodeStr(token).map(x => {
return byte_encoder[x]
}).join('')
const new_tokens = bpe(token).split(' ').map(x => encoder[x])
bpe_tokens = bpe_tokens.concat(new_tokens)
}
return bpe_tokens
}
export function decode(tokens) {
let text = tokens.map(x => decoder[x]).join('')
text = decodeStr(text.split('').map(x => byte_decoder[x]))
return text
}
|