File size: 4,791 Bytes
a177e53 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 |
import json
from functools import lru_cache
import regex as re
import os
from datetime import datetime
from email.utils import formatdate, parsedate_to_datetime
import requests
from appdirs import user_cache_dir
def download(url, destination_file):
    """Download ``url`` into the promptify user cache directory.

    When a cached copy already exists, its modification time is sent as an
    ``If-Modified-Since`` header so the server can answer with
    304 Not Modified instead of resending the body.

    Args:
        url: Address of the resource to fetch.
        destination_file: File name (relative to the cache directory) under
            which the resource is stored.

    Returns:
        The full path of the cached file. This is returned both when the file
        was freshly downloaded and when the server reported that the cached
        copy is still current.

    Raises:
        Whatever ``requests.get`` or ``response.raise_for_status()`` raise for
        connection failures and error status codes.
    """
    headers = {}
    path = user_cache_dir("promptify")
    # exist_ok avoids the check-then-create race between concurrent callers.
    os.makedirs(path, exist_ok=True)
    destination_file = os.path.join(path, destination_file)
    if os.path.exists(destination_file):
        mtime = os.path.getmtime(destination_file)
        headers["if-modified-since"] = formatdate(mtime, usegmt=True)

    # The context manager releases the streamed connection on every exit path.
    with requests.get(url, headers=headers, stream=True) as response:
        response.raise_for_status()

        if response.status_code == requests.codes.not_modified:
            # The cached copy is still valid; callers need its path, not None.
            return destination_file

        if response.status_code == requests.codes.ok:
            # Stream into a sibling temp file and rename it into place so an
            # interrupted transfer never leaves a truncated cache entry that
            # would later be reported as "not modified".
            partial_file = destination_file + ".part"
            with open(partial_file, "wb") as f:
                for chunk in response.iter_content(chunk_size=1048576):
                    f.write(chunk)
            os.replace(partial_file, destination_file)

            # Mirror the server's Last-Modified time onto the file so the next
            # conditional request compares against the server's own clock.
            if last_modified := response.headers.get("last-modified"):
                new_mtime = parsedate_to_datetime(last_modified).timestamp()
                os.utime(destination_file, times=(datetime.now().timestamp(), new_mtime))
    return destination_file
# Cache file name and download location of the BPE merge table.
bpe_file = dict(
    filename="vocab.bpe",
    link="https://github.com/syonfox/GPT-3-Encoder/raw/master/vocab.bpe",
)

# Cache file name and download location of the token -> id vocabulary.
encoder_file = dict(
    filename="encoder.json",
    link="https://github.com/syonfox/GPT-3-Encoder/raw/master/encoder.json",
)
@lru_cache()
def bytes_to_unicode():
    """Return a dict mapping every byte value (0-255) to a one-character string.

    Byte values inside the three ranges ``!``-``~``, ``¡``-``¬`` and ``®``-``ÿ``
    map to the character with the same code point. Every other byte value is
    mapped, in ascending byte order, to consecutive code points starting
    at 256. The result is memoized by ``lru_cache``.
    """
    kept_as_is = [
        *range(ord("!"), ord("~") + 1),
        *range(ord("¡"), ord("¬") + 1),
        *range(ord("®"), ord("ÿ") + 1),
    ]
    byte_values = list(kept_as_is)
    code_points = list(kept_as_is)

    shifted = 0
    for byte in range(256):
        if byte in kept_as_is:
            continue
        # Remaining bytes are relocated above the single-byte range.
        byte_values.append(byte)
        code_points.append(256 + shifted)
        shifted += 1

    return {byte: chr(point) for byte, point in zip(byte_values, code_points)}
def get_pairs(word):
    """Return the set of adjacent symbol pairs in ``word``.

    Args:
        word: A sliceable sequence of symbols, such as a string or a tuple
            of strings.

    Returns:
        A set of ``(left, right)`` tuples, one for each pair of neighbouring
        symbols. Empty and single-symbol sequences yield an empty set.
    """
    # Zipping the sequence with itself shifted by one pairs each symbol with
    # its successor, and produces nothing for sequences shorter than two.
    return set(zip(word, word[1:]))
class Encoder:
    """Byte-pair-encoding tokenizer built from a vocabulary and a merge list."""

    def __init__(self, encoder, bpe_merges, errors="replace"):
        """Set up the lookup tables used by ``encode`` and ``decode``.

        Args:
            encoder: Mapping from BPE token string to token id.
            bpe_merges: Sequence of symbol pairs; a pair's position in the
                sequence is its merge rank (lower rank merges first).
            errors: Error-handling mode passed to ``bytes.decode`` in
                ``decode``.
        """
        self.encoder = encoder
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.errors = errors
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
        # Memoizes bpe() results per token string; grows without bound.
        self.cache = {}
        # Splits text into contractions, letter runs, digit runs, other
        # non-space runs, and whitespace. \p{...} needs the `regex` module.
        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")

    def bpe(self, token):
        """Apply the ranked merges to ``token`` and return the merged symbols.

        Args:
            token: String whose characters are the initial symbols.

        Returns:
            The resulting symbols joined by single spaces. A token with no
            adjacent pairs is returned unchanged (and is not cached).
        """
        if token in self.cache:
            return self.cache[token]
        word = tuple(token)
        pairs = get_pairs(word)

        if not pairs:
            return token

        while True:
            # Pick the adjacent pair with the lowest merge rank.
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            if bigram not in self.bpe_ranks:
                # No remaining pair is a known merge.
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    # `first` does not occur again: copy the tail unchanged.
                    new_word.extend(word[i:])
                    break
                new_word.extend(word[i:j])
                i = j

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            word = tuple(new_word)
            if len(word) == 1:
                break
            pairs = get_pairs(word)
        word = " ".join(word)
        self.cache[token] = word
        return word

    def encode(self, text):
        """Convert ``text`` into a list of token ids.

        Raises:
            KeyError: If a produced BPE token is missing from the vocabulary.
        """
        bpe_tokens = []
        for token in self.pat.findall(text):
            # Re-express the token's UTF-8 bytes as the stand-in characters
            # from the byte table before applying the merges.
            token = "".join(self.byte_encoder[b] for b in token.encode("utf-8"))
            bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(" "))
        return bpe_tokens

    def decode(self, tokens):
        """Convert an iterable of token ids back into text.

        Bytes that do not form valid UTF-8 are handled according to
        ``self.errors``.

        Raises:
            KeyError: If a token id is not present in the vocabulary.
        """
        text = "".join(self.decoder[token] for token in tokens)
        return bytearray(self.byte_decoder[c] for c in text).decode("utf-8", errors=self.errors)
def get_encoder():
    """Build an ``Encoder`` from the downloaded vocabulary and merge files.

    Both files are obtained through ``download`` using the locations in
    ``encoder_file`` and ``bpe_file``.

    Returns:
        An ``Encoder`` constructed from the parsed vocabulary and merges.
    """
    encoder_filename = download(encoder_file["link"], encoder_file["filename"])
    bpe_filename = download(bpe_file["link"], bpe_file["filename"])

    # Read both files as UTF-8 so the result does not depend on the platform's
    # default locale encoding.
    with open(encoder_filename, "r", encoding="utf-8") as f:
        encoder = json.load(f)
    with open(bpe_filename, "r", encoding="utf-8") as f:
        bpe_data = f.read()

    # The first line is skipped as a header. Blank lines are ignored, so the
    # final merge is kept even if the file lacks a trailing newline.
    merge_lines = bpe_data.split("\n")[1:]
    bpe_merges = [tuple(merge_str.split()) for merge_str in merge_lines if merge_str.strip()]
    return Encoder(encoder=encoder, bpe_merges=bpe_merges)