aaditya committed on
Commit
a177e53
·
1 Parent(s): bbdfd97

Add application file

Browse files
Files changed (2) hide show
  1. app.py +20 -0
  2. encoder_file.py +150 -0
app.py CHANGED
@@ -1,7 +1,11 @@
1
  from flask import Flask, request
2
  from flask_restful import Resource, Api, reqparse
3
  from flask_cors import CORS
 
4
 
 
 
 
5
  app = Flask(__name__)
6
  CORS(app)
7
  api = Api(app)
@@ -29,5 +33,21 @@ class Data(Resource):
29
  api.add_resource(Status, '/')
30
  api.add_resource(Data, '/data')
31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  if __name__ == '__main__':
33
  app.run(debug=True)
 
1
from flask import Flask, request, jsonify
from flask_restful import Resource, Api, reqparse
from flask_cors import CORS
from encoder_file import get_encoder

# NOTE(review): removed the stray `len(encoder.encode(t))` debug line that was
# here — `t` was never defined and `encoder` was not yet bound, so importing
# this module raised a NameError before the app could start. Also added
# `jsonify` to the flask import: the route handlers in this file call it but
# it was never imported.

# Build the BPE encoder once at startup so every request reuses it.
encoder = get_encoder()

app = Flask(__name__)
CORS(app)  # allow cross-origin requests (browser front-ends)
api = Api(app)
 
33
# Register the REST resources. Status and Data are Resource subclasses
# defined earlier in this file (outside this hunk).
api.add_resource(Status, '/')
api.add_resource(Data, '/data')
35
 
36
+
37
@app.route('/get_sample', methods=['POST'])
def _inference():
    """Tokenize the posted JSON "data" field with the BPE encoder.

    Returns {"result": [token ids]} on success, or the sentinel
    {"result": "-1"} if anything goes wrong (missing key, bad JSON, ...).
    """
    try:
        tokens = encoder.encode(request.json["data"])
        return jsonify({"result": tokens})
    except Exception as e:
        # Best-effort endpoint: log the error and report the sentinel value.
        print(e)
        return jsonify({"result": "-1"})
47
+
48
@app.route('/checking', methods=['GET'])
def check():
    """Health-check endpoint: always reports the service as up."""
    body = {"status": 'working'}
    return jsonify(body)
51
+
52
if __name__ == '__main__':
    # Flask development server; debug=True is for local use only.
    app.run(debug=True)
encoder_file.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from functools import lru_cache
3
+
4
+ import regex as re
5
+
6
+ import os
7
+ from datetime import datetime
8
+ from email.utils import formatdate, parsedate_to_datetime
9
+
10
+ import requests
11
+ from appdirs import user_cache_dir
12
+
13
+
14
def download(url, destination_file):
    """Download *url* into the promptify user cache dir, with HTTP caching.

    Sends If-Modified-Since when a cached copy exists; on 304 Not Modified
    the cached file is reused. Returns the absolute path of the cached file.

    Bug fix: the 304 branch previously returned None, which made callers
    (e.g. get_encoder) crash with `open(None)` on every cache hit.
    """
    headers = {}

    path = user_cache_dir("promptify")
    os.makedirs(path, exist_ok=True)  # idempotent, race-free mkdir

    destination_file = os.path.join(path, destination_file)

    # Conditional request when we already hold a cached copy.
    if os.path.exists(destination_file):
        mtime = os.path.getmtime(destination_file)
        headers["if-modified-since"] = formatdate(mtime, usegmt=True)

    response = requests.get(url, headers=headers, stream=True)
    response.raise_for_status()

    if response.status_code == requests.codes.not_modified:
        # Cache hit: the existing file is still current.
        return destination_file

    if response.status_code == requests.codes.ok:
        with open(destination_file, "wb") as f:
            for chunk in response.iter_content(chunk_size=1048576):
                f.write(chunk)

        # Mirror the server's Last-Modified as the file mtime so the next
        # conditional request compares against the server's timestamp.
        if last_modified := response.headers.get("last-modified"):
            new_mtime = parsedate_to_datetime(last_modified).timestamp()
            os.utime(destination_file, times=(datetime.now().timestamp(), new_mtime))
    return destination_file
43
+
44
# Remote vocab/merges files (GPT-3-Encoder repo); download() caches them locally.
bpe_file = {"filename": "vocab.bpe", "link": "https://github.com/syonfox/GPT-3-Encoder/raw/master/vocab.bpe"}
encoder_file = {"filename": "encoder.json", "link": "https://github.com/syonfox/GPT-3-Encoder/raw/master/encoder.json"}
46
+
47
+
48
@lru_cache()
def bytes_to_unicode():
    """Map every byte value (0-255) to a printable unicode character.

    Printable latin-1 bytes map to themselves; the remaining bytes are
    assigned characters from 256 upward, so no byte maps to whitespace or
    a control character. Cached: the table never changes.
    """
    printable = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    codes = {byte: byte for byte in printable}
    shift = 0
    for byte in range(256):
        if byte not in codes:
            codes[byte] = 256 + shift
            shift += 1
    return {byte: chr(code) for byte, code in codes.items()}
63
+
64
+
65
def get_pairs(word):
    """Return the set of adjacent symbol pairs in *word*.

    Args:
        word: a sequence of symbols (e.g. a tuple of strings).

    Returns:
        Set of (symbol, next_symbol) tuples; empty for inputs of length < 2.
        (The original indexed word[0] unconditionally and raised IndexError
        on an empty input.)
    """
    return set(zip(word, word[1:]))
72
+
73
+
74
class Encoder:
    """GPT-2-style byte-pair encoder mapping text <-> integer token ids."""

    def __init__(self, encoder, bpe_merges, errors="replace"):
        # encoder: token-string -> id mapping (loaded from encoder.json).
        self.encoder = encoder
        self.decoder = {v: k for k, v in self.encoder.items()}
        # How undecodable byte sequences are handled when decoding to text.
        self.errors = errors
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        # Lower rank == earlier merge rule == higher merge priority.
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
        self.cache = {}
        # Pre-tokenizer: contractions, letter runs, digit runs, punctuation
        # runs, and whitespace (uses the third-party `regex` module for \p).
        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")

    def bpe(self, token):
        """Apply the ranked BPE merges to one pre-tokenized string.

        Returns the merged symbols joined by spaces; results are memoized
        per token in self.cache.
        """
        if token in self.cache:
            return self.cache[token]
        word = tuple(token)

        pairs = get_pairs(word)

        if not pairs:
            return token

        while True:
            # Merge the best-ranked adjacent pair; stop when none remain.
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                    new_word.extend(word[i:j])
                    i = j
                # Bug fix: was a bare `except:`, which also swallowed
                # KeyboardInterrupt/SystemExit. word.index raises ValueError
                # when `first` is not found — catch exactly that.
                except ValueError:
                    new_word.extend(word[i:])
                    break

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)

        word = " ".join(word)
        self.cache[token] = word
        return word

    def encode(self, text):
        """Return the list of BPE token ids for *text*."""
        bpe_tokens = []
        for token in re.findall(self.pat, text):
            # Map raw UTF-8 bytes to the printable byte alphabet first.
            token = "".join(self.byte_encoder[b] for b in token.encode("utf-8"))

            bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(" "))
        return bpe_tokens

    def decode(self, tokens):
        """Inverse of encode(): token ids back to a text string."""
        text = "".join([self.decoder[token] for token in tokens])
        text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
        return text
140
+
141
+
142
def get_encoder():
    """Fetch (or reuse cached) vocab files and build a ready-to-use Encoder."""
    encoder_filename = download(encoder_file["link"], encoder_file["filename"])
    bpe_filename = download(bpe_file["link"], bpe_file["filename"])
    # Consistency/portability fix: read encoder.json as UTF-8 explicitly,
    # matching the merges-file read below (the default encoding is
    # platform-dependent).
    with open(encoder_filename, "r", encoding="utf-8") as f:
        encoder = json.load(f)
    with open(bpe_filename, "r", encoding="utf-8") as f:
        bpe_data = f.read()
    # First line of vocab.bpe is a version header; the last line is empty.
    bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split("\n")[1:-1]]
    return Encoder(encoder=encoder, bpe_merges=bpe_merges)