anasampa2 committed on
Commit
be8596b
·
verified ·
1 Parent(s): ee0ec3d

Added wembedding_service folder.

Browse files

Added the wembedding_service folder that is running in the VM.

wembedding_service/compute_wembeddings.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ #
3
+ # Copyright 2020 Institute of Formal and Applied Linguistics, Faculty of
4
+ # Mathematics and Physics, Charles University, Czech Republic.
5
+ #
6
+ # This Source Code Form is subject to the terms of the Mozilla Public
7
+ # License, v. 2.0. If a copy of the MPL was not distributed with this
8
+ # file, You can obtain one at http://mozilla.org/MPL/2.0/.
9
+
10
+ import sys
11
+ import zipfile
12
+
13
+ import numpy as np
14
+
15
+ import wembeddings.wembeddings as wembeddings
16
+
17
def load_sentences(lines, input_format):
    """Collect the word forms of every sentence from CoNLL / CoNLL-U lines.

    Sentences are separated by empty lines; consecutive empty lines do not
    produce empty sentences.

    Arguments:
      lines: iterable of text lines (for example an open text file).
      input_format: "conll" (the form is the first tab-separated column of
        every non-empty line) or "conllu" (the form is the second column of
        lines whose first column is a plain integer; comments, multi-word
        token ranges such as "1-2" and empty nodes such as "1.1" are skipped).
    Returns:
      a list of sentences, each being a list of word forms (strings).
    Raises:
      ValueError: if a CoNLL-U token line does not have exactly 10 columns.
    """
    sentences = []
    in_sentence = False
    for line in lines:
        line = line.rstrip("\n")
        if not line:
            # An empty line terminates the current sentence.
            in_sentence = False
            continue

        if not in_sentence:
            sentences.append([])
            in_sentence = True

        columns = line.split("\t")
        if input_format == "conll":
            sentences[-1].append(columns[0])
        elif input_format == "conllu":
            if columns[0].isdigit():
                # Explicit check instead of `assert`, which is stripped by -O.
                if len(columns) != 10:
                    raise ValueError(
                        "Expected 10 columns in CoNLL-U token line, got {}: {!r}".format(len(columns), line))
                sentences[-1].append(columns[1])
    return sentences


def main():
    """Compute word embeddings of an input file and store them in an NPZ file.

    The i-th sentence is stored under the name "arr_{i}" in the output archive.
    """
    import argparse

    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("input_path", type=str, help="Input file")
    parser.add_argument("output_npz", type=str, help="Output NPZ file")
    parser.add_argument("--batch_size", default=64, type=int, help="Batch size")
    parser.add_argument("--dtype", default="float16", type=str, help="Dtype to save as")
    # `choices` lets argparse reject unsupported formats with a usage message.
    parser.add_argument("--format", default="conllu", choices=["conllu", "conll"], type=str,
                        help="Input format (conllu, conll)")
    parser.add_argument("--model", default="bert-base-multilingual-uncased-last4", type=str, help="Model name (see wembeddings.py for options)")
    parser.add_argument("--server", default=None, type=str, help="Use given server to compute the embeddings")
    parser.add_argument("--threads", default=4, type=int, help="Threads to use")
    args = parser.parse_args()

    args.dtype = getattr(np, args.dtype)

    # Load the input file
    with open(args.input_path, mode="r", encoding="utf-8") as input_file:
        sentences = load_sentences(input_file, args.format)
    print("Loaded {} sentences and {} words.".format(len(sentences), sum(map(len, sentences))), file=sys.stderr, flush=True)

    # Initialize suitable computational class; a separate name is used so the
    # imported `wembeddings` module is not shadowed.
    if args.server is not None:
        embedder = wembeddings.WEmbeddings.ClientNetwork(args.server)
    else:
        embedder = wembeddings.WEmbeddings(threads=args.threads)

    # Compute word embeddings
    with zipfile.ZipFile(args.output_npz, mode="w", compression=zipfile.ZIP_STORED) as output_npz:
        for i in range(0, len(sentences), args.batch_size):
            sentences_embeddings = embedder.compute_embeddings(args.model, sentences[i:i + args.batch_size])
            for j, sentence_embeddings in enumerate(sentences_embeddings):
                with output_npz.open("arr_{}".format(i + j), mode="w") as embeddings_file:
                    np.save(embeddings_file, sentence_embeddings.astype(args.dtype))
                if (i + j + 1) % 100 == 0:
                    print("Processed {}/{} sentences.".format(i + j + 1, len(sentences)), file=sys.stderr, flush=True)
    print("Done, all embeddings saved.", file=sys.stderr, flush=True)


if __name__ == "__main__":
    main()
wembedding_service/start_wembeddings_server.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # coding=utf-8
3
+ #
4
+ # Copyright 2020 Institute of Formal and Applied Linguistics, Faculty of
5
+ # Mathematics and Physics, Charles University, Czech Republic.
6
+ #
7
+ # This Source Code Form is subject to the terms of the Mozilla Public
8
+ # License, v. 2.0. If a copy of the MPL was not distributed with this
9
+ # file, You can obtain one at http://mozilla.org/MPL/2.0/.
10
+
11
+ """Word embeddings server.
12
+
13
+ Example setup:
14
+ $ venv/bin/python ./start_wembeddings_server.py 8000
15
+
16
+ Example call:
17
+ $ curl --data-binary @examples/request.json localhost:8000/wembeddings | xxd
18
+ """
19
+
20
+ import signal
21
+ import os
22
+ import sys
23
+ import threading
24
+ import time
25
+
26
+ import numpy as np
27
+
28
+ import wembeddings.wembeddings as wembeddings
29
+ import wembeddings.wembeddings_server as wembeddings_server
30
+
31
if __name__ == "__main__":
    import argparse

    # Command-line interface.
    cli = argparse.ArgumentParser()
    cli.add_argument("port", type=int, help="Port to use")
    cli.add_argument("--dtype", default="float16", type=str, help="Dtype to serve the embeddings as")
    cli.add_argument("--logfile", default=None, type=str, help="Log path")
    cli.add_argument("--preload_models", default=[], nargs="*", type=str, help="Models to preload, or `all`")
    cli.add_argument("--preload_only", default=False, action="store_true", help="Only preload models and exit")
    cli.add_argument("--threads", default=4, type=int, help="Threads to use")
    args = cli.parse_args()
    # Translate the dtype name to the NumPy attribute of the same name.
    args.dtype = getattr(np, args.dtype)

    # When a log path is given, everything written to stderr is appended to it.
    if args.logfile is not None:
        sys.stderr = open(args.logfile, "a", encoding="utf-8")

    # Factory creating the WEmbeddings instance on demand.
    def wembeddings_lambda():
        return wembeddings.WEmbeddings(threads=args.threads, preload_models=args.preload_models)

    # In preload-only mode, construct the instance (loading the requested
    # models) and terminate without starting any server.
    if args.preload_only:
        print("Preloading models only.", file=sys.stderr)
        wembeddings_lambda()
        sys.exit(0)

    # The server runs in its own daemon thread; the main thread only waits
    # for a request to stop.
    server = wembeddings_server.WEmbeddingsServer(args.port, args.dtype, wembeddings_lambda)
    server_thread = threading.Thread(target=server.serve_forever, daemon=True)
    server_thread.start()

    print("Starting WEmbeddings server on port {}.".format(args.port), file=sys.stderr)
    print("To stop it gracefully, either send SIGINT (Ctrl+C) or SIGUSR1.", file=sys.stderr, flush=True)

    def shutdown():
        """Stop the serving loop and close the server, logging each step."""
        print("Initiating shutdown of the WEmbeddings server.", file=sys.stderr, flush=True)
        server.shutdown()
        print("Stopped handling new requests, processing all current ones.", file=sys.stderr, flush=True)
        server.server_close()
        print("Finished shutdown of the WEmbeddings server.", file=sys.stderr, flush=True)

    # Serve until asked to stop.
    if os.name == 'nt':
        # On Windows, allow interruption with Ctrl+C -- for testing only.
        def signal_handler(sig, frame):
            shutdown()
            sys.exit(0)
        signal.signal(signal.SIGINT, signal_handler)
        while True:
            time.sleep(1)
    else:
        # On Posix systems, block the stop signals and wait for one of them.
        stop_signals = [signal.SIGINT, signal.SIGUSR1]
        signal.pthread_sigmask(signal.SIG_BLOCK, stop_signals)
        signal.sigwait(stop_signals)
        shutdown()
wembedding_service/wembeddings/__pycache__/wembeddings.cpython-37.pyc ADDED
Binary file (6.5 kB). View file
 
wembedding_service/wembeddings/wembeddings.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # coding=utf-8
3
+ #
4
+ # Copyright 2020 Institute of Formal and Applied Linguistics, Faculty of
5
+ # Mathematics and Physics, Charles University, Czech Republic.
6
+ #
7
+ # This Source Code Form is subject to the terms of the Mozilla Public
8
+ # License, v. 2.0. If a copy of the MPL was not distributed with this
9
+ # file, You can obtain one at http://mozilla.org/MPL/2.0/.
10
+
11
+ """Word embeddings computation class."""
12
+
13
+ import json
14
+ import sys
15
+ import time
16
+ import urllib.request
17
+
18
+ import numpy as np
19
+
20
+
21
class WEmbeddings:
    """Class to keep multiple lazily constructed word embedding computation models.

    Every model listed in MODELS_MAP gets a `_Model` wrapper on construction;
    the underlying tokenizer and transformer are loaded only when preloading is
    requested or when the model is first used by `compute_embeddings`.
    """

    MODELS_MAP = {
        # Key: model name. Value: transformer model name, layer start, layer end.
        "bert-base-multilingual-uncased-last4": ("bert-base-multilingual-uncased", -4, None),
        "robeczech-base-last4": ("ufal/robeczech-base", -4, None),
        "xlm-roberta-base-last4": ("xlm-roberta-base", -4, None),
        "bert-large-portuguese-cased-last4": ("neuralmind/bert-large-portuguese-cased", -4, None),
        "bert-base-portuguese-cased-last4": ("neuralmind/bert-base-portuguese-cased", -4, None),
    }

    # Maximum number of subwords in one model input, not counting the special
    # tokens added around each part; longer sentences are split into parts.
    # NOTE(review): presumably 512 minus two special tokens -- confirm.
    MAX_SUBWORDS_PER_SENTENCE = 510

    class _Model:
        """Construct a tokenizer and transformers model graph."""

        def __init__(self, transformers_model, layer_start, layer_end, loader_lock):
            """Remember the model configuration; nothing is loaded yet.

            Arguments:
              transformers_model: name passed to the transformers `from_pretrained` calls.
              layer_start, layer_end: slice bounds of the hidden states to average.
              loader_lock: lock held while the model is being loaded.
            """
            self._model_loaded = False
            self._transformers_model_name = transformers_model
            self._layer_start = layer_start
            self._layer_end = layer_end
            self._loader_lock = loader_lock

        def load(self):
            """Load the tokenizer and the model, unless already loaded.

            After loading, `self.tokenizer` and `self.compute_embeddings` are
            available. The loaded flag is checked again after acquiring the
            lock, so a thread that waited for another loader does not load
            the model a second time.
            """
            if self._model_loaded:
                return
            with self._loader_lock:
                import tensorflow as tf
                import transformers

                if self._model_loaded:
                    return

                self.tokenizer = transformers.AutoTokenizer.from_pretrained(self._transformers_model_name, use_fast=True)

                self._transformers_model = transformers.TFAutoModel.from_pretrained(
                    self._transformers_model_name,
                    config=transformers.AutoConfig.from_pretrained(self._transformers_model_name, output_hidden_states=True),
                    from_pt=True
                )

                def compute_embeddings(subwords, segments):
                    # Padding subwords are marked by -1: they are replaced by
                    # index 0 and excluded through the attention mask.
                    subword_embeddings_layers = self._transformers_model(
                        (tf.maximum(subwords, 0), tf.cast(tf.not_equal(subwords, -1), tf.int32))
                    ).hidden_states
                    subword_embeddings = tf.math.reduce_mean(subword_embeddings_layers[self._layer_start:self._layer_end], axis=0)

                    # Average subwords (word pieces) word embeddings for each token
                    def average_subwords(embeddings_and_segments):
                        subword_embeddings, segments = embeddings_and_segments
                        return tf.math.segment_mean(subword_embeddings, segments)
                    # The first subword position is skipped before averaging and
                    # the last segment (the one collecting the padding) is dropped.
                    word_embeddings = tf.map_fn(average_subwords, (subword_embeddings[:, 1:], segments), dtype=tf.float32)[:, :-1]
                    return word_embeddings
                self.compute_embeddings = tf.function(compute_embeddings).get_concrete_function(
                    tf.TensorSpec(shape=[None, None], dtype=tf.int32), tf.TensorSpec(shape=[None, None], dtype=tf.int32)
                )

                self._model_loaded = True

    def __init__(self, max_form_len=64, threads=None, preload_models=()):
        """Create the model wrappers and preload the requested models.

        Arguments:
          max_form_len: words are truncated to this many characters before tokenization.
          threads: if given, limit of TensorFlow inter-op and intra-op threads.
          preload_models: collection of MODELS_MAP keys to load immediately;
            if it contains "all", every model is loaded.
        """
        import tensorflow as tf
        import threading

        # Impose the limit on the number of threads, if given
        if threads is not None:
            tf.config.threading.set_inter_op_parallelism_threads(threads)
            tf.config.threading.set_intra_op_parallelism_threads(threads)

        self._max_form_len = max_form_len

        # A single lock is shared by all models.
        loader_lock = threading.Lock()
        self._models = {}
        for model_name, (transformers_model, layer_start, layer_end) in self.MODELS_MAP.items():
            self._models[model_name] = self._Model(transformers_model, layer_start, layer_end, loader_lock)

            if model_name in preload_models or "all" in preload_models:
                self._models[model_name].load()

    def compute_embeddings(self, model, sentences):
        """Computes word embeddings.

        Arguments:
          model: one of the keys of self.MODELS_MAP.
          sentences: 2D Python array with sentences with tokens (strings).
        Returns:
          embeddings as a Python list with one 2D Numpy array per sentence,
          containing one row per token of that sentence.
        Raises:
          KeyError: if `model` is not a known model name.
        """
        if model not in self._models:
            message = "No such WEmbeddings model {}".format(model)
            print(message, file=sys.stderr, flush=True)
            # Fail explicitly, also for an empty batch, instead of relying on
            # the dictionary lookup below.
            raise KeyError(message)

        embeddings = []
        if not sentences:
            return embeddings

        model = self._models[model]
        model.load()

        time_tokenization = time.time()

        # All words of all sentences are tokenized in a single call; every word
        # but the first one of a sentence is prefixed by a space.
        sentences_subwords = model.tokenizer(
            [(" " if i else "") + word[:self._max_form_len] for sentence in sentences for i, word in enumerate(sentence)],
            add_special_tokens=False
        ).input_ids

        # subwords[k]: subword ids of the k-th model input (a sentence part),
        # segments[k]: for every subword, the index of its word inside the part,
        # parts[s]: numbers of words in the consecutive parts of sentence s.
        subwords, segments, parts = [], [], []
        offset = 0  # index of the first not yet consumed word in sentences_subwords
        for sentence in sentences:
            segments.append([])
            subwords.append([])
            parts.append([0])
            sentence_subwords = sentences_subwords[offset:offset + len(sentence)]
            offset += len(sentence)
            for word_subwords in sentence_subwords:
                # Split sentences with too many subwords
                if len(subwords[-1]) + len(word_subwords) > self.MAX_SUBWORDS_PER_SENTENCE:
                    subwords[-1] = model.tokenizer.build_inputs_with_special_tokens(subwords[-1])
                    segments.append([])
                    subwords.append([])
                    parts[-1].append(0)
                segments[-1].extend([parts[-1][-1]] * len(word_subwords))
                subwords[-1].extend(word_subwords)
                parts[-1][-1] += 1
            subwords[-1] = model.tokenizer.build_inputs_with_special_tokens(subwords[-1])

        max_sentence_len = max(len(sentence) for sentence in sentences)
        max_subwords = max(len(sentence) for sentence in subwords)

        time_embeddings = time.time()
        # Subwords are padded with -1, which the model graph treats as padding.
        np_subwords = np.full([len(subwords), max_subwords], -1, np.int32)
        for i, subword in enumerate(subwords):
            np_subwords[i, :len(subword)] = subword

        # Segments are padded with `max_sentence_len`, an index larger than any
        # real word index, so padding is averaged into a separate last segment.
        np_segments = np.full([len(segments), max_subwords - 1], max_sentence_len, np.int32)
        for i, segment in enumerate(segments):
            np_segments[i, :len(segment)] = segment

        embeddings_with_parts = model.compute_embeddings(np_subwords, np_segments).numpy()

        # Concatenate the parts of split sentences
        current_sentence_part = 0
        for sentence_parts in parts:
            embeddings.append(np.concatenate(
                [embeddings_with_parts[current_sentence_part + i, :sentence_part] for i, sentence_part in enumerate(sentence_parts)],
                axis=0))
            current_sentence_part += len(sentence_parts)

        print("WEmbeddings in {:.1f}ms,".format(1000 * (time.time() - time_embeddings)),
              "tokenization in {:.1f}ms,".format(1000*(time_embeddings - time_tokenization)),
              "batch {},".format(len(sentences)),
              "max sentence len {},".format(max_sentence_len),
              "max subwords {}.".format(max_subwords),
              file=sys.stderr, flush=True)

        return embeddings

    class ClientNetwork:
        """Client computing the embeddings by calling a WEmbeddings server."""

        def __init__(self, url):
            """Remember the server address (inserted after "http://" in the request URL)."""
            self._url = url

        def compute_embeddings(self, model, sentences):
            """Request embeddings of `sentences` from the server.

            The request is a JSON object with the keys "model" and "sentences"
            sent to the "/wembeddings" path; the response is read as one
            serialized Numpy array per sentence.

            Returns:
              a Python list with one Numpy array per sentence.
            """
            with urllib.request.urlopen(
                "http://{}/wembeddings".format(self._url),
                data=json.dumps({"model": model, "sentences": sentences}, ensure_ascii=True).encode("ascii"),
            ) as response:
                return [np.lib.format.read_array(response, allow_pickle=False) for _ in sentences]
wembedding_service/wembeddings/wembeddings_server.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # coding=utf-8
3
+ #
4
+ # Copyright 2020 Institute of Formal and Applied Linguistics, Faculty of
5
+ # Mathematics and Physics, Charles University, Czech Republic.
6
+ #
7
+ # This Source Code Form is subject to the terms of the Mozilla Public
8
+ # License, v. 2.0. If a copy of the MPL was not distributed with this
9
+ # file, You can obtain one at http://mozilla.org/MPL/2.0/.
10
+
11
+ """Word embeddings server class."""
12
+
13
+ import http.server
14
+ import json
15
+ import socketserver
16
+ import os
17
+ import sys
18
+ import threading
19
+ import urllib.parse
20
+
21
+ import numpy as np
22
+
23
class WEmbeddingsServer(socketserver.ThreadingTCPServer):
    """Threaded HTTP server computing word embeddings.

    Handled requests:
      POST /wembeddings: the payload is a JSON object with the keys "model" and
        "sentences"; the response consists of one serialized Numpy array per
        sentence, cast to the configured dtype.
      GET /status: responds with the JSON document {"status": "UP"}.
    """

    class WEmbeddingsRequestHandler(http.server.BaseHTTPRequestHandler):
        """Request handler; its methods name the handler instance `request`."""

        protocol_version = "HTTP/1.1"

        def respond(request, content_type, code=200):
            """Send the status line and headers; the connection is closed after the response."""
            request.close_connection = True
            request.send_response(code)
            request.send_header("Connection", "close")
            request.send_header("Content-Type", content_type)
            request.send_header("Access-Control-Allow-Origin", "*")
            request.end_headers()

        def respond_error(request, message, code=400):
            """Send a plain-text error response with the given message and status code."""
            request.respond("text/plain", code)
            request.wfile.write(message.encode("utf-8"))

        def _parse_url(request):
            """Return the parsed request URL, or None when it cannot be parsed.

            The path is re-decoded from ISO-8859-1 to UTF-8 before parsing and
            stored back to `request.path`.
            """
            try:
                request.path = request.path.encode("iso-8859-1").decode("utf-8")
                return urllib.parse.urlparse(request.path)
            except (UnicodeError, ValueError):
                return None

        def do_POST(request):
            """Handle POST requests; only the /wembeddings path is supported."""
            url = request._parse_url()
            if url is None:
                return request.respond_error("Cannot parse request URL.")

            # Handle /wembeddings
            if url.path == "/wembeddings":
                if request.headers.get("Transfer-Encoding", "identity").lower() != "identity":
                    return request.respond_error("Only 'identity' Transfer-Encoding of payload is supported for now.")

                if "Content-Length" not in request.headers:
                    return request.respond_error("The Content-Length of payload is required.")

                try:
                    length = int(request.headers["Content-Length"])
                    if length < 0:
                        # A negative size would make `read` wait for the end of the stream.
                        raise ValueError("Negative Content-Length {}".format(length))
                    data = json.loads(request.rfile.read(length))
                    model, sentences = data["model"], data["sentences"]
                except Exception:
                    import traceback
                    traceback.print_exc(file=sys.stderr)
                    sys.stderr.flush()
                    return request.respond_error("Malformed request.")

                try:
                    # The WEmbeddings instance is shared by all handler threads,
                    # so only one computation runs at a time.
                    with request.server._wembeddings_mutex:
                        sentences_embeddings = request.server._wembeddings.compute_embeddings(model, sentences)
                except Exception:
                    import traceback
                    traceback.print_exc(file=sys.stderr)
                    sys.stderr.flush()
                    return request.respond_error("An error occurred during wembeddings computation.")

                request.respond("application/octet_stream")
                for sentence_embedding in sentences_embeddings:
                    np.lib.format.write_array(request.wfile, sentence_embedding.astype(request.server._dtype), allow_pickle=False)

            # URL not found
            else:
                request.respond_error("No handler for the given URL '{}'".format(url.path), code=404)

        def do_GET(request):
            """Handle GET requests; only the /status path is supported."""
            url = request._parse_url()
            if url is None:
                return request.respond_error("Cannot parse request URL.")

            if url.path == "/status":
                request.respond("application/json")
                request.wfile.write(bytes("""{"status": "UP"}""", "utf-8"))
            # URL not found
            else:
                request.respond_error("No handler for the given URL '{}'".format(url.path), code=404)

    daemon_threads = False

    def __init__(self, port, dtype, wembeddings_lambda):
        """Create the WEmbeddings instance and start listening on the port.

        Arguments:
          port: TCP port to listen on (on all interfaces).
          dtype: Numpy dtype the served embeddings are cast to.
          wembeddings_lambda: callable without arguments returning the object
            whose `compute_embeddings(model, sentences)` is used.
        """
        self._dtype = dtype

        # Create the WEmbeddings object and its mutex
        self._wembeddings = wembeddings_lambda()
        self._wembeddings_mutex = threading.Lock()

        # Initialize the server
        super().__init__(("", port), self.WEmbeddingsRequestHandler)

    def server_bind(self):
        """Bind the socket with address reuse (and port reuse outside Windows) enabled."""
        import socket
        self.socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        if os.name != 'nt':
            self.socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, 1)
        super().server_bind()

    def service_actions(self):
        """Drop finished handler threads once many of them have accumulated.

        The list is pruned in place: in newer Python versions `_threads` is a
        list subclass with extra methods used by `server_close`, so it must not
        be replaced by a plain list.
        """
        threads = getattr(self, "_threads", None)
        if isinstance(threads, list) and len(threads) >= 1024:
            threads[:] = [thread for thread in threads if thread.is_alive()]