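# Load a Keras language model with differential attention, expose it as an
# encoder, and generate text by retrieving each next token from an NGT or FAISS
# vector index (with a plain softmax fallback when retrieval is disabled).
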
import tensorflow as tf
from tensorflow import keras
from keras.layers import *
import keras_nlp
import subprocess

import math
import json
import spacy
from transformers import AutoTokenizer
from tokenizers import AddedToken


# Config
input_size  = 320  # 512
embed_dim   = 128


# Tokenizer
tokenizer = AutoTokenizer.from_pretrained('google/t5-v1_1-base')
tokenizer.add_tokens(AddedToken("\n", normalized=False))
tokenizer.add_tokens(AddedToken("<s>", normalized=False))
vocab_size = len(tokenizer.get_vocab().keys())
print("vocab_size:", vocab_size)
print("pad token id:", tokenizer.pad_token)


# Build the set of "carry" token ids: vocabulary entries that spaCy tags as
# numbers or proper nouns (the first ~100 vocabulary entries are skipped).
# During generation, when the retrieved token is one of these ids, the model's
# own prediction is preferred if it already occurs in the context.
subprocess.run(["python", "-m", "spacy", "download", "en_core_web_lg"], check=True)
nlp = spacy.load("en_core_web_lg")
nlp.max_length = 2000000
selected = {'NUM', 'PROPN'}
alltoks = sorted(tokenizer.get_vocab().items(), key=lambda x: x[1])
all_toks_text = "\n".join([t[0].replace("▁", "") for t in alltoks])
doc = nlp(all_toks_text)
carry_toks = set()
i = 0
for token in doc:
    # advance the vocabulary pointer until it matches the current spaCy token
    if str(token) not in alltoks[i][0]:
        i += 1
    if str(token) in alltoks[i][0] and token.pos_ in selected and i > 100:
        # keep numbers, and proper nouns only when the vocab entry is capitalised
        if token.pos_ != "PROPN" or alltoks[i][0].replace("▁", "")[0].isupper():
            carry_toks.add(alltoks[i][1])
print("carry_toks:", len(carry_toks))


# Masked Accuracy Metric
def masked_accuracy(y_true, y_pred, padding_token=tokenizer.pad_token_id):
    """Token-level accuracy that ignores positions where the target is the pad token."""
    y_true = tf.cast(y_true, tf.int32)
    y_pred = tf.cast(tf.argmax(y_pred, axis=-1), tf.int32)
    mask = tf.cast(tf.not_equal(y_true, padding_token), tf.float32)
    matches = tf.cast(tf.equal(y_true, y_pred), tf.float32)
    accuracy = tf.reduce_sum(matches * mask) / tf.reduce_sum(mask)
    return accuracy

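# Quick demo of the metric on toy data (illustrative): the padded position is
# excluded by the mask, so the accuracy is 1/2 rather than 2/3.
_demo_true = tf.constant([[5, 7, tokenizer.pad_token_id]])
_demo_pred = tf.one_hot([[5, 8, tokenizer.pad_token_id]], depth=vocab_size)
print("masked_accuracy demo:", float(masked_accuracy(_demo_true, _demo_pred)))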

# Embedding Layer
class SharedEmbedding(tf.keras.layers.Layer):
    def __init__(self, vocab_size, embed_dim, **kwargs):
        super(SharedEmbedding, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        
    def build(self, input_shape):
        self.shared_weights = self.add_weight(
            shape=(self.vocab_size, self.embed_dim),
            initializer='random_normal',
            trainable=True,
            name='shared_weights'
        )
        super(SharedEmbedding, self).build(input_shape)
    
    def call(self, inputs, mode='embedding', temp=0.1):
        if mode == 'embedding':
            return tf.nn.embedding_lookup(self.shared_weights, inputs)
        elif mode == 'classify':
            return tf.nn.softmax(tf.matmul(inputs, self.shared_weights, transpose_b=True), axis=-1) 
        
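# Usage sketch (illustrative): the same weight matrix embeds token ids and, in
# 'classify' mode, maps hidden states back onto the vocabulary (weight tying).
_emb_demo = SharedEmbedding(vocab_size, embed_dim)
_vec_demo = _emb_demo(tf.constant([[1, 2, 3]]), mode='embedding')   # (1, 3, embed_dim)
_prob_demo = _emb_demo(_vec_demo, mode='classify')                  # (1, 3, vocab_size)
print("shared embedding demo:", _vec_demo.shape, _prob_demo.shape)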

# Attention Layer
class DiffAttention(keras.layers.Layer):
    def __init__(self, depth, **kwargs):
        super(DiffAttention, self).__init__(**kwargs)
        self.lambda_init = 0.8 - 0.6 * math.exp(-0.3 * depth)

    def build(self, input_shape):
        self.embed_dim = input_shape[-1]
        self.input_size = input_shape[-2]
        self.mask = tf.where(tf.linalg.band_part(tf.ones((input_shape[-2], input_shape[-2])), -1, 0) == 1.0, 0.0, float("-inf"))
        self.range_do = -tf.range(input_shape[-2])-1
        self.range_undo = tf.range(input_shape[-2])+1
        self.Q = self.add_weight(name='kernelQ',
                                 shape=(input_shape[-1], input_shape[-1]),
                                 initializer='uniform',
                                 trainable=True)
        self.K = self.add_weight(name='kernelK',
                                 shape=(input_shape[-1], input_shape[-1]),
                                 initializer='uniform',
                                 trainable=True)
        self.V = self.add_weight(name='kernelV',
                                 shape=(input_shape[-1], input_shape[-1]),
                                 initializer='uniform',
                                 trainable=True)

        initializer = tf.keras.initializers.RandomNormal(mean=0.0, stddev=0.1)
        self.lambda_q1 = self.add_weight(
            shape=(input_shape[-1],), initializer=initializer, trainable=True, name="lambda_q1"
        )
        self.lambda_k1 = self.add_weight(
            shape=(input_shape[-1],), initializer=initializer, trainable=True, name="lambda_k1"
        )
        self.lambda_q2 = self.add_weight(
            shape=(input_shape[-1],), initializer=initializer, trainable=True, name="lambda_q2"
        )
        self.lambda_k2 = self.add_weight(
            shape=(input_shape[-1],), initializer=initializer, trainable=True, name="lambda_k2"
        )
        
        super(DiffAttention, self).build(input_shape)

    def roll_embeddings(self, tensor, shift_values):
        """Circularly roll the last axis of `tensor` by a per-time-step offset
        (applied to the position-based attention scores in `call`)."""
        batch_size, time_size, embed_dim = tensor.shape
        if batch_size is None: return tensor
        shift_matrix   = tf.reshape(shift_values, (1, -1, 1))
        shift_matrix   = tf.tile(shift_matrix, [batch_size, 1, embed_dim])
        indices        = tf.range(embed_dim)
        indices_matrix = tf.tile(indices, [batch_size * time_size])
        indices_matrix = tf.reshape(indices_matrix, (batch_size, time_size, embed_dim))
        new_indices    = (indices_matrix + shift_matrix) % embed_dim     
        rolled_tensor  = tf.gather(tensor, new_indices, batch_dims=2)
        return rolled_tensor

    def call(self, x, pos, pos_src):
        v    = x @ self.V
        q    = tf.transpose(tf.reshape(x @ self.Q, (-1, self.input_size, 2, self.embed_dim//2)), perm=[0, 2, 1, 3])
        k    = tf.transpose(tf.reshape(x @ self.K, (-1, self.input_size, 2, self.embed_dim//2)), perm=[0, 2, 1, 3])
        atti = tf.matmul(q, k,   transpose_b=True)
        attp = tf.matmul(q, pos, transpose_b=True)
        attp = self.roll_embeddings(tf.reshape(attp, (-1, self.input_size, self.input_size)), self.range_do)
        attp = tf.reshape(attp, (-1, 2, self.input_size, self.input_size))
        att  = atti + attp
        att  = tf.nn.softmax((att / math.sqrt(self.embed_dim)) + self.mask, axis=-1)
        att1 = att[:, 0]
        att2 = att[:, 1]
        
        # Differential attention
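        # lambda is re-parameterised from four learned vectors:
        #   lambda = exp(lambda_q1 . lambda_k1) - exp(lambda_q2 . lambda_k2) + lambda_init
        # and the second attention map is subtracted from the first, scaled by lambda.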
        lambda_1 = tf.math.exp(tf.reduce_sum(self.lambda_q1 * self.lambda_k1, axis=-1))
        lambda_2 = tf.math.exp(tf.reduce_sum(self.lambda_q2 * self.lambda_k2, axis=-1))
        lambda_full = lambda_1 - lambda_2 + self.lambda_init
        att = att1 - lambda_full * att2

        out = att @ v
        out = out * (1 - self.lambda_init)
        return out
    

# Import Model
model = keras.models.load_model(
    "rpc.keras",
    custom_objects={
        "DiffAttention" : DiffAttention,
        "SharedEmbedding" : SharedEmbedding,
        "masked_accuracy" : masked_accuracy
    }
)
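# Re-wrap the loaded network from its input layer to its last layer's output so
# it can be called directly to produce the token vectors used for retrieval.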
encoder = keras.Model(inputs=model.layers[0].input, outputs=model.layers[-1].output)
encoder.summary()


# Vectorize Function
def vectorize_texts(all_texts):
    """Encode batches of token-id lists and return one encoder vector per input token."""
    batch_size = 128
    vects = []
    for i in range(0, len(all_texts), batch_size):
        texts = all_texts[i:i+batch_size]
        toks = [text + ([tokenizer.pad_token_id] * (input_size - len(text))) for text in texts]
        if len(toks) > 0:
            toks = tf.constant(toks, shape=(len(toks), input_size))
            vect = encoder.predict(toks, verbose=0)
            for v, t in zip(vect, texts):
                vects.append(v[:len(t), :])
    return tf.concat(vects, axis=0).numpy()

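# Usage sketch (illustrative): vectorize one tokenized string; the result has
# one row per input token.
# demo_vecs = vectorize_texts([tokenizer.encode("hello world", add_special_tokens=False)])
# print(demo_vecs.shape)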

# Import Database and All Toks
index = None
all_toks = None
index_type = None
def load_index(index_path="/dev/shm/rpc-vecdb/index", idx_type="ngt"):
    """Load the nearest-neighbour index (NGT or FAISS) and the token-id list all_toks."""
    global index
    global all_toks
    global index_type
    index_type = idx_type
    if idx_type == "ngt":
        import ngtpy
        index = ngtpy.Index(index_path, read_only=True)
    elif idx_type == "faiss":    
        import faiss
        index = faiss.read_index(index_path + "/index.faiss")
    else:
        raise ValueError("Unknown index type")
    with open(index_path + "/all_toks.json", "r") as f:
        all_toks = json.loads(f.read())

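# Example (illustrative): load the local NGT index built at the default path;
# a FAISS index at the same location would be loaded with idx_type="faiss".
# load_index("/dev/shm/rpc-vecdb/index", idx_type="ngt")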

# Generate Function
def generate(text, use_rpc=True, max_tokens=128):
    """Stream generated text. With use_rpc=True each next token is retrieved as
    the nearest neighbour of the current context vector; otherwise it is the
    argmax of the model's own prediction."""
    enc_text = tokenizer.encode(text, add_special_tokens=False)
    text = tokenizer.decode(enc_text)
    tok = None
    i = 0
    # stop after max_tokens tokens or when the stop id (vocab_size - 2, the added "\n") appears
    while i < max_tokens and tok != vocab_size - 2:

        enc_text = enc_text[-input_size:]
        if use_rpc:
            xq = vectorize_texts([enc_text])[-1]
            if index_type == "ngt":
                _id = index.search(xq, size=1, epsilon=1)[0][0]
            else:
                _id = index.search(xq.reshape((1, -1)), 1)[1][0][0]
            # for carry tokens (numbers/proper nouns), prefer the model's own
            # prediction when it already appears in the context
            if all_toks[_id] in carry_toks:
                tmp = tf.argmax(tf.matmul(xq.reshape((1, -1)), encoder.layers[1].shared_weights, transpose_b=True), axis=-1).numpy()[0]
                if tmp in enc_text:
                    tok = tmp
                else: tok = all_toks[_id]
            else:
                tok = all_toks[_id]
        else:
            ins = enc_text + [tokenizer.pad_token_id] * (input_size - len(enc_text))
            ins = tf.constant(ins, shape=(1, input_size))
            res = model.predict(ins, verbose=0)[0][len(enc_text)-1]
            tok = tf.argmax(res, axis=-1).numpy().tolist()
        
        enc_text += [tok]
        i += 1
        new_text = tokenizer.decode(enc_text)
        res = new_text[len(text):]
        text = new_text
    
        yield res
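

# Example usage (illustrative): load the index, then stream a continuation
# token by token. The prompt string here is only a placeholder.
# load_index()
# for chunk in generate("The quick brown fox", use_rpc=True, max_tokens=32):
#     print(chunk, end="", flush=True)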