xin committed on
Commit ade8773 · 1 Parent(s): 74555b0
LSTM/__pycache__/config.cpython-38.pyc ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e1a316609dde1c8a3aeca1dfe09c8f54ff2bc3193afae97d0ced837735a41063
+ size 480
LSTM/__pycache__/inputHandler.cpython-38.pyc ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e822d00085a8a0c7f4c9227b1f26fff37af85a5ba9a97005d77ee5628ae0ba4e
+ size 6423
LSTM/choosed_checkpoit/lstm_50_50_0.17_0.25.h5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9ca771b21fa23112b534ee0d1f8bacd97cf2205301e7a66bed5c34546201d5c8
+ size 40510656
LSTM/config.py ADDED
@@ -0,0 +1,25 @@
+
+ EMBEDDING_DIM = 50
+
+
+ MAX_SEQUENCE_LENGTH = 50
+ VALIDATION_SPLIT = 0.1
+
+
+ RATE_DROP_LSTM = 0.17
+ RATE_DROP_DENSE = 0.25
+ NUMBER_LSTM = 50
+ NUMBER_DENSE_UNITS = 50
+ ACTIVATION_FUNCTION = 'relu'
+
+
+ siamese_config = {
+     'EMBEDDING_DIM': EMBEDDING_DIM,
+     'MAX_SEQUENCE_LENGTH': MAX_SEQUENCE_LENGTH,
+     'VALIDATION_SPLIT': VALIDATION_SPLIT,
+     'RATE_DROP_LSTM': RATE_DROP_LSTM,
+     'RATE_DROP_DENSE': RATE_DROP_DENSE,
+     'NUMBER_LSTM': NUMBER_LSTM,
+     'NUMBER_DENSE_UNITS': NUMBER_DENSE_UNITS,
+     'ACTIVATION_FUNCTION': ACTIVATION_FUNCTION
+ }
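Usage note (not part of this commit): config.py only defines module-level constants and collects them into siamese_config, so downstream scripts can read every hyperparameter from that dict. A minimal sketch; the import path assumes LSTM/ is importable as a package, which this commit does not show:

    # Hypothetical consumer of LSTM/config.py -- illustrative only.
    from LSTM.config import siamese_config

    CONFIG = siamese_config
    print(CONFIG['EMBEDDING_DIM'])        # 50
    print(CONFIG['RATE_DROP_LSTM'])       # 0.17
    print(CONFIG['ACTIVATION_FUNCTION'])  # 'relu'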
LSTM/inputHandler.py ADDED
@@ -0,0 +1,147 @@
+ from keras.preprocessing.sequence import pad_sequences
+ from keras.preprocessing.text import Tokenizer
+ from gensim.models import Word2Vec
+ import numpy as np
+ import gc
+
+
+ def train_word2vec(documents, embedding_dim):
+     """
+     train word2vector over training documents
+     Args:
+         documents (list): list of document
+         embedding_dim (int): output wordvector size
+     Returns:
+         word_vectors (dict): dict containing words and their respective vectors
+     """
+     model = Word2Vec(documents, min_count=1, size=embedding_dim)
+     word_vectors = model.wv
+     del model
+     return word_vectors
+
+
+ def create_embedding_matrix(tokenizer, word_vectors, embedding_dim):
+     """
+     Create embedding matrix containing word indexes and respective vectors from word vectors
+     Args:
+         tokenizer (keras.preprocessing.text.Tokenizer): keras tokenizer object containing word indexes
+         word_vectors (dict): dict containing word and their respective vectors
+         embedding_dim (int): dimension of word vector
+
+     Returns:
+         embedding_matrix (np.array): embedding matrix of shape (nb_words, embedding_dim)
+     """
+     nb_words = len(tokenizer.word_index) + 1
+     word_index = tokenizer.word_index
+     embedding_matrix = np.zeros((nb_words, embedding_dim))
+     print("Embedding matrix shape: %s" % str(embedding_matrix.shape))
+     for word, i in word_index.items():
+         try:
+             embedding_vector = word_vectors[word]
+             if embedding_vector is not None:
+                 embedding_matrix[i] = embedding_vector
+         except KeyError:
+             print("vector not found for word - %s" % word)
+     print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))
+     return embedding_matrix
+
+
+ def word_embed_meta_data(documents, embedding_dim):
+     """
+     Load tokenizer object for given vocabs list
+     Args:
+         documents (list): list of document
+         embedding_dim (int): embedding dimension
+     Returns:
+         tokenizer (keras.preprocessing.text.Tokenizer): keras tokenizer object
+         embedding_matrix (np.array): embedding matrix mapping word indexes to vectors
+     """
+     documents = [str(x).lower().split() for x in documents]
+     tokenizer = Tokenizer()
+     tokenizer.fit_on_texts(documents)
+     word_vector = train_word2vec(documents, embedding_dim)
+     embedding_matrix = create_embedding_matrix(tokenizer, word_vector, embedding_dim)
+     del word_vector
+     gc.collect()
+     return tokenizer, embedding_matrix
+
+
+ def create_train_dev_set(tokenizer, sentences_pair, is_similar, max_sequence_length, validation_split_ratio):
+     """
+     Create training and validation dataset
+     Args:
+         tokenizer (keras.preprocessing.text.Tokenizer): keras tokenizer object
+         sentences_pair (list): list of tuple of sentences pairs
+         is_similar (list): list containing labels if respective sentences in sentence1 and sentence2
+                            are same or not (1 if same else 0)
+         max_sequence_length (int): max sequence length of sentences to apply padding
+         validation_split_ratio (float): ratio used to split training data into validation data
+
+     Returns:
+         train_data_1 (list): list of input features for training set from sentences1
+         train_data_2 (list): list of input features for training set from sentences2
+         labels_train (np.array): array containing similarity score for training data
+         leaks_train (np.array): array of training leaks features
+
+         val_data_1 (list): list of input features for validation set from sentences1
+         val_data_2 (list): list of input features for validation set from sentences2
+         labels_val (np.array): array containing similarity score for validation data
+         leaks_val (np.array): array of validation leaks features
+     """
+     sentences1 = [x[0].lower() for x in sentences_pair]
+     sentences2 = [x[1].lower() for x in sentences_pair]
+     train_sequences_1 = tokenizer.texts_to_sequences(sentences1)
+     train_sequences_2 = tokenizer.texts_to_sequences(sentences2)
+     leaks = [[len(set(x1)), len(set(x2)), len(set(x1).intersection(x2))]
+              for x1, x2 in zip(train_sequences_1, train_sequences_2)]
+
+     train_padded_data_1 = pad_sequences(train_sequences_1, maxlen=max_sequence_length)
+     train_padded_data_2 = pad_sequences(train_sequences_2, maxlen=max_sequence_length)
+     train_labels = np.array(is_similar)
+     leaks = np.array(leaks)
+
+     shuffle_indices = np.random.permutation(np.arange(len(train_labels)))
+     train_data_1_shuffled = train_padded_data_1[shuffle_indices]
+     train_data_2_shuffled = train_padded_data_2[shuffle_indices]
+     train_labels_shuffled = train_labels[shuffle_indices]
+     leaks_shuffled = leaks[shuffle_indices]
+
+     dev_idx = max(1, int(len(train_labels_shuffled) * validation_split_ratio))
+
+     del train_padded_data_1
+     del train_padded_data_2
+     gc.collect()
+
+     train_data_1, val_data_1 = train_data_1_shuffled[:-dev_idx], train_data_1_shuffled[-dev_idx:]
+     train_data_2, val_data_2 = train_data_2_shuffled[:-dev_idx], train_data_2_shuffled[-dev_idx:]
+     labels_train, labels_val = train_labels_shuffled[:-dev_idx], train_labels_shuffled[-dev_idx:]
+     leaks_train, leaks_val = leaks_shuffled[:-dev_idx], leaks_shuffled[-dev_idx:]
+
+     return train_data_1, train_data_2, labels_train, leaks_train, val_data_1, val_data_2, labels_val, leaks_val
+
+
+ def create_test_data(tokenizer, test_sentences_pair, max_sequence_length):
+     """
+     Create test dataset
+     Args:
+         tokenizer (keras.preprocessing.text.Tokenizer): keras tokenizer object
+         test_sentences_pair (list): list of tuple of sentences pairs
+         max_sequence_length (int): max sequence length of sentences to apply padding
+
+     Returns:
+         test_data_1 (list): list of input features for test set from sentences1
+         test_data_2 (list): list of input features for test set from sentences2
+     """
+     test_sentences1 = [str(x[0]).lower() for x in test_sentences_pair]
+     test_sentences2 = [str(x[1]).lower() for x in test_sentences_pair]
+
+     test_sequences_1 = tokenizer.texts_to_sequences(test_sentences1)
+     test_sequences_2 = tokenizer.texts_to_sequences(test_sentences2)
+     leaks_test = [[len(set(x1)), len(set(x2)), len(set(x1).intersection(x2))]
+                   for x1, x2 in zip(test_sequences_1, test_sequences_2)]
+
+     leaks_test = np.array(leaks_test)
+     test_data_1 = pad_sequences(test_sequences_1, maxlen=max_sequence_length)
+     test_data_2 = pad_sequences(test_sequences_2, maxlen=max_sequence_length)
+
+     return test_data_1, test_data_2, leaks_test
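Usage note (not part of this commit): the helpers in inputHandler.py are typically chained as below: fit the tokenizer and word2vec embeddings once, build padded train/validation arrays plus the "leaks" features (unique-token counts and their overlap per pair), then run test pairs through the same tokenizer. This is a minimal sketch with toy data; the import paths and the tiny dataset are assumptions, and it presumes the older keras.preprocessing API and gensim < 4.0 (where Word2Vec takes size rather than vector_size), matching the code above.

    # Hypothetical end-to-end use of LSTM/inputHandler.py -- illustrative only.
    from LSTM.config import siamese_config
    from LSTM.inputHandler import word_embed_meta_data, create_train_dev_set, create_test_data

    # Toy sentence pairs with 1 = similar, 0 = dissimilar labels.
    sentences_pair = [
        ("how do i learn python", "what is the best way to learn python"),
        ("what is the capital of france", "who wrote hamlet"),
    ]
    is_similar = [1, 0]

    # Fit the tokenizer and train word2vec over all sentences from both sides of the pairs.
    documents = [s for pair in sentences_pair for s in pair]
    tokenizer, embedding_matrix = word_embed_meta_data(documents, siamese_config['EMBEDDING_DIM'])

    # Padded train/validation arrays plus the leaks features.
    train_1, train_2, labels_train, leaks_train, val_1, val_2, labels_val, leaks_val = \
        create_train_dev_set(tokenizer, sentences_pair, is_similar,
                             siamese_config['MAX_SEQUENCE_LENGTH'], siamese_config['VALIDATION_SPLIT'])

    # Test pairs reuse the same tokenizer and padding length.
    test_pairs = [("how old is the earth", "what is the age of the earth")]
    test_1, test_2, leaks_test = create_test_data(tokenizer, test_pairs, siamese_config['MAX_SEQUENCE_LENGTH'])
    print(train_1.shape, test_1.shape, leaks_test.shape)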
LSTM/sample_data.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:eff555eaf20951ddb3f611ef509a5e19d31f6842983df6faab4754a17a6eb854
+ size 62661