Update customFunctions.py for new pipelines

#4
by hw01558 - opened
Files changed (1)
  1. customFunctions.py +547 -470
customFunctions.py CHANGED
@@ -1,470 +1,547 @@
- import pandas as pd
- import numpy as np
- import random
- import torch
- import torch.nn as nn
- import torch.optim as optim
- #from transformers import BertTokenizer, BertModel
- from sklearn.metrics import accuracy_score, f1_score, classification_report
- import sklearn_crfsuite
- from sklearn_crfsuite import metrics
- from sklearn.metrics.pairwise import cosine_similarity
- from gensim.models import Word2Vec
- from sklearn.pipeline import Pipeline
- from sklearn.preprocessing import LabelEncoder
- from torch.utils.data import Dataset, DataLoader
- from torch.nn.utils.rnn import pad_sequence
- from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin
- from sklearn.feature_extraction.text import TfidfVectorizer
-
-
-
- EMBEDDING_DIM = 100
- PAD_VALUE= -1
- MAX_LENGTH = 376
- EMBEDDING_DIM = 100
- BATCH_SIZE = 16
-
- class preprocess_sentences():
-     def __init__(self):
-         pass
-
-     def fit(self, X, y=None):
-         print('PREPROCESSING')
-         return self
-
-     def transform(self, X):
-         # X = train['tokens'], y =
-         sentences = X.apply(lambda x: x.tolist()).tolist()
-         print('--> Preprocessing complete \n', flush=True)
-         return sentences
-
-
-
- class Word2VecTransformer():
-     def __init__(self, vector_size = 100, window = 5, min_count = 1, workers = 1, embedding_dim=EMBEDDING_DIM):
-         self.model = None
-         self.vector_size = vector_size
-         self.window = window
-         self.min_count = min_count
-         self.workers = workers
-         self.embedding_dim = embedding_dim
-
-     def fit(self, X, y):
-         # https://stackoverflow.com/questions/17242456/python-print-sys-stdout-write-not-visible-when-using-logging
-         # https://stackoverflow.com/questions/230751/how-can-i-flush-the-output-of-the-print-function
-         print('WORD2VEC:', flush=True)
-         # This fits the word2vec model
-         self.model = Word2Vec(sentences = X, vector_size=self.vector_size, window=self.window
-                               , min_count=self.min_count, workers=self.workers)
-         print('--> Word2Vec Fitted', flush=True)
-         return self
-
-     def transform(self, X):
-         # This bit should transform the sentences
-         embedded_sentences = []
-
-         for sentence in X:
-             sentence_vectors = []
-
-             for word in sentence:
-                 if word in self.model.wv:
-                     vec = self.model.wv[word]
-                 else:
-                     vec = np.random.normal(scale=0.6, size=(self.embedding_dim,))
-
-                 sentence_vectors.append(vec)
-
-             embedded_sentences.append(torch.tensor(sentence_vectors, dtype=torch.float32))
-         print('--> Embeddings Complete \n', flush=True)
-
-         return embedded_sentences
-
- class Word2VecTransformer_CRF():
-     def __init__(self, vector_size = 100, window = 5, min_count = 1, workers = 1, embedding_dim=EMBEDDING_DIM):
-         self.model = None
-         self.vector_size = vector_size
-         self.window = window
-         self.min_count = min_count
-         self.workers = workers
-         self.embedding_dim = embedding_dim
-
-     def fit(self, X, y):
-         # https://stackoverflow.com/questions/17242456/python-print-sys-stdout-write-not-visible-when-using-logging
-         # https://stackoverflow.com/questions/230751/how-can-i-flush-the-output-of-the-print-function
-         print('WORD2VEC:', flush=True)
-         # This fits the word2vec model
-         self.model = Word2Vec(sentences = X, vector_size=self.vector_size, window=self.window
-                               , min_count=self.min_count, workers=self.workers)
-         print('--> Word2Vec Fitted', flush=True)
-         return self
-
-     def transform(self, X):
-         # This bit should transform the sentences
-         embedded_sentences = []
-
-         for sentence in X:
-             sentence_vectors = []
-
-             for word in sentence:
-                 features = {
-                     'bias': 1.0,
-                     'word.lower()': word.lower(),
-                     'word[-3:]': word[-3:],
-                     'word[-2:]': word[-2:],
-                     'word.isupper()': word.isupper(),
-                     'word.istitle()': word.istitle(),
-                     'word.isdigit()': word.isdigit(),
-                 }
-                 if word in self.model.wv:
-                     vec = self.model.wv[word]
-                 else:
-                     vec = np.random.normal(scale=0.6, size=(self.embedding_dim,))
-
-                 # https://stackoverflow.com/questions/58736548/how-to-use-word-embedding-as-features-for-crf-sklearn-crfsuite-model-training
-                 for index in range(len(vec)):
-                     features[f"embedding_{index}"] = vec[index]
-
-                 sentence_vectors.append(features)
-
-             embedded_sentences.append(sentence_vectors)
-         print('--> Embeddings Complete \n', flush=True)
-
-         return embedded_sentences
-
-
- class tfidf(BaseEstimator, TransformerMixin):
-     def __init__(self):
-         self.model = None
-         self.embedding_dim = None
-         self.idf = None
-         self.vocab_size = None
-         self.vocab = None
-         pass
-
-     def fit(self, X, y = None):
-         print('TFIDF:', flush=True)
-         joined_sentences = [' '.join(tokens) for tokens in X]
-         self.model = TfidfVectorizer()
-         self.model.fit(joined_sentences)
-         self.vocab = self.model.vocabulary_
-         self.idf = self.model.idf_
-         self.vocab_size = len(self.vocab)
-         self.embedding_dim = self.vocab_size
-         print('--> TFIDF Fitted', flush=True)
-         return self
-
-     def transform(self, X):
-
-         embedded = []
-         for sentence in X:
-             sent_vecs = []
-             token_counts = {}
-             for word in sentence:
-                 token_counts[word] = token_counts.get(word, 0) + 1
-
-             sent_len = len(sentence)
-             for word in sentence:
-                 vec = np.zeros(self.vocab_size)
-                 if word in self.vocab:
-                     tf = token_counts[word] / sent_len
-                     token_idx = self.vocab[word]
-                     vec[token_idx] = tf * self.idf[token_idx]
-                 sent_vecs.append(vec)
-             embedded.append(torch.tensor(sent_vecs, dtype=torch.float32))
-         print('--> Embeddings Complete \n', flush=True)
-         print(embedded[0][0], flush=True)
-         print('Those were the embeddings', flush=True)
-
-
-         return embedded
-
-
- class BiLSTM_NER(nn.Module):
-     def __init__(self,input_dim, hidden_dim, tagset_size):
-         super(BiLSTM_NER, self).__init__()
-
-         # Embedding layer
-         #Freeze= false means that it will fine tune
-         #self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze = False, padding_idx=-1)
-
-         self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True, bidirectional=True)
-         self.fc = nn.Linear(hidden_dim*2, tagset_size)
-
-     def forward(self, sentences):
-         #embeds = self.embedding(sentences)
-         lstm_out, _ = self.lstm(sentences)
-         tag_scores = self.fc(lstm_out)
-
-         return tag_scores
-
- # Define the FeedForward NN Model
- class FeedForwardNN_NER(nn.Module):
-     def __init__(self, embedding_dim, hidden_dim, tagset_size):
-         super(FeedForwardNN_NER, self).__init__()
-         self.fc1 = nn.Linear(embedding_dim, hidden_dim)
-         self.relu = nn.ReLU()
-         self.fc2 = nn.Linear(hidden_dim, tagset_size)
-
-     def forward(self, x):
-         # x: (batch_size, seq_length, embedding_dim)
-         x = self.fc1(x) # (batch_size, seq_length, hidden_dim)
-         x = self.relu(x)
-         logits = self.fc2(x) # (batch_size, seq_length, tagset_size)
-         return logits
-
-
- def pad(batch):
-     # batch is a list of (X, y) pairs
-     X_batch, y_batch = zip(*batch)
-
-     # Convert to tensors
-     X_batch = [torch.tensor(seq, dtype=torch.float32) for seq in X_batch]
-     y_batch = [torch.tensor(seq, dtype=torch.long) for seq in y_batch]
-
-     # Pad sequences
-     X_padded = pad_sequence(X_batch, batch_first=True, padding_value=PAD_VALUE)
-     y_padded = pad_sequence(y_batch, batch_first=True, padding_value=PAD_VALUE)
-
-     return X_padded, y_padded
-
- def pred_pad(batch):
-     X_batch = [torch.tensor(seq, dtype=torch.float32) for seq in batch]
-     X_padded = pad_sequence(X_batch, batch_first=True, padding_value=PAD_VALUE)
-     return X_padded
-
-
- class Ner_Dataset(Dataset):
-     def __init__(self, X, y):
-         self.X = X
-         self.y = y
-
-     def __len__(self):
-         return len(self.X)
-
-     def __getitem__(self, idx):
-         return self.X[idx], self.y[idx]
-
-
-
-
- class LSTM(BaseEstimator, ClassifierMixin):
-     def __init__(self, embedding_dim = None, hidden_dim = 128, epochs = 5, learning_rate = 0.001, tag2idx = None):
-         self.embedding_dim = embedding_dim
-         self.hidden_dim = hidden_dim
-         self.epochs = epochs
-         self.learning_rate = learning_rate
-         self.tag2idx = tag2idx
-
-
-
-     def fit(self, embedded, encoded_tags):
-         print('LSTM:', flush=True)
-         data = Ner_Dataset(embedded, encoded_tags)
-         train_loader = DataLoader(data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=pad)
-
-         self.model = self.train_LSTM(train_loader)
-         print('--> LSTM trained', flush=True)
-         return self
-
-     def predict(self, X):
-         # Switch to evaluation mode
-
-         test_loader = DataLoader(X, batch_size=1, shuffle=False, collate_fn=pred_pad)
-
-         self.model.eval()
-         predictions = []
-
-         # Iterate through test data
-         with torch.no_grad():
-             for X_batch in test_loader:
-                 X_batch = X_batch.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
-
-                 tag_scores = self.model(X_batch)
-                 _, predicted_tags = torch.max(tag_scores, dim=2)
-
-                 # Flatten the tensors to compare word-by-word
-                 flattened_pred = predicted_tags.view(-1)
-                 predictions.append(flattened_pred.cpu().numpy())
-
-         predictions = np.concatenate(predictions)
-         return predictions
-
-
-     def train_LSTM(self, train_loader, input_dim=None, hidden_dim=128, epochs=5, learning_rate=0.001):
-
-         input_dim = self.embedding_dim
-         # Instantiate the lstm_model
-         lstm_model = BiLSTM_NER(input_dim, hidden_dim=hidden_dim, tagset_size=len(self.tag2idx))
-         lstm_model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
-
-         # Loss function and optimizer
-         loss_function = nn.CrossEntropyLoss(ignore_index=PAD_VALUE) # Ignore padding
-         optimizer = optim.Adam(lstm_model.parameters(), lr=learning_rate)
-         print('--> Training LSTM')
-
-         # Training loop
-         for epoch in range(epochs):
-             total_loss = 0
-             total_correct = 0
-             total_words = 0
-             lstm_model.train() # Set model to training mode
-
-             for batch_idx, (X_batch, y_batch) in enumerate(train_loader):
-                 X_batch, y_batch = X_batch.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu')), y_batch.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
-
-                 # Zero gradients
-                 optimizer.zero_grad()
-
-                 # Forward pass
-                 tag_scores = lstm_model(X_batch)
-
-                 # Reshape and compute loss (ignore padded values)
-                 loss = loss_function(tag_scores.view(-1, len(self.tag2idx)), y_batch.view(-1))
-
-                 # Backward pass and optimization
-                 loss.backward()
-                 optimizer.step()
-
-                 total_loss += loss.item()
-
-                 # Compute accuracy for this batch
-                 # Get the predicted tags (index of max score)
-                 _, predicted_tags = torch.max(tag_scores, dim=2)
-
-                 # Flatten the tensors to compare word-by-word
-                 flattened_pred = predicted_tags.view(-1)
-                 flattened_true = y_batch.view(-1)
-
-                 # Exclude padding tokens from the accuracy calculation
-                 mask = flattened_true != PAD_VALUE
-                 correct = (flattened_pred[mask] == flattened_true[mask]).sum().item()
-
-                 # Count the total words in the batch (ignoring padding)
-                 total_words_batch = mask.sum().item()
-
-                 # Update total correct and total words
-                 total_correct += correct
-                 total_words += total_words_batch
-
-             avg_loss = total_loss / len(train_loader)
-             avg_accuracy = total_correct / total_words * 100 # Accuracy in percentage
-
-             print(f' ==> Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}, Accuracy: {avg_accuracy:.2f}%')
-
-         return lstm_model
-
-
- class FeedforwardNN(BaseEstimator, ClassifierMixin):
-     def __init__(self, embedding_dim = None, hidden_dim = 128, epochs = 5, learning_rate = 0.001, tag2idx = None):
-         self.embedding_dim = embedding_dim
-         self.hidden_dim = hidden_dim
-         self.epochs = epochs
-         self.learning_rate = learning_rate
-         self.tag2idx = tag2idx
-
-
-
-     def fit(self, embedded, encoded_tags):
-         print('Feed Forward NN: ', flush=True)
-         data = Ner_Dataset(embedded, encoded_tags)
-         train_loader = DataLoader(data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=pad)
-
-         self.model = self.train_FF(train_loader)
-         print('--> Feed Forward trained', flush=True)
-         return self
-
-     def predict(self, X):
-         # Switch to evaluation mode
-
-         test_loader = DataLoader(X, batch_size=1, shuffle=False, collate_fn=pred_pad)
-
-         self.model.eval()
-         predictions = []
-
-         # Iterate through test data
-         with torch.no_grad():
-             for X_batch in test_loader:
-                 X_batch = X_batch.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
-
-                 tag_scores = self.model(X_batch)
-                 _, predicted_tags = torch.max(tag_scores, dim=2)
-
-                 # Flatten the tensors to compare word-by-word
-                 flattened_pred = predicted_tags.view(-1)
-                 predictions.append(flattened_pred.cpu().numpy())
-
-         predictions = np.concatenate(predictions)
-         return predictions
-
-
-     def train_FF(self, train_loader, input_dim=None, hidden_dim=128, epochs=5, learning_rate=0.001):
-
-         input_dim = self.embedding_dim
-         # Instantiate the lstm_model
-         ff_model = FeedForwardNN_NER(self.embedding_dim, hidden_dim=hidden_dim, tagset_size=len(self.tag2idx))
-         ff_model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
-
-         # Loss function and optimizer
-         loss_function = nn.CrossEntropyLoss(ignore_index=PAD_VALUE) # Ignore padding
-         optimizer = optim.Adam(ff_model.parameters(), lr=learning_rate)
-         print('--> Training FF')
-
-         # Training loop
-         for epoch in range(epochs):
-             total_loss = 0
-             total_correct = 0
-             total_words = 0
-             ff_model.train() # Set model to training mode
-
-             for batch_idx, (X_batch, y_batch) in enumerate(train_loader):
-                 X_batch, y_batch = X_batch.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu')), y_batch.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
-
-                 # Zero gradients
-                 optimizer.zero_grad()
-
-                 # Forward pass
-                 tag_scores = ff_model(X_batch)
-
-                 # Reshape and compute loss (ignore padded values)
-                 loss = loss_function(tag_scores.view(-1, len(self.tag2idx)), y_batch.view(-1))
-
-                 # Backward pass and optimization
-                 loss.backward()
-                 optimizer.step()
-
-                 total_loss += loss.item()
-
-                 # Compute accuracy for this batch
-                 # Get the predicted tags (index of max score)
-                 _, predicted_tags = torch.max(tag_scores, dim=2)
-
-                 # Flatten the tensors to compare word-by-word
-                 flattened_pred = predicted_tags.view(-1)
-                 flattened_true = y_batch.view(-1)
-
-                 # Exclude padding tokens from the accuracy calculation
-                 mask = flattened_true != PAD_VALUE
-                 correct = (flattened_pred[mask] == flattened_true[mask]).sum().item()
-
-                 # Count the total words in the batch (ignoring padding)
-                 total_words_batch = mask.sum().item()
-
-                 # Update total correct and total words
-                 total_correct += correct
-                 total_words += total_words_batch
-
-             avg_loss = total_loss / len(train_loader)
-             avg_accuracy = total_correct / total_words * 100 # Accuracy in percentage
-
-             print(f' ==> Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}, Accuracy: {avg_accuracy:.2f}%')
-
-         return ff_model
-
- crf = sklearn_crfsuite.CRF(
-     algorithm='lbfgs',
-     c1=0.1,
-     c2=0.1,
-     max_iterations=100,
-     all_possible_transitions=True)
-
+ import pandas as pd
+ import numpy as np
+ import random
+ import torch
+ import torch.nn as nn
+ import torch.optim as optim
+ from transformers import BertTokenizer, BertModel
+ from seqeval.metrics import accuracy_score, f1_score, classification_report
+ from seqeval.scheme import IOB2
+ import sklearn_crfsuite
+ from sklearn_crfsuite import metrics
+ from sklearn.metrics.pairwise import cosine_similarity
+ from gensim.models import Word2Vec, KeyedVectors
+ from sklearn.pipeline import Pipeline
+ from sklearn.preprocessing import LabelEncoder
+ from torch.utils.data import Dataset, DataLoader
+ from torch.nn.utils.rnn import pad_sequence
+ from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ import gensim.downloader as api
+ from itertools import product
+ from sklearn.model_selection import train_test_split, GridSearchCV
+ from joblib import dump
+
+
+ class preprocess_sentences():
+     def __init__(self):
+         pass
+
+     def fit(self, X, y=None):
+         print('PREPROCESSING')
+         return self
+
+     def transform(self, X):
+         # X = train['tokens'], y =
+         sentences = X.apply(lambda x: x.tolist()).tolist()
+         print('--> Preprocessing complete \n', flush=True)
+         return sentences
+
+ EMBEDDING_DIM = 500
+ PAD_VALUE= -1
+ MAX_LENGTH = 376
+ BATCH_SIZE = 16
+
+ class Word2VecTransformer():
+     def __init__(self, vector_size = EMBEDDING_DIM, window = 5, min_count = 1, workers = 1, embedding_dim=EMBEDDING_DIM):
+         self.model = None
+         self.vector_size = vector_size
+         self.window = window
+         self.min_count = min_count
+         self.workers = workers
+         self.embedding_dim = embedding_dim
+
+     def fit(self, X, y):
+         # https://stackoverflow.com/questions/17242456/python-print-sys-stdout-write-not-visible-when-using-logging
+         # https://stackoverflow.com/questions/230751/how-can-i-flush-the-output-of-the-print-function
+         print('WORD2VEC:', flush=True)
+         # This fits the word2vec model
+         self.model = Word2Vec(sentences = X, vector_size=self.vector_size, window=self.window
+                               , min_count=self.min_count, workers=self.workers)
+         print('--> Word2Vec Fitted', flush=True)
+         return self
+
+     def transform(self, X):
+         # This bit should transform the sentences
+         embedded_sentences = []
+
+         for sentence in X:
+             sentence_vectors = []
+
+             for word in sentence:
+                 if word in self.model.wv:
+                     vec = self.model.wv[word]
+                 else:
+                     vec = np.random.normal(scale=0.6, size=(self.embedding_dim,))
+
+                 sentence_vectors.append(vec)
+
+             embedded_sentences.append(torch.tensor(sentence_vectors, dtype=torch.float32))
+         print('--> Embeddings Complete \n', flush=True)
+
+         return embedded_sentences
+
+ class Word2VecTransformer_CRF():
+     def __init__(self, vector_size = EMBEDDING_DIM, window = 5, min_count = 1, workers = 1, embedding_dim=EMBEDDING_DIM):
+         self.model = None
+         self.vector_size = vector_size
+         self.window = window
+         self.min_count = min_count
+         self.workers = workers
+         self.embedding_dim = embedding_dim
+
+     def fit(self, X, y):
+         # https://stackoverflow.com/questions/17242456/python-print-sys-stdout-write-not-visible-when-using-logging
+         # https://stackoverflow.com/questions/230751/how-can-i-flush-the-output-of-the-print-function
+         print('WORD2VEC:', flush=True)
+         # This fits the word2vec model
+         self.model = Word2Vec(sentences = X, vector_size=self.vector_size, window=self.window
+                               , min_count=self.min_count, workers=self.workers)
+         print('--> Word2Vec Fitted', flush=True)
+         return self
+
+     def transform(self, X):
+         # This bit should transform the sentences
+         embedded_sentences = []
+
+         for sentence in X:
+             sentence_vectors = []
+
+             for word in sentence:
+                 features = {
+                     'bias': 1.0,
+                     'word.lower()': word.lower(),
+                     'word[-3:]': word[-3:],
+                     'word[-2:]': word[-2:],
+                     'word.isupper()': word.isupper(),
+                     'word.istitle()': word.istitle(),
+                     'word.isdigit()': word.isdigit(),
+                 }
+                 if word in self.model.wv:
+                     vec = self.model.wv[word]
+                 else:
+                     vec = np.random.normal(scale=0.6, size=(self.embedding_dim,))
+
+                 # https://stackoverflow.com/questions/58736548/how-to-use-word-embedding-as-features-for-crf-sklearn-crfsuite-model-training
+                 for index in range(len(vec)):
+                     features[f"embedding_{index}"] = vec[index]
+
+                 sentence_vectors.append(features)
+
+             embedded_sentences.append(sentence_vectors)
+         print('--> Embeddings Complete \n', flush=True)
+
+         return embedded_sentences
+
+ class tfidfTransformer(BaseEstimator, TransformerMixin):
+     def __init__(self):
+         self.model = None
+         self.embedding_dim = None
+         self.idf = None
+         self.vocab_size = None
+         self.vocab = None
+
+     def fit(self, X, y = None):
+         print('TFIDF:', flush=True)
+         joined_sentences = [' '.join(tokens) for tokens in X]
+         self.model = TfidfVectorizer()
+         self.model.fit(joined_sentences)
+         self.vocab = self.model.vocabulary_
+         self.idf = self.model.idf_
+         self.vocab_size = len(self.vocab)
+         self.embedding_dim = self.vocab_size
+         print('--> TFIDF Fitted', flush=True)
+         return self
+
+     def transform(self, X):
+
+         embedded = []
+         for sentence in X:
+             sent_vecs = []
+             token_counts = {}
+             for word in sentence:
+                 token_counts[word] = token_counts.get(word, 0) + 1
+
+             sent_len = len(sentence)
+             for word in sentence:
+                 vec = np.zeros(self.vocab_size)
+                 if word in self.vocab:
+                     tf = token_counts[word] / sent_len
+                     token_idx = self.vocab[word]
+                     vec[token_idx] = tf * self.idf[token_idx]
+                 sent_vecs.append(vec)
+             embedded.append(torch.tensor(sent_vecs, dtype=torch.float32))
+         print('--> Embeddings Complete \n', flush=True)
+
+
+         return embedded
+
+ class GloveTransformer(BaseEstimator, TransformerMixin):
+     def __init__(self):
+         self.model = None
+         self.embedding_dim = 300
+
+     def fit(self, X, y=None):
+         print('GLOVE', flush = True)
+         self.model = api.load('glove-wiki-gigaword-300')
+         print('--> Glove Downloaded', flush=True)
+         return self
+
+     def transform(self, X):
+         # This bit should transform the sentences
+         print('--> Beginning embeddings', flush=True)
+         embedded_sentences = []
+
+         for sentence in X:
+             sentence_vectors = []
+
+             for word in sentence:
+                 if word in self.model:
+                     vec = self.model[word]
+                 else:
+                     vec = np.random.normal(scale=0.6, size=(self.embedding_dim,))
+
+                 sentence_vectors.append(vec)
+
+             embedded_sentences.append(torch.tensor(sentence_vectors, dtype=torch.float32))
+         print('--> Embeddings Complete \n', flush=True)
+
+         return embedded_sentences
+
+ class Bio2VecTransformer():
+     def __init__(self, vector_size = 200, window = 5, min_count = 1, workers = 1, embedding_dim=200):
+         self.model = None
+         self.vector_size = vector_size
+         self.window = window
+         self.min_count = min_count
+         self.workers = workers
+         self.embedding_dim = embedding_dim
+
+     def fit(self, X, y):
+         print('BIO2VEC:', flush=True)
+         # https://stackoverflow.com/questions/58055415/how-to-load-bio2vec-in-gensim
+         self.model = Bio2VecModel
+         print('--> BIO2VEC Fitted', flush=True)
+         return self
+
+     def transform(self, X):
+         # This bit should transform the sentences
+         embedded_sentences = []
+
+         for sentence in X:
+             sentence_vectors = []
+
+             for word in sentence:
+                 if word in self.model:
+                     vec = self.model[word]
+                 else:
+                     vec = np.random.normal(scale=0.6, size=(self.embedding_dim,))
+
+                 sentence_vectors.append(vec)
+
+             embedded_sentences.append(torch.tensor(sentence_vectors, dtype=torch.float32))
+         print('--> Embeddings Complete \n', flush=True)
+
+         return embedded_sentences
+
+ class BiLSTM_NER(nn.Module):
+     def __init__(self,input_dim, hidden_dim, tagset_size):
+         super(BiLSTM_NER, self).__init__()
+
+         # Embedding layer
+         #Freeze= false means that it will fine tune
+         #self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze = False, padding_idx=-1)
+
+         self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True, bidirectional=True)
+         self.fc = nn.Linear(hidden_dim*2, tagset_size)
+
+     def forward(self, sentences):
+         #embeds = self.embedding(sentences)
+         lstm_out, _ = self.lstm(sentences)
+         tag_scores = self.fc(lstm_out)
+
+         return tag_scores
+
+ def pad(batch):
+     # batch is a list of (X, y) pairs
+     X_batch, y_batch = zip(*batch)
+
+     # Convert to tensors
+     X_batch = [torch.tensor(seq, dtype=torch.float32) for seq in X_batch]
+     y_batch = [torch.tensor(seq, dtype=torch.long) for seq in y_batch]
+
+     # Pad sequences
+     X_padded = pad_sequence(X_batch, batch_first=True, padding_value=PAD_VALUE)
+     y_padded = pad_sequence(y_batch, batch_first=True, padding_value=PAD_VALUE)
+
+     return X_padded, y_padded
+
+ def pred_pad(batch):
+     X_batch = [torch.tensor(seq, dtype=torch.float32) for seq in batch]
+     X_padded = pad_sequence(X_batch, batch_first=True, padding_value=PAD_VALUE)
+     return X_padded
+
+ class Ner_Dataset(Dataset):
+     def __init__(self, X, y):
+         self.X = X
+         self.y = y
+
+     def __len__(self):
+         return len(self.X)
+
+     def __getitem__(self, idx):
+         return self.X[idx], self.y[idx]
+
+
+ class LSTM(BaseEstimator, ClassifierMixin):
+     def __init__(self, embedding_dim = None, hidden_dim = 128, epochs = 5, learning_rate = 0.001, tag2idx = None):
+         self.embedding_dim = embedding_dim
+         self.hidden_dim = hidden_dim
+         self.epochs = epochs
+         self.learning_rate = learning_rate
+         self.tag2idx = tag2idx
+
+
+
+     def fit(self, embedded, encoded_tags):
+         #print('LSTM started:', flush=True)
+         data = Ner_Dataset(embedded, encoded_tags)
+         train_loader = DataLoader(data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=pad)
+
+         self.model = self.train_LSTM(train_loader)
+         #print('--> Epochs: ', self.epochs, flush=True)
+         #print('--> Learning Rate: ', self.learning_rate)
+         return self
+
+     def predict(self, X):
+         # Switch to evaluation mode
+
+         test_loader = DataLoader(X, batch_size=1, shuffle=False, collate_fn=pred_pad)
+
+         self.model.eval()
+         predictions = []
+
+         # Iterate through test data
+         with torch.no_grad():
+             for X_batch in test_loader:
+                 X_batch = X_batch.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
+
+                 tag_scores = self.model(X_batch)
+                 _, predicted_tags = torch.max(tag_scores, dim=2)
+
+                 flattened_pred = predicted_tags.view(-1)
+
+                 predictions.append(list(flattened_pred.cpu().numpy()))
+
+
+         #print('before concat',predictions)
+         #predictions = np.concatenate(predictions)
+         #print('after concat',predictions)
+
+         tag_encoder = LabelEncoder()
+         tag_encoder.fit(['B-AC', 'O', 'B-LF', 'I-LF'])
+
+         str_pred = []
+         for sentence in predictions:
+             str_sentence = tag_encoder.inverse_transform(sentence)
+             str_pred.append(list(str_sentence))
+         return str_pred
+
+
+     def train_LSTM(self, train_loader):
+
+         input_dim = self.embedding_dim
+         # Instantiate the lstm_model
+         lstm_model = BiLSTM_NER(input_dim, hidden_dim=self.hidden_dim, tagset_size=len(self.tag2idx))
+         lstm_model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
+
+         # Loss function and optimizer
+         loss_function = nn.CrossEntropyLoss(ignore_index=PAD_VALUE) # Ignore padding
+         optimizer = optim.Adam(lstm_model.parameters(), lr=self.learning_rate)
+         #print('--> Training LSTM')
+
+         # Training loop
+         for epoch in range(self.epochs):
+             total_loss = 0
+             total_correct = 0
+             total_words = 0
+             lstm_model.train() # Set model to training mode
+
+             for batch_idx, (X_batch, y_batch) in enumerate(train_loader):
+                 X_batch, y_batch = X_batch.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu')), y_batch.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
+
+                 # Zero gradients
+                 optimizer.zero_grad()
+
+                 # Forward pass
+                 tag_scores = lstm_model(X_batch)
+
+                 # Reshape and compute loss (ignore padded values)
+                 loss = loss_function(tag_scores.view(-1, len(self.tag2idx)), y_batch.view(-1))
+
+                 # Backward pass and optimization
+                 loss.backward()
+                 optimizer.step()
+
+                 total_loss += loss.item()
+
+                 # Compute accuracy for this batch
+                 # Get the predicted tags (index of max score)
+                 _, predicted_tags = torch.max(tag_scores, dim=2)
+
+                 # Flatten the tensors to compare word-by-word
+                 flattened_pred = predicted_tags.view(-1)
+                 flattened_true = y_batch.view(-1)
+
+                 # Exclude padding tokens from the accuracy calculation
+                 mask = flattened_true != PAD_VALUE
+                 correct = (flattened_pred[mask] == flattened_true[mask]).sum().item()
+
+                 # Count the total words in the batch (ignoring padding)
+                 total_words_batch = mask.sum().item()
+
+                 # Update total correct and total words
+                 total_correct += correct
+                 total_words += total_words_batch
+
+             avg_loss = total_loss / len(train_loader)
+             avg_accuracy = total_correct / total_words * 100 # Accuracy in percentage
+
+             #print(f' ==> Epoch {epoch + 1}/{self.epochs}, Loss: {avg_loss:.4f}, Accuracy: {avg_accuracy:.2f}%')
+
+         return lstm_model
+
+
+ # Define the FeedForward NN Model
+ class FeedForwardNN_NER(nn.Module):
+     def __init__(self, embedding_dim, hidden_dim, tagset_size):
+         super(FeedForwardNN_NER, self).__init__()
+         self.fc1 = nn.Linear(embedding_dim, hidden_dim)
+         self.relu = nn.ReLU()
+         self.fc2 = nn.Linear(hidden_dim, tagset_size)
+
+     def forward(self, x):
+         x = self.fc1(x)
+         x = self.relu(x)
+         logits = self.fc2(x)
+         return logits
+
+
+
+ class FeedforwardNN(BaseEstimator, ClassifierMixin):
+     def __init__(self, embedding_dim = None, hidden_dim = 128, epochs = 5, learning_rate = 0.001, tag2idx = None):
+         self.embedding_dim = embedding_dim
+         self.hidden_dim = hidden_dim
+         self.epochs = epochs
+         self.learning_rate = learning_rate
+         self.tag2idx = tag2idx
+
+
+
+     def fit(self, embedded, encoded_tags):
+         print('Feed Forward NN: ', flush=True)
+         data = Ner_Dataset(embedded, encoded_tags)
+         train_loader = DataLoader(data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=pad)
+
+         self.model = self.train_FF(train_loader)
+         print('--> Feed Forward trained', flush=True)
+         return self
+
+     def predict(self, X):
+         # Switch to evaluation mode
+
+         test_loader = DataLoader(X, batch_size=1, shuffle=False, collate_fn=pred_pad)
+
+         self.model.eval()
+         predictions = []
+
+         # Iterate through test data
+         with torch.no_grad():
+             for X_batch in test_loader:
+                 X_batch = X_batch.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
+
+                 tag_scores = self.model(X_batch)
+                 _, predicted_tags = torch.max(tag_scores, dim=2)
+
+                 # Flatten the tensors to compare word-by-word
+                 flattened_pred = predicted_tags.view(-1)
+                 predictions.append(flattened_pred.cpu().numpy())
+
+         str_pred = []
+         # Decode integer predictions back to tag strings; this encoder must match the
+         # label set used when the training tags were encoded (as in LSTM.predict above).
+         tag_encoder = LabelEncoder()
+         tag_encoder.fit(['B-AC', 'O', 'B-LF', 'I-LF'])
+         for sentence in predictions:
+             str_sentence = tag_encoder.inverse_transform(sentence)
+             str_pred.append(list(str_sentence))
+         return str_pred
+
+
+     def train_FF(self, train_loader):
+
+
+
+         # Instantiate the lstm_model
+         ff_model = FeedForwardNN_NER(self.embedding_dim, hidden_dim=self.hidden_dim, tagset_size=len(self.tag2idx))
+         ff_model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
+
+         # Loss function and optimizer
+         loss_function = nn.CrossEntropyLoss(ignore_index=PAD_VALUE) # Ignore padding
+         optimizer = optim.Adam(ff_model.parameters(), lr=self.learning_rate)
+         print('--> Training FF')
+
+         # Training loop
+         for epoch in range(self.epochs):
+             total_loss = 0
+             total_correct = 0
+             total_words = 0
+             ff_model.train() # Set model to training mode
+
+             for batch_idx, (X_batch, y_batch) in enumerate(train_loader):
+                 X_batch, y_batch = X_batch.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu')), y_batch.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
+
+                 # Zero gradients
+                 optimizer.zero_grad()
+
+                 # Forward pass
+                 tag_scores = ff_model(X_batch)
+
+                 # Reshape and compute loss (ignore padded values)
+                 loss = loss_function(tag_scores.view(-1, len(self.tag2idx)), y_batch.view(-1))
+
+                 # Backward pass and optimization
+                 loss.backward()
+                 optimizer.step()
+
+                 total_loss += loss.item()
+
+                 # Compute accuracy for this batch
+                 # Get the predicted tags (index of max score)
+                 _, predicted_tags = torch.max(tag_scores, dim=2)
+
+                 # Flatten the tensors to compare word-by-word
+                 flattened_pred = predicted_tags.view(-1)
+                 flattened_true = y_batch.view(-1)
+
+                 # Exclude padding tokens from the accuracy calculation
+                 mask = flattened_true != PAD_VALUE
+                 correct = (flattened_pred[mask] == flattened_true[mask]).sum().item()
+
+                 # Count the total words in the batch (ignoring padding)
+                 total_words_batch = mask.sum().item()
+
+                 # Update total correct and total words
+                 total_correct += correct
+                 total_words += total_words_batch
+
+             avg_loss = total_loss / len(train_loader)
+             avg_accuracy = total_correct / total_words * 100 # Accuracy in percentage
+
+             print(f' ==> Epoch {epoch + 1}/{self.epochs}, Loss: {avg_loss:.4f}, Accuracy: {avg_accuracy:.2f}%')
+
+         return ff_model
+
+ crf = sklearn_crfsuite.CRF(
+     algorithm='lbfgs',
+     c1=0.1,
+     c2=0.1,
+     max_iterations=100,
+     all_possible_transitions=True)
+
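Not part of the diff: a minimal usage sketch of how the pieces above might be chained together downstream. The toy DataFrame, the column names 'tokens' and 'tags', and the assumption that customFunctions.py is importable are illustrative only; the tag set mirrors the one hard-coded in LSTM.predict.

# Illustrative sketch only (toy data; assumes customFunctions.py is on the import path).
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

from customFunctions import EMBEDDING_DIM, LSTM, Word2VecTransformer, preprocess_sentences

# Tiny hand-made example; a real run would load the project's token/tag corpus instead.
train = pd.DataFrame({
    'tokens': [np.array(['The', 'AC', 'stands', 'for', 'alternating', 'current']),
               np.array(['BP', 'means', 'blood', 'pressure'])],
    'tags': [['O', 'B-AC', 'O', 'O', 'B-LF', 'I-LF'],
             ['B-AC', 'O', 'B-LF', 'I-LF']],
})

# Encode tags with the same label set that LSTM.predict later decodes back to strings.
tag_encoder = LabelEncoder().fit(['B-AC', 'O', 'B-LF', 'I-LF'])
encoded_tags = [tag_encoder.transform(sent) for sent in train['tags']]
tag2idx = {tag: idx for idx, tag in enumerate(tag_encoder.classes_)}

pipeline = Pipeline([
    ('preprocess', preprocess_sentences()),                        # Series of token arrays -> list of token lists
    ('embed', Word2VecTransformer()),                              # token lists -> per-sentence embedding tensors
    ('model', LSTM(embedding_dim=EMBEDDING_DIM, tag2idx=tag2idx))  # BiLSTM tagger over the embeddings
])

pipeline.fit(train['tokens'], encoded_tags)
print(pipeline.predict(train['tokens']))  # list of tag-string sequences, one per sentence

The other embedders added here (GloveTransformer, tfidfTransformer, Bio2VecTransformer, and Word2VecTransformer_CRF feeding the module-level crf estimator) are meant to slot into the same embedding step, and FeedforwardNN follows the same fit/predict shape as LSTM.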