SmitaGautam committed on
Commit
bb65e7e
·
verified ·
1 Parent(s): eba819c

Upload 5 files

Browse files
Files changed (5) hide show
  1. app.py +28 -0
  2. ner_svm_4_withpos_kaggle.pkl +3 -0
  3. requirements.txt +5 -0
  4. svm_predict.py +24 -0
  5. train.py +233 -0
app.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from svm_predict import predict
3
+
4
def process_sentence(sentence):
    """Tag every token of *sentence* and render the result as HTML.

    Args:
        sentence: Raw text typed by the user.

    Returns:
        A single string in which each token is emitted as
        ``<span green>word</span>_<span blue>tag</span>``, with the
        word/tag pairs separated by single spaces.
    """
    # Local import keeps this block self-contained (stdlib only).
    from html import escape

    words, tags = predict(sentence)
    # The words come straight from user input and the result is displayed by
    # an HTML component, so escape them to keep markup such as "<script>"
    # from being interpreted by the browser.
    return " ".join(
        f"<span style='color:green;'>{escape(str(word))}</span>_"
        f"<span style='color:blue;'>{escape(str(tag))}</span>"
        for word, tag in zip(words, tags)
    )
7
+
8
# Single-function Gradio UI: a multi-line textbox in, tagged HTML out.
iface = gr.Interface(
    fn=process_sentence,
    # elem_id ties the textbox to the "#input-box" rule in the CSS below;
    # without it that rule matches nothing.
    inputs=gr.Textbox(label="Enter a sentence", lines=4, elem_id="input-box"),
    outputs=gr.HTML(label="NEI tagged sentence", elem_id="output-box"),
    css="""
    #input-box {
        width: 50%;
        height: 150px;
    }
    #output-box {
        overflow-y: scroll; /* Always allow vertical scrolling */
        padding: 10px;
        border-radius: 5px;
        box-sizing: border-box; /* Ensures padding is included */
        white-space: pre-wrap; /* Ensure the text wraps to avoid horizontal scrolling */
    }
    """,
    # Only run the tagger when the user submits, not on every keystroke.
    live=False,
)

iface.launch()
ner_svm_4_withpos_kaggle.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c2e68481dbc9bc18616af8926d7d3cd95733ea8e31bd877314a9493ceb999b1
3
+ size 19938658
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio
2
+ nltk
3
+ seaborn
4
+ joblib
5
+ numpy
svm_predict.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import nltk
2
+ from nltk import word_tokenize
3
+ from nltk import pos_tag
4
+ import joblib
5
+ from train import feature_vector, pos_tags
6
+
7
# Classifier loaded once at import time and shared by every predict() call.
# NOTE(review): joblib.load unpickles arbitrary objects; only load model
# files that come from a trusted source.
model = joblib.load('ner_svm_4_withpos_kaggle.pkl')

# NLTK resources used at prediction time: word_tokenize relies on the Punkt
# tokenizer data and pos_tag on the averaged-perceptron tagger. Without the
# tokenizer data the first request fails with a LookupError.
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
9
+
10
def predict(sentence):
    """Tokenize *sentence* and classify every token with the loaded model.

    Each token is described by ``feature_vector`` using its own POS tag and
    the POS tags of its left and right neighbours. A tag is passed as its
    position in ``pos_tags``; ``-1`` is used when the neighbour does not
    exist or when the tag is not listed in ``pos_tags``.

    Args:
        sentence: Raw input text.

    Returns:
        A ``(tokens, predictions)`` pair of equal length, where
        ``predictions[i]`` is the model output for ``tokens[i]``.
    """
    tokens = word_tokenize(sentence)
    if not tokens:
        # Nothing to classify; avoid handing the model an empty batch.
        return tokens, []

    # One dictionary lookup per token instead of repeated list.index scans.
    tag_to_index = {tag: position for position, tag in enumerate(pos_tags)}
    tag_codes = [tag_to_index.get(tag, -1) for _, tag in pos_tag(tokens)]

    last = len(tokens) - 1
    features = [
        feature_vector(
            word,
            tag_codes[idx - 1] if idx > 0 else -1,
            tag_codes[idx + 1] if idx < last else -1,
            tag_codes[idx],
        )
        for idx, word in enumerate(tokens)
    ]

    # Classify the whole sentence in one call rather than once per token.
    predictions = list(model.predict(features))
    return tokens, predictions
train.py ADDED
@@ -0,0 +1,233 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_dataset
2
+ import numpy as np
3
+ from sklearn.svm import SVC
4
+ from tqdm.notebook import tqdm
5
+ from sklearn.preprocessing import StandardScaler
6
+ from sklearn.metrics import classification_report
7
+ import nltk
8
+ from nltk.corpus import stopwords
9
+ from nltk import word_tokenize
10
+ from nltk import pos_tag
11
+ import pickle
12
+ import time
13
+ from nltk.corpus import names, gazetteers
14
+ from sklearn.model_selection import KFold
15
+ from itertools import chain
16
+ from sklearn.metrics import precision_score, recall_score, fbeta_score, confusion_matrix
17
+ import matplotlib.pyplot as plt
18
+ import seaborn as sns
19
+
20
+
21
nltk.download('stopwords')
# NOTE: this rebinding intentionally keeps the name `stopwords` (the feature
# functions look it up), which shadows the imported corpus reader from here on.
# A frozenset gives O(1) membership tests; the lookup runs once per token.
stopwords = frozenset(stopwords.words('english'))

# POS tags recognised by the feature extractor. A tag's position in this list
# is the index passed to feature_vector, so the order must not change.
pos_tags = [
    'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS',
    'NNS', 'NN|SYM', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD',
    'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB',
]
28
+
29
def feature_vector(word, prev_word_pos_tag, next_word_pos_tag, current_word_pos_tag):
    """Build the 116-dimensional float32 feature vector for one token.

    Layout:
        0        1 if the word is title-cased
        1        1 if the lower-cased word is a stop word
        2        1 if the word is all upper-case
        3        length of the word
        4        1 if the word consists only of digits
        5-41     one-hot POS tag of the previous word
        42-78    one-hot POS tag of the next word
        79-115   one-hot POS tag of the current word

    Args:
        word: The token text.
        prev_word_pos_tag: POS index (0..36) of the previous token, or a
            value outside that range when there is none.
        next_word_pos_tag: POS index (0..36) of the next token, or a value
            outside that range when there is none.
        current_word_pos_tag: POS index (0..36) of this token, or a value
            outside that range when its tag is unknown.

    Returns:
        numpy.ndarray of shape (116,) and dtype float32.
    """
    num_pos_tags = 37  # width of each one-hot block: 5 + 3 * 37 == 116

    vec = np.zeros(116).astype('float32')
    if word.istitle():
        vec[0] = 1
    if word.lower() in stopwords:
        vec[1] = 1
    if word.isupper():
        vec[2] = 1
    vec[3] = len(word)
    vec[4] = word.isdigit()

    # Only indices inside 0..36 set a bit. Any other value (for example the
    # -1 sentinel, or a sentinel that was shifted to another negative number
    # by the caller) means "no tag" and must not wrap around or spill into a
    # neighbouring block.
    for offset, tag_index in (
        (5, prev_word_pos_tag),
        (42, next_word_pos_tag),
        (79, current_word_pos_tag),
    ):
        if 0 <= tag_index < num_pos_tags:
            vec[offset + tag_index] = 1

    return vec
50
+
51
+
52
def feature_vector2(word, prev_word_pos_tag, next_word_pos_tag, current_word_pos_tag):
    """Build a 9-dimensional float32 feature vector for one token.

    The three POS arguments are accepted for signature parity with
    ``feature_vector`` but are not used here.

    Layout:
        0  word is title-cased
        1  lower-cased word is a stop word
        2  word is all upper-case
        3  length of the word
        4  word consists only of digits
        5  word is in ``places``
        6  word is in ``people``
        7  word is in ``countries``
        8  word is in ``nationalities``

    The four gazetteer collections are read from module globals.
    """
    features = (
        word.istitle(),
        word.lower() in stopwords,
        word.isupper(),
        len(word),
        word.isdigit(),
        word in places,
        word in people,
        word in countries,
        word in nationalities,
    )
    return np.array(features, dtype='float32')
77
+
78
+
79
+ # This function is used to make dataset with features and target label
80
+
81
def create_data(data):
    """Turn a dataset split into per-token feature vectors and binary labels.

    Args:
        data: Iterable of samples, each providing parallel sequences under
            the keys ``'tokens'``, ``'pos_tags'`` (integer ids) and
            ``'ner_tags'`` (integer ids).

    Returns:
        ``(x_train, y_train)``: a list with one feature vector per token and
        a list of labels, 1 when the token's NER id is non-zero, else 0.
    """
    x_train = []
    y_train = []
    for sample in data:
        tokens = sample['tokens']
        ner_ids = sample['ner_tags']
        # POS ids below 10 carry no feature (presumably the punctuation and
        # symbol tags of the dataset's label set; confirm). The remaining ids
        # are shifted to a 0-based index. The shift is applied only to real
        # tags so the "no tag" sentinel stays exactly -1, instead of becoming
        # -11 and being mistaken for a valid index downstream.
        pos_index = [-1 if tag < 10 else tag - 10 for tag in sample['pos_tags']]

        last = len(tokens) - 1
        for i, token in enumerate(tokens):
            prev_pos = pos_index[i - 1] if i > 0 else -1
            next_pos = pos_index[i + 1] if i < last else -1
            x_train.append(feature_vector(token, prev_pos, next_pos, pos_index[i]))
            y_train.append(1 if ner_ids[i] != 0 else 0)
    return x_train, y_train
93
+
94
def evaluate_overall_metrics(predictions, folds):
    """Print weighted precision, recall and F-scores averaged over folds.

    Args:
        predictions: Iterable of ``(true_labels, predicted_labels)`` pairs,
            one per evaluated fold, each already flat.
        folds: Divisor applied to every accumulated score.
    """
    totals = {'precision': 0, 'recall': 0, 'f0_5': 0, 'f1': 0, 'f2': 0}

    for y_true, y_hat in predictions:
        totals['f0_5'] += fbeta_score(y_true, y_hat, beta=0.5, average='weighted')
        totals['f1'] += fbeta_score(y_true, y_hat, beta=1, average='weighted')
        totals['f2'] += fbeta_score(y_true, y_hat, beta=2, average='weighted')
        totals['precision'] += precision_score(y_true, y_hat, average='weighted')
        totals['recall'] += recall_score(y_true, y_hat, average='weighted')

    # Mean of each score across folds.
    mean = {name: total / folds for name, total in totals.items()}

    print('Overall Metrics:')
    print(f"Precision : {mean['precision']:.3f}")
    print(f"Recall : {mean['recall']:.3f}")
    print(f"F0.5 Score : {mean['f0_5']:.3f}")
    print(f"F1 Score : {mean['f1']:.3f}")
    print(f"F2 Score : {mean['f2']:.3f}\n")
121
+
122
def evaluate_per_pos_metrics(predictions, labels):
    """Print one-vs-rest precision, recall and F1 for every label.

    Args:
        predictions: Iterable of ``(true_labels, predicted_labels)`` pairs;
            all pairs are pooled before scoring.
        labels: Labels to report on, in print order.
    """
    # Pool the folds into two flat sequences.
    pooled_true = []
    pooled_pred = []
    for y_true, y_hat in predictions:
        pooled_true += list(y_true)
        pooled_pred += list(y_hat)

    for tag in labels:
        # Reduce the problem to "is this tag" versus "anything else".
        is_tag_true = [1 if value == tag else 0 for value in pooled_true]
        is_tag_pred = [1 if value == tag else 0 for value in pooled_pred]

        scores = {
            'Precision': precision_score(is_tag_true, is_tag_pred, average='binary', zero_division=0),
            'Recall': recall_score(is_tag_true, is_tag_pred, average='binary', zero_division=0),
            'F1 Score': fbeta_score(is_tag_true, is_tag_pred, beta=1, average='binary', zero_division=0),
        }

        print(f"Metrics for {tag}:")
        print(f"Precision : {scores['Precision']:.3f}")
        print(f"Recall : {scores['Recall']:.3f}")
        print(f"F1 Score : {scores['F1 Score']:.3f}\n")
145
+
146
def plot_confusion_matrix(predictions, labels, folds):
    """Plot the row-normalized confusion matrix accumulated over folds.

    Args:
        predictions: Iterable of ``(true_labels, predicted_labels)`` pairs,
            one per evaluated fold.
        labels: Label order used for both axes of the matrix.
        folds: Divisor applied to the summed matrix before normalization.

    Raises:
        ValueError: If *predictions* is empty, since there is nothing to plot.
    """
    matrix = None
    for y_true, y_hat in predictions:
        fold_matrix = confusion_matrix(y_true, y_hat, labels=labels)
        matrix = fold_matrix if matrix is None else matrix + fold_matrix

    if matrix is None:
        raise ValueError("predictions is empty; no confusion matrix to plot")

    matrix = matrix.astype('float') / folds
    row_totals = np.sum(matrix, axis=1, keepdims=True)
    # Normalize each row to sum to 1. A label with no true samples has a zero
    # row total; leave that row at 0 instead of producing NaN cells.
    matrix = np.divide(
        matrix,
        row_totals,
        out=np.zeros_like(matrix),
        where=row_totals != 0,
    )

    plt.figure(figsize=(10, 8))
    sns.heatmap(matrix, annot=True, fmt=".2f", cmap='Blues', xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Normalized Confusion Matrix for NER')
    plt.show()
169
+
170
if __name__ == "__main__":
    # Training script: build features for every CoNLL-2003 split, pool them,
    # fit an SVC under K-fold cross-validation and report metrics.
    data = load_dataset("conll2003", trust_remote_code=True)
    d_train = data['train']
    d_validation = data['validation']
    d_test = data['test']

    # Gazetteer lookups read by feature_vector2 through module globals.
    # names.words() needs the 'names' corpus in addition to 'gazetteers'.
    nltk.download('gazetteers')
    nltk.download('names')
    places = set(gazetteers.words())
    people = set(names.words())
    countries = set(gazetteers.words('countries.txt'))
    nationalities = set(gazetteers.words('nationalities.txt'))

    x_train, y_train = create_data(d_train)
    x_val, y_val = create_data(d_validation)
    x_test, y_test = create_data(d_test)
    # All three splits are pooled; the K-fold split below decides what is
    # used for fitting and what for evaluation.
    all_X_train = np.concatenate((x_train, x_val, x_test))
    all_y_train = np.concatenate((y_train, y_val, y_test))

    # K-Fold
    num_fold = 5
    kf = KFold(n_splits=num_fold, random_state=42, shuffle=True)
    indices = np.arange(len(all_X_train))

    predictions = []
    all_models = []

    for i, (train_index, test_index) in enumerate(kf.split(indices)):
        print(f"Fold {i} Train Length: {len(train_index)} Test Length: {len(test_index)}")
        X_train = all_X_train[train_index]
        y_train = all_y_train[train_index]

        X_test = all_X_train[test_index]
        y_test = all_y_train[test_index]

        # Feature standardization (StandardScaler) is currently disabled;
        # the model is fit on the raw feature values.
        model = SVC(random_state=42, verbose=True)
        model.fit(X_train, y_train)

        y_pred_val = model.predict(X_test)

        print("-------" * 6)
        print(classification_report(y_true=y_test, y_pred=y_pred_val))
        print("-------" * 6)

        # Context manager so the file is flushed and closed deterministically.
        with open(f"ner_svm_{str(i)}.pkl", 'wb') as model_file:
            pickle.dump(model, model_file)

        predictions.append((y_test, y_pred_val))
        all_models.append(model)
        # Only the first fold is trained and evaluated; remove this break to
        # run every fold.
        break

    # Average over the folds that actually ran. A hard-coded 5 would divide
    # single-fold scores by five.
    FOLDS = len(predictions)
    labels = sorted(model.classes_)
    evaluate_overall_metrics(predictions, FOLDS)
    evaluate_per_pos_metrics(predictions, labels)
    plot_confusion_matrix(predictions, labels, FOLDS)