nei-demo-backup

Sleeping

App Files Files Community

SmitaGautam committed on Nov 3, 2024

Commit

0732d74

verified ·

1 Parent(s): 25b106e

Update train.py

Browse files

Files changed (1) hide show

train.py +322 -233

train.py CHANGED Viewed

@@ -1,233 +1,322 @@
-from datasets import load_dataset
-import numpy as np
-from sklearn.svm import SVC
-from tqdm.notebook import tqdm
-from sklearn.preprocessing import StandardScaler
-from sklearn.metrics import classification_report
-import nltk
-from nltk.corpus import stopwords
-from nltk import word_tokenize
-from nltk import pos_tag
-import pickle
-import time
-from nltk.corpus import names, gazetteers
-from sklearn.model_selection import KFold
-from itertools import chain
-from sklearn.metrics import precision_score, recall_score, fbeta_score, confusion_matrix
-import matplotlib.pyplot as plt
-import seaborn as sns
-nltk.download('stopwords')
-stopwords = stopwords.words('english')
-pos_tags = [ 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS',
-                'NNS', 'NN|SYM', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD',
-                'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB'
-            ]
-def feature_vector(word, prev_word_pos_tag, next_word_pos_tag, current_word_pos_tag):
-    vec = np.zeros(116).astype('float32')
-    if(word.istitle()):
-        vec[0] = 1
-    if word.lower() in stopwords:
-        vec[1] = 1
-    if(word.isupper()):
-        vec[2] = 1
-    vec[3] = len(word)
-    vec[4] = word.isdigit()
-    if prev_word_pos_tag!=-1:
-      vec[5+prev_word_pos_tag] = 1
-    if next_word_pos_tag!=-1:
-      vec[42+next_word_pos_tag] = 1
-    if current_word_pos_tag!=-1:
-      vec[79+current_word_pos_tag] = 1
-    return vec
-def feature_vector2(word, prev_word_pos_tag, next_word_pos_tag, current_word_pos_tag):
-    vec = np.zeros(9).astype('float32')
-    if(word.istitle()):
-        vec[0] = 1
-    if word.lower() in stopwords:
-        vec[1] = 1
-    if(word.isupper()):
-        vec[2] = 1
-    vec[3] = len(word)
-    vec[4] = word.isdigit()
-    # idx : -11, 0...36
-    # if prev_word_pos_tag!=-11:
-    #   vec[5+prev_word_pos_tag] = 1
-    # if next_word_pos_tag!=-11:
-    #   vec[42+next_word_pos_tag] = 1
-    # if current_word_pos_tag!=-11:
-    #   vec[79+current_word_pos_tag] = 1
-    vec[5] = 1 if word in places else 0
-    vec[6] = 1 if word in people else 0
-    vec[7] = 1 if word in countries else 0
-    vec[8] = 1 if word in nationalities else 0
-    return vec
-# This function is used to make dataset with features and target label
-def create_data(data):
-    x_train = []
-    y_train = []
-    for x in data:
-        for y in range(len(x['tokens'])):
-            prev_pos = -1 if y==0 or x['pos_tags'][y-1]<10 else x['pos_tags'][y-1]
-            next_pos = -1 if y==len(x['tokens'])-1 or x['pos_tags'][y+1]<10 else x['pos_tags'][y+1]
-            current_pos = -1 if x['pos_tags'][y]<10 else x['pos_tags'][y]
-            wordVec = feature_vector(x['tokens'][y], prev_pos-10, next_pos-10, current_pos-10)
-            x_train.append(wordVec)
-            y_train.append(1 if x['ner_tags'][y]!=0 else 0)
-    return x_train, y_train
-def evaluate_overall_metrics(predictions, folds):
-    precision, recall, f0_5_score, f1_score, f2_score = 0, 0, 0, 0, 0
-    for i, (test_label_flat, y_pred_flat) in enumerate(predictions):
-        # test_label_flat = list(chain.from_iterable(test_label))
-        # y_pred_flat = list(chain.from_iterable(y_pred))
-        # Calculate scores
-        f0_5_score += fbeta_score(test_label_flat, y_pred_flat, beta=0.5, average='weighted')
-        f1_score += fbeta_score(test_label_flat, y_pred_flat, beta=1, average='weighted')
-        f2_score += fbeta_score(test_label_flat, y_pred_flat, beta=2, average='weighted')
-        precision += precision_score(test_label_flat, y_pred_flat, average='weighted')
-        recall += recall_score(test_label_flat, y_pred_flat, average='weighted')
-    # Averaging across folds
-    f0_5_score /= folds
-    f1_score /= folds
-    f2_score /= folds
-    precision /= folds
-    recall /= folds
-    print(f'Overall Metrics:')
-    print(f'Precision : {precision:.3f}')
-    print(f'Recall : {recall:.3f}')
-    print(f'F0.5 Score : {f0_5_score:.3f}')
-    print(f'F1 Score : {f1_score:.3f}')
-    print(f'F2 Score : {f2_score:.3f}\n')
-def evaluate_per_pos_metrics(predictions, labels):
-    combined_true = []
-    combined_pred = []
-    # Flatten the list of lists structure
-    for test_label, y_pred in predictions:
-        # for sentence_labels, sentence_preds in zip(test_label, y_pred):
-        combined_true.extend(test_label)
-        combined_pred.extend(y_pred)
-    for tag in labels:
-        true_binary = [1 if t == tag else 0 for t in combined_true]
-        pred_binary = [1 if p == tag else 0 for p in combined_pred]
-        # Calculate metrics for the tag
-        precision = precision_score(true_binary, pred_binary, average='binary', zero_division=0)
-        recall = recall_score(true_binary, pred_binary, average='binary', zero_division=0)
-        f1_score = fbeta_score(true_binary, pred_binary, beta=1, average='binary', zero_division=0)
-        print(f"Metrics for {tag}:")
-        print(f'Precision : {precision:.3f}')
-        print(f'Recall : {recall:.3f}')
-        print(f'F1 Score : {f1_score:.3f}\n')
-def plot_confusion_matrix(predictions, labels, folds):
-    matrix = None
-    for i, (test_label_flat, y_pred_flat) in enumerate(predictions):
-        # test_label_flat = list(chain.from_iterable(test_label))
-        # y_pred_flat = list(chain.from_iterable(y_pred))
-        # Compute confusion matrix for this fold
-        cm = confusion_matrix(test_label_flat, y_pred_flat, labels=labels)
-        if i == 0:
-            matrix = cm
-        else:
-            matrix += cm
-    matrix = matrix.astype('float')
-    matrix = matrix / folds
-    matrix = matrix / np.sum(matrix, axis=1, keepdims=True)  # Normalize
-    plt.figure(figsize=(10, 8))
-    sns.heatmap(matrix, annot=True, fmt=".2f", cmap='Blues', xticklabels=labels, yticklabels=labels)
-    plt.xlabel('Predicted')
-    plt.ylabel('Actual')
-    plt.title('Normalized Confusion Matrix for NER')
-    plt.show()
-if __name__ == "__main__":
-    data = load_dataset("conll2003", trust_remote_code=True)
-    d_train = data['train']
-    d_validation = data['validation']
-    d_test = data['test']
-    nltk.download('gazetteers')
-    places=set(gazetteers.words())
-    people=set(names.words())
-    countries=set(gazetteers.words('countries.txt'))
-    nationalities=set(gazetteers.words('nationalities.txt'))
-    x_train, y_train = create_data(d_train)
-    x_val, y_val = create_data(d_validation)
-    x_test, y_test = create_data(d_test)
-    all_X_train = np.concatenate((x_train, x_val, x_test))
-    all_y_train = np.concatenate((y_train, y_val, y_test))
-    #K-Fold
-    num_fold = 5
-    kf = KFold(n_splits=num_fold, random_state=42, shuffle=True)
-    indices = np.arange(len(all_X_train))
-    predictions = []
-    all_models = []
-    for i, (train_index, test_index) in enumerate(kf.split(indices)):
-        print(f"Fold {i} Train Length: {len(train_index)} Test Length: {len(test_index)}")
-        # all_folds.append((train_index, test_index))# Standardize the features such that all features contribute equally to the distance metric computation of the SVM
-        X_train = all_X_train[train_index]
-        y_train = all_y_train[train_index]
-        X_test = all_X_train[test_index]
-        y_test = all_y_train[test_index]
-        # scaler = StandardScaler()
-        # Fit only on the training data (i.e. compute mean and std)
-        # X_train = scaler.fit_transform(X_train)
-        # Use the train data fit values to scale val and test
-        # X_train = scaler.transform(X_train)
-        # X_val   = scaler.transform(X_val)
-        # X_test  = scaler.transform(X_test)
-        model = SVC(random_state = 42, verbose = True)
-        model.fit(X_train, y_train)
-        y_pred_val = model.predict(X_test)
-        print("-------"*6)
-        print(classification_report(y_true=y_test, y_pred=y_pred_val))
-        print("-------"*6)
-        pickle.dump(model, open(f"ner_svm_{str(i)}.pkl", 'wb'))
-        predictions.append((y_test, y_pred_val))
-        all_models.append(model)
-        break
-    FOLDS = 5
-    labels = sorted(model.classes_)
-    evaluate_overall_metrics(predictions, FOLDS)
-    evaluate_per_pos_metrics(predictions, labels)
-    plot_confusion_matrix(predictions, labels, FOLDS)

+from datasets import load_dataset
+import numpy as np
+from sklearn.svm import SVC
+from tqdm.notebook import tqdm
+from sklearn.preprocessing import StandardScaler
+from sklearn.metrics import classification_report
+import nltk
+from nltk.corpus import stopwords
+from nltk import word_tokenize
+from nltk import pos_tag
+import pickle
+import time
+from nltk.corpus import names, gazetteers
+from sklearn.model_selection import KFold
+from itertools import chain
+from sklearn.metrics import precision_score, recall_score, fbeta_score, confusion_matrix
+import matplotlib.pyplot as plt
+import seaborn as sns
+from string import punctuation
+# Fetch the NLTK corpora that the feature extractors below read from.
+nltk.download('stopwords')
+# Rebinds the imported corpus-reader name to the English stop-word collection.
+# Stored as a frozenset because it is membership-tested once per token.
+stopwords = frozenset(stopwords.words('english'))
+# ASCII punctuation characters, also only used for membership tests.
+PUNCT = frozenset(punctuation)
+nltk.download('gazetteers')
+nltk.download('names')
+from nltk.corpus import names, gazetteers
+# Lookup sets for the gazetteer-based binary features.
+places=set(gazetteers.words())
+people=set(names.words())
+countries=set(gazetteers.words('countries.txt'))
+nationalities=set(gazetteers.words('nationalities.txt'))
+# 37 POS tag names; feature_vector_d reserves three 37-wide one-hot ranges
+# (5 + 3 * 37 = 116). The list itself is not referenced elsewhere in this file.
+pos_tags = [ 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS',
+                'NNS', 'NN|SYM', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD',
+                'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB'
+            ]
+def feature_vector(w, scaled_position, pos_tag):
+    """Build the 12-element float32 feature vector for one token.
+
+    Args:
+        w: The token text.
+        scaled_position: Stored unchanged in slot 3 (presumably the token's
+            relative position in its sentence; confirm against the caller).
+        pos_tag: POS tag name compared against the verb and proper-noun tags.
+
+    Returns:
+        A NumPy float32 array with, in order: all-caps flag, token length,
+        punctuation flag, scaled position, stop-word flag, digit flag,
+        verb flag, proper-noun flag, place flag, person-name flag,
+        country flag, nationality flag.
+    """
+    verb_tags = ('VB','VBD','VBG','VBN','VBP','VBZ')
+    proper_noun_tags = ('NNP','NNPS')
+    # Slot order matters: position i of this tuple lands in vec[i].
+    features = (
+        w.isupper(),
+        len(w),
+        w in PUNCT,
+        scaled_position,
+        w.lower() in stopwords,
+        w.isdigit(),
+        pos_tag in verb_tags,
+        pos_tag in proper_noun_tags,
+        w in places,
+        w in people,
+        w in countries,
+        w in nationalities,
+    )
+    vec = np.zeros(12).astype(np.float32)
+    for slot, value in enumerate(features):
+        vec[slot] = value
+    return vec
+def feature_vector_d(word, prev_word_pos_tag, next_word_pos_tag, current_word_pos_tag):
+    """Build the 116-element float32 feature vector for one token.
+
+    Slots 0-4 hold title-case flag, stop-word flag, all-caps flag, token
+    length and digit flag. Slots 5-41, 42-78 and 79-115 are one-hot ranges
+    for the previous, next and current POS tag index respectively.
+
+    Each POS argument is an integer offset into its range; -1 means "no tag"
+    and leaves the range zeroed. Any other value is used as-is, so negative
+    values other than -1 wrap around through NumPy indexing.
+    """
+    vec = np.zeros(116).astype('float32')
+    vec[0] = 1 if word.istitle() else 0
+    vec[1] = 1 if word.lower() in stopwords else 0
+    vec[2] = 1 if word.isupper() else 0
+    vec[3] = len(word)
+    vec[4] = word.isdigit()
+    one_hot_ranges = (
+        (5, prev_word_pos_tag),
+        (42, next_word_pos_tag),
+        (79, current_word_pos_tag),
+    )
+    for base, tag_index in one_hot_ranges:
+        if tag_index != -1:
+            vec[base + tag_index] = 1
+    return vec
+def feature_vector2(word, prev_word_pos_tag, next_word_pos_tag, current_word_pos_tag):
+    """Build a 9-element float32 feature vector for one token.
+
+    The three POS arguments are accepted but not used. Slots, in order:
+    title-case flag, stop-word flag, all-caps flag, token length, digit
+    flag, place flag, person-name flag, country flag, nationality flag.
+    """
+    # Position i of this tuple lands in vec[i].
+    features = (
+        word.istitle(),
+        word.lower() in stopwords,
+        word.isupper(),
+        len(word),
+        word.isdigit(),
+        word in places,
+        word in people,
+        word in countries,
+        word in nationalities,
+    )
+    vec = np.zeros(9).astype('float32')
+    for slot, value in enumerate(features):
+        vec[slot] = value
+    return vec
+# This function is used to make dataset with features and target label
+def create_data(data):
+    """Turn a dataset split into per-token feature vectors and binary labels.
+
+    Args:
+        data: Iterable of records with parallel 'tokens', 'pos_tags' and
+            'ner_tags' sequences.
+
+    Returns:
+        (x_train, y_train): a list of 116-element feature vectors and a list
+        of labels, 1 when the token's NER tag is non-zero and 0 otherwise.
+    """
+    # NOTE(review): feature_vector(w, scaled_position, pos_tag) takes three
+    # arguments; the four-argument call made here matches feature_vector_d.
+    # If the 12-feature extractor was intended, this function must pass the
+    # token position and tag name instead.
+    def one_hot_index(tag_id):
+        # Tag ids below 10 have no one-hot slot (presumably punctuation tags
+        # in the dataset's label set; confirm). Return the -1 sentinel
+        # directly so it is never shifted to -11 and mistaken for an index.
+        return tag_id - 10 if tag_id >= 10 else -1
+    x_train = []
+    y_train = []
+    for x in data:
+        tokens = x['tokens']
+        tag_ids = x['pos_tags']
+        ner_ids = x['ner_tags']
+        last = len(tokens) - 1
+        for y, token in enumerate(tokens):
+            prev_pos = one_hot_index(tag_ids[y-1]) if y > 0 else -1
+            next_pos = one_hot_index(tag_ids[y+1]) if y < last else -1
+            current_pos = one_hot_index(tag_ids[y])
+            x_train.append(feature_vector_d(token, prev_pos, next_pos, current_pos))
+            y_train.append(1 if ner_ids[y] != 0 else 0)
+    return x_train, y_train
+def evaluate_overall_metrics(predictions, folds):
+    """Print weighted precision, recall and F-beta scores averaged over folds.
+
+    Args:
+        predictions: Iterable of (true_labels, predicted_labels) pairs.
+        folds: Divisor applied to each summed metric.
+    """
+    totals = {'f0_5': 0, 'f1': 0, 'f2': 0, 'precision': 0, 'recall': 0}
+    for fold_true, fold_pred in predictions:
+        totals['f0_5'] += fbeta_score(fold_true, fold_pred, beta=0.5, average='weighted')
+        totals['f1'] += fbeta_score(fold_true, fold_pred, beta=1, average='weighted')
+        totals['f2'] += fbeta_score(fold_true, fold_pred, beta=2, average='weighted')
+        totals['precision'] += precision_score(fold_true, fold_pred, average='weighted')
+        totals['recall'] += recall_score(fold_true, fold_pred, average='weighted')
+    # Sums become means by dividing by the caller-supplied fold count.
+    means = {name: total / folds for name, total in totals.items()}
+    print('Overall Metrics:')
+    print(f"Precision : {means['precision']:.3f}")
+    print(f"Recall : {means['recall']:.3f}")
+    print(f"F0.5 Score : {means['f0_5']:.3f}")
+    print(f"F1 Score : {means['f1']:.3f}")
+    print(f"F2 Score : {means['f2']:.3f}\n")
+def evaluate_per_pos_metrics(predictions, labels):
+    """Print one-vs-rest precision, recall and F1 for each label.
+
+    Args:
+        predictions: Iterable of (true_labels, predicted_labels) pairs; all
+            pairs are pooled before scoring.
+        labels: Label values to report on, one block of output per label.
+    """
+    pooled_true = [label for fold_true, _ in predictions for label in fold_true]
+    pooled_pred = [label for _, fold_pred in predictions for label in fold_pred]
+    for tag in labels:
+        # Collapse to a binary problem: this tag versus everything else.
+        true_binary = [1 if value == tag else 0 for value in pooled_true]
+        pred_binary = [1 if value == tag else 0 for value in pooled_pred]
+        binary_kwargs = {'average': 'binary', 'zero_division': 0}
+        precision = precision_score(true_binary, pred_binary, **binary_kwargs)
+        recall = recall_score(true_binary, pred_binary, **binary_kwargs)
+        f1_score = fbeta_score(true_binary, pred_binary, beta=1, **binary_kwargs)
+        print(f"Metrics for {tag}:")
+        print(f'Precision : {precision:.3f}')
+        print(f'Recall : {recall:.3f}')
+        print(f'F1 Score : {f1_score:.3f}\n')
+def plot_confusion_matrix(predictions, labels, folds):
+    """Show a row-normalized confusion-matrix heatmap summed over all folds.
+
+    Args:
+        predictions: Iterable of (true_labels, predicted_labels) pairs.
+        labels: Label order for the matrix rows and columns.
+        folds: Divisor applied to the summed matrix before row normalization.
+    """
+    matrix = None
+    for fold_true, fold_pred in predictions:
+        fold_matrix = confusion_matrix(fold_true, fold_pred, labels=labels)
+        matrix = fold_matrix if matrix is None else matrix + fold_matrix
+    matrix = matrix.astype('float') / folds
+    # Each row is divided by its own total, so cells are per-class fractions.
+    row_totals = np.sum(matrix, axis=1, keepdims=True)
+    matrix = matrix / row_totals
+    plt.figure(figsize=(10, 8))
+    sns.heatmap(matrix, annot=True, fmt=".2f", cmap='Blues', xticklabels=labels, yticklabels=labels)
+    plt.xlabel('Predicted')
+    plt.ylabel('Actual')
+    plt.title('Normalized Confusion Matrix for NER')
+    plt.show()
+if __name__ == "__main__":
+    # Load the three CoNLL-2003 splits.
+    data = load_dataset("conll2003", trust_remote_code=True)
+    d_train = data['train']
+    d_validation = data['validation']
+    d_test = data['test']
+    # The gazetteer/name lookup sets are already built at module level, so
+    # they are not downloaded and rebuilt a second time here.
+    x_train, y_train = create_data(d_train)
+    x_val, y_val = create_data(d_validation)
+    x_test, y_test = create_data(d_test)
+    # All three splits are pooled and re-split by K-fold below.
+    all_X_train = np.concatenate((x_train, x_val, x_test))
+    all_y_train = np.concatenate((y_train, y_val, y_test))
+    #K-Fold
+    num_fold = 5
+    kf = KFold(n_splits=num_fold, random_state=42, shuffle=True)
+    indices = np.arange(len(all_X_train))
+    predictions = []
+    all_models = []
+    for i, (train_index, test_index) in enumerate(kf.split(indices)):
+        print(f"Fold {i} Train Length: {len(train_index)} Test Length: {len(test_index)}")
+        X_train = all_X_train[train_index]
+        y_train = all_y_train[train_index]
+        X_test = all_X_train[test_index]
+        y_test = all_y_train[test_index]
+        # Features go to the SVM unscaled; StandardScaler is imported but not applied.
+        model = SVC(random_state = 42, verbose = True)
+        model.fit(X_train, y_train)
+        y_pred_val = model.predict(X_test)
+        print("-------"*6)
+        print(classification_report(y_true=y_test, y_pred=y_pred_val))
+        print("-------"*6)
+        # Context manager so the pickle file is flushed and closed promptly.
+        with open(f"ner_svm_{i}.pkl", 'wb') as model_file:
+            pickle.dump(model, model_file)
+        predictions.append((y_test, y_pred_val))
+        all_models.append(model)
+        # Only the first fold is trained; delete this break to run all folds.
+        break
+    # Average over the folds that actually ran. Dividing by num_fold while the
+    # loop stops after one fold would report every metric at a fifth of its value.
+    FOLDS = len(predictions)
+    labels = sorted(model.classes_)
+    evaluate_overall_metrics(predictions, FOLDS)
+    evaluate_per_pos_metrics(predictions, labels)
+    plot_confusion_matrix(predictions, labels, FOLDS)