awngsz
/

baseline_model

@@ -8,10 +8,13 @@ tags: []
 <!-- Provide a quick summary of what the model is/does. -->
 This is the baseline model for the news source classification project.
-Code to load the model:
 from huggingface_hub import hf_hub_download
 import joblib
 repo_id='awngsz/baseline_model'
 filename='CIS5190_Proj2_AWNGSZ.joblib'
@@ -20,8 +23,90 @@ model=joblib.load(file_path)
 print(model)
-Code to perform inference:
-model.predict('test.csv')
 ## Model Details

 <!-- Provide a quick summary of what the model is/does. -->
 This is the baseline model for the news source classification project.
+Please run the following evaluation pipeline code:
+##### START #####
 from huggingface_hub import hf_hub_download
 import joblib
+# Load the model from Hugging Face
 repo_id='awngsz/baseline_model'
 filename='CIS5190_Proj2_AWNGSZ.joblib'
 print(model)
+#Load test dataset (assuming the name is the same as the one in the Ed post)
+test_df = pd.read_csv(file_path)
+#Copying the naming convention from the sample dataset in the edpost
+X_test = test_df['title']
+y_test = test_df['labels']
+# Load the embedding model from Hugging Face
+############################################# Transformer: DistilBERT #############################################
+from transformers import DistilBertTokenizer, DistilBertModel
+# pytorch related packages
+import torch
+import torchvision
+from torchvision import transforms, utils
+import torch.nn as nn
+import torch.optim as optim
+import torchvision.transforms as transforms
+from PIL import Image
+from skimage import io, transform
+from torchvision.io import read_image
+from torch.utils.data import Dataset, DataLoader
+def get_embeddings(text_all, tokenizer, model, max_len = 128):
+  '''
+  return: embeddings list
+  '''
+  embeddings = []
+  count = 0
+  print('Start embeddings:')
+  for text in text_all:
+    count += 1
+    if count % (len(text_all) // 10) == 0:
+      print(f'{count / len(text_all) * 100:.1f}% done ...')
+    model_input_token = tokenizer(
+                    text,
+                    add_special_tokens = True,
+                    max_length = max_len,
+                    padding = 'max_length',
+                    truncation = True,
+                    return_tensors = 'pt'
+                    )
+    with torch.no_grad():
+      model_output = model(**model_input_token)
+      cls_embedding = model_output.last_hidden_state[:, 0, :]
+      cls_embedding = cls_embedding.squeeze().numpy()
+      embeddings.append(cls_embedding)
+  return embeddings
+#Load the tokenizer and model from Hugging Face
+tokenizer_DBERT = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+transformer_model_DBERT = DistilBertModel.from_pretrained('distilbert-base-uncased')
+#Set the model to evaluation mode
+transformer_model_DBERT.eval()
+#Get the embeddings for the test data
+max_len = max(len(text) for text in X_test)
+#this may take awhile to run
+X_test_embeddings_DBERT = get_embeddings(X_test, tokenizer_DBERT, transformer_model_DBERT, max_len = max_len)
+prediction = model.predict(X_test_embeddings_DBERT)
+#Accuracy
+from sklearn.metrics import accuracy_score
+label_map = {'NBC': 1, 'FoxNews': 0}
+def compute_category_accuracy(y_true, y_pred, label):
+  n_correct = np.sum((y_true == label) & (y_pred == label))
+  n_total = np.sum(y_true == label)
+  cat_accuracy = n_correct / n_total
+  return cat_accuracy
+# Print the overall test accuracy as a percentage, followed by the accuracy
+# restricted to each news source (looked up by name in label_map).
+print(f'Test accuracy: {accuracy_score(y_test, prediction) * 100:.2f}%')
+print(f'Test accuracy for NBC: {compute_category_accuracy(y_test, prediction, label_map["NBC"]) * 100:.2f}%')
+print(f'Test accuracy for FoxNews: {compute_category_accuracy(y_test, prediction, label_map["FoxNews"]) * 100:.2f}%')
+##### END ######
 ## Model Details