zao1234 commited on
Commit
ab07d1b
·
verified ·
1 Parent(s): d03b8e2

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +95 -0
README.md CHANGED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # News Classifier -- The Evaluation Pipeline
2
+ ## Colab notebook: https://colab.research.google.com/drive/1OmIHVN0joIgjGgYCdqLu2EO2By4yT5Xd#scrollTo=MsmKRoHuHyIp
3
+ ## Ziao You, Samuel Vara, Surya Sandeep Akella
4
+ ----------------------
5
+
6
+ ## The code here is the same as in the Colab notebook. It shows how to call our model to evaluate the test set. Please use the Colab link for easier usage.
7
+ ##
8
+
9
+ ----------------------
10
+
11
+ ### pip install package
12
+ ```
13
+ !pip install datasets > delete.txt
14
+ ```
15
+
16
+ ### !!! Load Test Set -- Change the file path of the test set
17
+ ```
18
+ import pandas as pd
19
+ df_test = pd.read_csv('/content/test_data.csv',index_col="Unnamed: 0")
20
+ df_test.head()
21
+ ```
22
+
23
+ ### Load Model from Hugging Face Hub (Don't change)
24
+ ```
25
+ from huggingface_hub import snapshot_download
26
+ import keras
27
+
28
+ # Download model from hugging face
29
+ local_path = snapshot_download(repo_id="HermesPenn/athena_model")
30
+
31
+ # Load model from local
32
+ model = keras.saving.load_model(local_path)
33
+ ```
34
+ ### Load Training set (Don't change)
35
+ ```
36
+ from datasets import load_dataset
37
+
38
+ dataset = load_dataset("HermesPenn/athena_data")
39
+ dataset = dataset['train']
40
+ data = dataset.to_pandas()
41
+ data.head()
42
+ ```
43
+
44
+ ### Fit_transform label_encoder and tokenizer (Don't change)
45
+ ```
46
+ from sklearn.preprocessing import LabelEncoder
47
+ from tensorflow.keras.preprocessing.sequence import pad_sequences
48
+ from tensorflow.keras.preprocessing.text import Tokenizer
49
+ # Data preprocessing
50
+ le = LabelEncoder()
51
+ data['label'] = le.fit_transform(data['source'])
52
+ X = data['title']
53
+ y = data['label']
54
+
55
+ # Tokenize and pad text data
56
+ tokenizer = Tokenizer(num_words=20000, oov_token="<OOV>")
57
+ tokenizer.fit_on_texts(X)
58
+ X_seq = tokenizer.texts_to_sequences(X)
59
+ X_padded = pad_sequences(X_seq, maxlen=200, padding='post', truncating='post')
60
+ ```
61
+
62
+ ### Test set Evaluation (Don't change)
63
+
64
+ ```
65
+ from sklearn.metrics import classification_report
66
+
67
+ X_test = df_test['title']
68
+ y_test = df_test['label']
69
+
70
+
71
+ X_test_seq = tokenizer.texts_to_sequences(X_test)
72
+ X_test_padded = pad_sequences(X_test_seq, maxlen=200, padding='post', truncating='post')
73
+
74
+ # Predict the labels using the model
75
+ y_pred_probs = model.predict(X_test_padded)
76
+ y_pred = (y_pred_probs > 0.5).astype(int)
77
+
78
+ # Evaluate the model
79
+ print("Classification Report:")
80
+ print(classification_report(y_test, y_pred))
81
+
82
+
83
+ try:
84
+ news_outlets = le.inverse_transform(y_pred.flatten()) # le must be pre-fitted
85
+ df_test['Predicted News Outlet'] = news_outlets
86
+ except NameError:
87
+ df_test['Predicted News Outlet'] = y_pred.flatten()
88
+ ```
89
+
90
+ ```
91
+ # Display test set with predictions
92
+ print("\nTest Set with Predictions:")
93
+
94
+ df_test[['title', 'News Outlet', 'Predicted News Outlet']]
95
+ ```