peterkros committed on
Commit c0f6901 · 1 Parent(s): 8188dcd

Update app.py

Files changed (1)
app.py +54 -28
app.py CHANGED
@@ -3,33 +3,61 @@ from transformers import AutoModelForSequenceClassification, AutoTokenizer
  import torch
  import pickle

- # Load the model and tokenizer from Hugging Face Hub
- model_name = "peterkros/COFOG-bert2"
- model = AutoModelForSequenceClassification.from_pretrained(model_name)
- tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+ # Model names for level1 and level2
+ model_name_level1 = "peterkros/COFOG-bert2"
+ model_name_level2 = "peterkros/COFOG-bert-level2"
+
+ # Load models and tokenizers for both levels
+ model_level1 = AutoModelForSequenceClassification.from_pretrained(model_name_level1)
+ tokenizer_level1 = AutoTokenizer.from_pretrained(model_name_level1)
+
+ model_level2 = AutoModelForSequenceClassification.from_pretrained(model_name_level2)
+ tokenizer_level2 = AutoTokenizer.from_pretrained(model_name_level2)
+

  # Load the label encoder
- with open('label_encoder.pkl', 'rb') as file:
-     label_encoder = pickle.load(file)
+ with open('label_encoder_level1.pkl', 'rb') as file:
+     label_encoder_level1 = pickle.load(file)
+
+ with open('label_encoder_level2.pkl', 'rb') as file:
+     label_encoder_level2 = pickle.load(file)
+

  def predict(text):
-     inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
+     # Check if the input has at least two words
+     if len(text.split()) < 2:
+         return "Input must have at least two words."
+
+     # Predict Level1
+     inputs_level1 = tokenizer_level1(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
      with torch.no_grad():
-         outputs = model(**inputs)
-     probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
-     predicted_class = torch.argmax(probs, dim=-1).item()
-     predicted_label = label_encoder.inverse_transform([predicted_class])[0]
-     return predicted_label
+         outputs_level1 = model_level1(**inputs_level1)
+     probs_level1 = torch.nn.functional.softmax(outputs_level1.logits, dim=-1)
+     predicted_class_level1 = torch.argmax(probs_level1, dim=-1).item()
+     predicted_label_level1 = label_encoder_level1.inverse_transform([predicted_class_level1])[0]
+
+     # Predict Level2 (assuming level2 model uses both text and predicted level1 label)
+     combined_input = text + " " + predicted_label_level1
+     inputs_level2 = tokenizer_level2(combined_input, return_tensors="pt", padding=True, truncation=True, max_length=512)
+     with torch.no_grad():
+         outputs_level2 = model_level2(**inputs_level2)
+     probs_level2 = torch.nn.functional.softmax(outputs_level2.logits, dim=-1)
+     predicted_class_level2 = torch.argmax(probs_level2, dim=-1).item()
+     predicted_label_level2 = label_encoder_level2.inverse_transform([predicted_class_level2])[0]
+     combined_prediction = f"Level1: {predicted_label_level1} - Level2: {predicted_label_level2}"
+     return combined_prediction
+

  # Define the markdown text with bullet points
  markdown_text = """
- - Trained with ~1500 rows of data on bert-large-uncased, English.
- - Input one budget line per time.
+ - Trained with ~1500 rows of data on bert-base-uncased, English.
+ - Input one budget line per time with min 2 words.
  - Accuracy of the model is ~88%.
  """
  html_table = """
- <h2 style="text-align: center;">COFOG Budget Classification</h2>
- <p style="text-align: justify; margin-left: 20px; margin-right: 20px;">
+ <h2 style="text-align: center;">COFOG Budget AutoClassification</h2>
+ <p style="text-align: justify; margin-left: 30px; margin-right: 30px;">
  This classifier was developed utilizing the pre-trained BERT
  (Bidirectional Encoder Representations from Transformers) model
  with an uncased configuration, with over 1500 manually
@@ -37,18 +65,18 @@ html_table = """
  various budgetary documents. To balance the data, additional data
  was generated using GPT-4 where categories were not available
  in budget documents. The model training was executed
- on a Google Colab environment, specifically utilizing a Tesla T4 GPU.
- Detailed metrics of the training process are as follows:
- <code>TrainOutput(global_step=395, training_loss=1.1497593360611156,
- metrics={'train_runtime': 650.0119, 'train_samples_per_second':
- 9.638, 'train_steps_per_second': 0.608, 'total_flos': 1648509163714560.0,
- 'train_loss': 1.1497593360611156, 'epoch': 5.0})</code>. The model
- is designed to predict the primary classification level
+ on a Google Colab environment, specifically utilizing a Tesla T4 GPU.
+ The model is designed to predict the primary classification level
  of the Classification of the Functions of Government (COFOG),
  with the predictions from the first level serving as contextual
  input for subsequent second-level classification. The project
  is conducted with an exclusive focus on academic and research
- objectives.
+ objectives.
+ Detailed metrics of the training process are as follows:
+ <code>TrainOutput(global_step=395, training_loss=1.1497593360611156,
+ metrics={'train_runtime': 650.0119, 'train_samples_per_second':
+ 9.638, 'train_steps_per_second': 0.608, 'total_flos': 1648509163714560.0,
+ 'train_loss': 1.1497593360611156, 'epoch': 5.0})</code>.
  </p>
  <table style="margin-left: auto; margin-right: auto;">
  <tr>
@@ -95,7 +123,7 @@ iface = gr.Interface(
      fn=predict,
      inputs=gr.components.Textbox(lines=1, placeholder="Enter Budget line here...", label="Budget Input"),
      outputs=gr.components.Label(label="Classification Output"),
-     title="COFOG Level 1 Classification",
+     title="COFOG AutoClassification",
      description=markdown_text,
      article=html_table,
      allow_flagging="auto"  # Enables flagging
@@ -103,6 +131,4 @@

  # Run the interface
  if __name__ == "__main__":
-     iface.launch()
-
-
+     iface.launch()
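The updated app.py loads label_encoder_level1.pkl and label_encoder_level2.pkl from the Space's working directory; those pickle files are not part of this commit. Below is a minimal sketch of how such files could be produced, assuming they are scikit-learn LabelEncoder objects fitted on the level-1 and level-2 training labels. The file names match what app.py expects; the COFOG label strings are illustrative placeholders, not the model's actual classes.

```python
# Sketch only: create the pickled label encoders that app.py loads.
# Assumes scikit-learn LabelEncoder; the label lists are hypothetical placeholders.
import pickle
from sklearn.preprocessing import LabelEncoder

level1_labels = ["General public services", "Economic affairs", "Health"]   # hypothetical
level2_labels = ["Transport", "Hospital services", "Executive organs"]      # hypothetical

for path, labels in [("label_encoder_level1.pkl", level1_labels),
                     ("label_encoder_level2.pkl", level2_labels)]:
    encoder = LabelEncoder()
    encoder.fit(labels)  # assigns an integer class id to each label string
    with open(path, "wb") as f:
        pickle.dump(encoder, f)
```

With encoders saved this way, the inverse_transform calls in predict() map the argmax class index back to its label string, and the level-1 string is what gets appended to the input text before the level-2 pass.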