Update app.py
app.py CHANGED
@@ -3,33 +3,61 @@ from transformers import AutoModelForSequenceClassification, AutoTokenizer
 import torch
 import pickle
 
-
-
-
-
+
+# Model names for level1 and level2
+model_name_level1 = "peterkros/COFOG-bert2"
+model_name_level2 = "peterkros/COFOG-bert-level2"
+
+# Load models and tokenizers for both levels
+model_level1 = AutoModelForSequenceClassification.from_pretrained(model_name_level1)
+tokenizer_level1 = AutoTokenizer.from_pretrained(model_name_level1)
+
+model_level2 = AutoModelForSequenceClassification.from_pretrained(model_name_level2)
+tokenizer_level2 = AutoTokenizer.from_pretrained(model_name_level2)
+
 
 # Load the label encoder
-with open('
-
+with open('label_encoder_level1.pkl', 'rb') as file:
+    label_encoder_level1 = pickle.load(file)
+
+with open('label_encoder_level2.pkl', 'rb') as file:
+    label_encoder_level2 = pickle.load(file)
+
 
 def predict(text):
-
+    # Check if the input has at least two words
+    if len(text.split()) < 2:
+        return "Input must have at least two words."
+
+    # Predict Level1
+    inputs_level1 = tokenizer_level1(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
     with torch.no_grad():
-
-
-
-
-
+        outputs_level1 = model_level1(**inputs_level1)
+    probs_level1 = torch.nn.functional.softmax(outputs_level1.logits, dim=-1)
+    predicted_class_level1 = torch.argmax(probs_level1, dim=-1).item()
+    predicted_label_level1 = label_encoder_level1.inverse_transform([predicted_class_level1])[0]
+
+    # Predict Level2 (assuming level2 model uses both text and predicted level1 label)
+    combined_input = text + " " + predicted_label_level1
+    inputs_level2 = tokenizer_level2(combined_input, return_tensors="pt", padding=True, truncation=True, max_length=512)
+    with torch.no_grad():
+        outputs_level2 = model_level2(**inputs_level2)
+    probs_level2 = torch.nn.functional.softmax(outputs_level2.logits, dim=-1)
+    predicted_class_level2 = torch.argmax(probs_level2, dim=-1).item()
+    predicted_label_level2 = label_encoder_level2.inverse_transform([predicted_class_level2])[0]
+    combined_prediction = f"Level1: {predicted_label_level1} - Level2: {predicted_label_level2}"
+    return combined_prediction
+
 
 # Define the markdown text with bullet points
 markdown_text = """
-- Trained with ~1500 rows of data on bert-
-- Input one budget line per time.
+- Trained with ~1500 rows of data on bert-base-uncased, English.
+- Input one budget line per time with min 2 words.
 - Accuracy of the model is ~88%.
 """
 html_table = """
-<h2 style="text-align: center;">COFOG Budget
-<p style="text-align: justify; margin-left:
+<h2 style="text-align: center;">COFOG Budget AutoClassification</h2>
+<p style="text-align: justify; margin-left: 30px; margin-right: 30px;">
 This classifier was developed utilizing the pre-trained BERT
 (Bidirectional Encoder Representations from Transformers) model
 with an uncased configuration, with over 1500 manually
@@ -37,18 +65,18 @@ html_table = """
 various budgetary documents. To balance the data, additional data
 was generated using GPT-4 where categories were not available
 in budget documents. The model training was executed
-on a Google Colab environment, specifically utilizing a Tesla T4 GPU.
-
-<code>TrainOutput(global_step=395, training_loss=1.1497593360611156,
-metrics={'train_runtime': 650.0119, 'train_samples_per_second':
-9.638, 'train_steps_per_second': 0.608, 'total_flos': 1648509163714560.0,
-'train_loss': 1.1497593360611156, 'epoch': 5.0})</code>. The model
-is designed to predict the primary classification level
+on a Google Colab environment, specifically utilizing a Tesla T4 GPU.
+The model is designed to predict the primary classification level
 of the Classification of the Functions of Government (COFOG),
 with the predictions from the first level serving as contextual
 input for subsequent second-level classification. The project
 is conducted with an exclusive focus on academic and research
-objectives.
+objectives.
+Detailed metrics of the training process are as follows:
+<code>TrainOutput(global_step=395, training_loss=1.1497593360611156,
+metrics={'train_runtime': 650.0119, 'train_samples_per_second':
+9.638, 'train_steps_per_second': 0.608, 'total_flos': 1648509163714560.0,
+'train_loss': 1.1497593360611156, 'epoch': 5.0})</code>.
 </p>
 <table style="margin-left: auto; margin-right: auto;">
 <tr>
@@ -95,7 +123,7 @@ iface = gr.Interface(
     fn=predict,
     inputs=gr.components.Textbox(lines=1, placeholder="Enter Budget line here...", label="Budget Input"),
     outputs=gr.components.Label(label="Classification Output"),
-    title="COFOG
+    title="COFOG AutoClassification",
     description=markdown_text,
     article=html_table,
     allow_flagging="auto" # Enables flagging
@@ -103,6 +131,4 @@ iface = gr.Interface(
 
 # Run the interface
 if __name__ == "__main__":
-    iface.launch()
-
-
+    iface.launch()
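
The predict() function maps predicted class indices back to COFOG labels with inverse_transform, which matches scikit-learn's LabelEncoder interface. A minimal, hypothetical sketch of how a file such as label_encoder_level1.pkl could be produced is shown below; the label values are illustrative placeholders, not the actual training labels.

import pickle
from sklearn.preprocessing import LabelEncoder

# Illustrative COFOG level-1 categories; the real encoder is fit on the training labels.
level1_labels = ["General public services", "Defence", "Education", "Health"]

encoder = LabelEncoder()
encoder.fit(level1_labels)

# Serialize the fitted encoder so app.py can load it with pickle.load().
with open("label_encoder_level1.pkl", "wb") as f:
    pickle.dump(encoder, f)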
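Assuming the two model repos can be downloaded and both label-encoder pickles sit next to app.py, a hypothetical local smoke test of the updated predict() could look like this; the printed labels are indicative only.

# Importing app builds the Gradio interface but does not launch it (launch() is guarded by __main__).
from app import predict

print(predict("education"))  # single word, fails the check -> "Input must have at least two words."
print(predict("primary school teacher salaries"))  # e.g. "Level1: Education - Level2: ..."

This exercises both the new two-word input validation and the two-stage flow, where the Level1 label is appended to the input text before the Level2 model is applied.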