File size: 10,575 Bytes
0e1ec7d
1d4525b
dffac30
1d4525b
9e803bc
 
4a76044
9e803bc
 
 
 
 
 
 
 
 
 
dffac30
f9b0725
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c0f6901
 
 
 
 
 
 
 
 
 
 
 
dffac30
f227140
c0f6901
 
 
 
 
 
dffac30
 
c0f6901
 
 
 
 
 
dffac30
c0f6901
 
 
 
 
f9b0725
c0f6901
 
f9b0725
c0f6901
 
 
f9b0725
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c0f6901
 
02f4cb1
 
 
 
f0b48bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
02f4cb1
f0b48bb
 
02f4cb1
dffac30
f227140
dffac30
c0f6901
 
d87fdf8
dffac30
02f4cb1
 
 
0b4c6c4
02f4cb1
 
b9a616c
c0f6901
 
0f6885b
 
 
 
 
 
 
c0f6901
 
0f6885b
 
 
 
2bbc0be
0b4c6c4
 
 
 
2bbc0be
c0f6901
 
 
 
0f6885b
 
f227140
02f4cb1
 
f227140
 
b9a616c
0b4c6c4
35749ea
b9a616c
9e803bc
6767484
9e803bc
 
dffac30
 
02f4cb1
 
 
 
 
 
 
 
0b4c6c4
02f4cb1
 
 
 
 
 
 
 
dffac30
 
02f4cb1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
import gradio as gr
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import pickle
from dotenv import load_dotenv
import os 
import pandas as pd 

# Load environment variables from .env file
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")  # Hugging Face access token; None when the .env/secret is missing

# Name of dataset to save flagged data to
HF_dataset = "peterkros/COFOG-feedback"  # <-- Replace with your dataset repo ID

# Load the HuggingFaceDatasetSaver logger
# NOTE(review): gr.HuggingFaceDatasetSaver belongs to the Gradio 3.x flagging
# API — confirm the pinned Gradio version still provides it.
hf_writer = gr.HuggingFaceDatasetSaver(HF_TOKEN, HF_dataset)

# COFOG hierarchy: maps each Level1 (division) label to the list of Level2
# (group) labels that are valid under it. Used in predict() to restrict the
# Level2 classifier's output to candidates consistent with the Level1 result.
# Keys/values must match the classes known to the two label encoders.
level1_to_level2_mapping = {
    "General public services": [
        "Executive and legislative organs, financial and fiscal affairs, external affairs",
        "Foreign economic aid",
        "General services",
        "Basic research",
        "R&D General public services",
        "General public services n.e.c.",
        "Public debt transactions",
        "Transfers of a general character between different levels of government"
    ],
    "Defence": [
        "Military defence",
        "Civil defence",
        "Foreign military aid",
        "R&D Defence",
        "Defence n.e.c."
    ],
    "Public order and safety": [
        "Police services",
        "Fire-protection services",
        "Law courts",
        "Prisons",
        "R&D Public order and safety",
        "Public order and safety n.e.c."
    ],
    "Economic affairs": [
        "General economic, commercial and labour affairs",
        "Agriculture, forestry, fishing and hunting",
        "Fuel and energy",
        "Mining, manufacturing and construction",
        "Transport",
        "Communication",
        "Other industries",
        "R&D Economic affairs",
        "Economic affairs n.e.c."
    ],
    "Environmental protection": [
        "Waste management",
        "Waste water management",
        "Pollution abatement",
        "Protection of biodiversity and landscape",
        "R&D Environmental protection",
        "Environmental protection n.e.c."
    ],
    "Housing and community amenities": [
        "Housing development",
        "Community development",
        "Water supply",
        "Street lighting",
        "R&D Housing and community amenities",
        "Housing and community amenities n.e.c."
    ],
    "Health": [
        "Medical products, appliances and equipment",
        "Outpatient services",
        "Hospital services",
        "Public health services",
        "R&D Health",
        "Health n.e.c."
    ],
    "Recreation, culture and religion": [
        "Recreational and sporting services",
        "Cultural services",
        "Broadcasting and publishing services",
        "Religious and other community services",
        "R&D Recreation, culture and religion",
        "Recreation, culture and religion n.e.c."
    ],
    "Education": [
        "Pre-primary and primary education",
        "Secondary education",
        "Post-secondary non-tertiary education",
        "Tertiary education",
        "Education not definable by level",
        "Subsidiary services to education",
        "R&D Education",
        "Education n.e.c."
    ],
    "Social protection": [
        "Sickness and disability",
        "Old age",
        "Survivors",
        "Family and children",
        "Unemployment",
        "Housing",
        "Social exclusion n.e.c.",
        "R&D Social protection",
        "Social protection n.e.c."
    ]
}


# Model names for level1 and level2
model_name_level1 = "peterkros/COFOG-bert2"
model_name_level2 = "peterkros/COFOG-bert-level2"

# Load models and tokenizers for both levels (downloaded from the HF Hub at
# startup; this happens once at module import time).
model_level1 = AutoModelForSequenceClassification.from_pretrained(model_name_level1)
tokenizer_level1 = AutoTokenizer.from_pretrained(model_name_level1)

model_level2 = AutoModelForSequenceClassification.from_pretrained(model_name_level2)
tokenizer_level2 = AutoTokenizer.from_pretrained(model_name_level2)


# Load the label encoders mapping class indices <-> COFOG label strings.
# NOTE(review): pickle files must come from a trusted source — pickle.load
# executes arbitrary code on malicious input.
with open('label_encoder_level1.pkl', 'rb') as file:
    label_encoder_level1 = pickle.load(file)

with open('label_encoder_level2.pkl', 'rb') as file:
    label_encoder_level2 = pickle.load(file)


def predict(text):
    """Classify a budget line into COFOG Level1 and Level2 categories.

    Runs the Level1 classifier on the raw text, then feeds the text plus the
    predicted Level1 label into the Level2 classifier, restricting the Level2
    choice to the candidate groups mapped under the predicted Level1 division.

    Args:
        text: Budget line description; must contain at least two words.

    Returns:
        "Level1: <label> - Level2: <label>" on success, or an error message
        string when the input has fewer than two words.
    """
    # Guard: reject inputs too short to classify meaningfully.
    if len(text.split()) < 2:
        return "Input must have at least two words."

    # --- Level1 prediction ---
    inputs_level1 = tokenizer_level1(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs_level1 = model_level1(**inputs_level1)
    probs_level1 = torch.nn.functional.softmax(outputs_level1.logits, dim=-1)
    predicted_class_level1 = torch.argmax(probs_level1, dim=-1).item()
    predicted_label_level1 = label_encoder_level1.inverse_transform([predicted_class_level1])[0]

    # --- Level2 prediction (conditioned on the Level1 label) ---
    combined_input = text + " " + predicted_label_level1
    inputs_level2 = tokenizer_level2(combined_input, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs_level2 = model_level2(**inputs_level2)
    probs_level2 = torch.nn.functional.softmax(outputs_level2.logits, dim=-1)

    # Restrict Level2 to the categories valid under the predicted Level1 label.
    level2_candidates = level1_to_level2_mapping.get(predicted_label_level1, [])
    known_candidates = [c for c in level2_candidates if c in label_encoder_level2.classes_]
    # Encode all valid candidates with a single transform() call instead of
    # one call per candidate (the original transformed each label separately).
    if known_candidates:
        candidate_indices = [int(idx) for idx in label_encoder_level2.transform(known_candidates)]
    else:
        candidate_indices = []

    # Pick the highest-probability candidate, if any survived the filter.
    if candidate_indices:
        filtered_probs = probs_level2[0, candidate_indices]
        highest_prob_index = torch.argmax(filtered_probs).item()
        predicted_class_level2 = candidate_indices[highest_prob_index]
        predicted_label_level2 = label_encoder_level2.inverse_transform([predicted_class_level2])[0]
    else:
        # Fallback when the Level1 label has no known Level2 classes.
        predicted_label_level2 = "n.e.c"

    return f"Level1: {predicted_label_level1} - Level2: {predicted_label_level2}"
    
def classify_csv(file_obj):
    """Classify every row of an uploaded CSV of budget lines.

    Each row is classified with up to 5 preceding rows prepended as context
    (RAG-style), truncated to the Level1 tokenizer's 512-token window.

    Args:
        file_obj: Path or file-like object of a CSV with a 'text' column.

    Returns:
        A DataFrame with columns "Budget Line" and "Prediction", or an error
        message string when the 'text' column is missing.
    """
    # Read the CSV file
    df = pd.read_csv(file_obj)

    # Check if the 'text' column is in the CSV file
    if 'text' not in df.columns:
        return "There is no column named 'text' in the file."

    # Coerce to str so the join below cannot fail on NaN/numeric cells.
    texts = df['text'].astype(str)

    results = []
    for i in range(len(df)):
        # Combine the current line with up to 5 preceding lines for context.
        context_start = max(0, i - 5)
        context = " ".join(texts[context_start:i + 1])

        # Tokenize with truncation so the context fits the model's window.
        inputs = tokenizer_level1(context, truncation=True, max_length=512, return_tensors="pt")

        # Decode WITHOUT special tokens — the original kept [CLS]/[SEP] in the
        # decoded text, polluting the input re-tokenized inside predict().
        truncated_context = tokenizer_level1.decode(inputs['input_ids'][0], skip_special_tokens=True)

        # Make a prediction using the truncated context.
        prediction = predict(truncated_context)
        results.append((df['text'][i], prediction))

    # Convert the results to a DataFrame for display in the Gradio table.
    results_df = pd.DataFrame(results, columns=["Budget Line", "Prediction"])
    return results_df

# Define the markdown text with bullet points
markdown_text = """
- Trained with ~1500 rows of data on bert-base-uncased, English.
- Input one budget line per time with min 2 words.
- Accuracy of the model is ~88%.
"""
markdown_text_file_upload = """
- Trained with ~1500 rows of data on bert-base-uncased, English.
- Upload CSV ONLY and name your column with budget line item as **text**.
- Using RAG (Retrieval-augmented generation) aproach to feed context into classifier using preceding lines of budget.
- Accuracy of the model is ~88%.
"""
html_table = """
  <h2 style="text-align: center;">COFOG Budget AutoClassification</h2>
   <p style="text-align: justify; margin-left: 30px; margin-right: 30px;">
    This classifier was developed utilizing the pre-trained BERT 
    (Bidirectional Encoder Representations from Transformers) model 
    with an uncased configuration, with over 1500 manually 
    labeled dataset comprising budget line items extracted from 
    various budgetary documents. To balance the data, additional data 
    was generated using GPT-4 where categories were not available 
    in budget documents. The model training was executed 
    on a Google Colab environment, specifically utilizing a Tesla T4 GPU.
    The model is designed to predict the primary classification level 
    of the Classification of the Functions of Government (COFOG),
    with the predictions from the first level serving as contextual 
    input for subsequent second-level classification. The project 
    is conducted with an exclusive focus on academic and research 
    objectives.<br>For batch prediction we integrated Retriever-Augmented Generator (RAG) 
    approach. This approach enriches the prediction process 
    by incorporating contextual information from up to 5 preceding 
    lines in the dataset, significantly enhancing the model's 
    ability to understand and classify each entry in the context 
    of related data.<br>Detailed metrics of the training process are as follows: 
    <code>TrainOutput(global_step=395, training_loss=1.1497593360611156, 
    metrics={'train_runtime': 650.0119, 'train_samples_per_second':
      9.638, 'train_steps_per_second': 0.608, 'total_flos': 1648509163714560.0, 
      'train_loss': 1.1497593360611156, 'epoch': 5.0})</code>. 
  </p>
</div>
"""
# First interface for single line input
iface1 = gr.Interface(
    fn=predict, 
    inputs=gr.components.Textbox(lines=1, placeholder="Enter Budget line here...", label="Budget Input"), 
    outputs=gr.components.Label(label="Classification Output"), 
    title="COFOG AutoClassification - Single Line",
    description=markdown_text,
    article=html_table,
    allow_flagging="manual",  # Enables flagging
    # NOTE(review): "Incorect" is misspelled, but these strings are the labels
    # written to the feedback dataset — renaming them changes logged data;
    # confirm downstream consumers before fixing.
    flagging_options=["Incorect Level1", "Incorect Level2"],
    flagging_callback=hf_writer,  # flagged samples are pushed to HF_dataset
    
)


# Second interface (for CSV file upload): batch classification with
# preceding-line context; returns a DataFrame rendered as a table.
iface2 = gr.Interface(
    fn=classify_csv,
    inputs=gr.components.File(label="Upload CSV File"),
    outputs=gr.components.DataFrame(label="Classification Results"),
    description=markdown_text_file_upload,
    article=html_table,
    title="COFOG AutoClassification - Batch Classification"
)

# Present the single-line and batch interfaces as two tabs of one app.
tab_titles = ["Single Prediction", "Batch Prediction"]
tabbed_interface = gr.TabbedInterface([iface1, iface2], tab_titles)

# Launch the Gradio server only when executed as a script.
if __name__ == "__main__":
    tabbed_interface.launch()