import gradio as gr
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import pickle
from dotenv import load_dotenv
import os
import pandas as pd
# Load environment variables from .env file
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")
# Name of dataset to save flagged data to
HF_dataset = "peterkros/COFOG-feedback" # <-- Replace with your dataset repo ID
# Load the HuggingFaceDatasetSaver logger
hf_writer = gr.HuggingFaceDatasetSaver(HF_TOKEN, HF_dataset)
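# Flagged examples (input, output, and the selected flag reason) are appended
# by this callback to the dataset repo named above.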
level1_to_level2_mapping = {
    "General public services": [
        "Executive and legislative organs, financial and fiscal affairs, external affairs",
        "Foreign economic aid",
        "General services",
        "Basic research",
        "R&D General public services",
        "General public services n.e.c.",
        "Public debt transactions",
        "Transfers of a general character between different levels of government"
    ],
    "Defence": [
        "Military defence",
        "Civil defence",
        "Foreign military aid",
        "R&D Defence",
        "Defence n.e.c."
    ],
    "Public order and safety": [
        "Police services",
        "Fire-protection services",
        "Law courts",
        "Prisons",
        "R&D Public order and safety",
        "Public order and safety n.e.c."
    ],
    "Economic affairs": [
        "General economic, commercial and labour affairs",
        "Agriculture, forestry, fishing and hunting",
        "Fuel and energy",
        "Mining, manufacturing and construction",
        "Transport",
        "Communication",
        "Other industries",
        "R&D Economic affairs",
        "Economic affairs n.e.c."
    ],
    "Environmental protection": [
        "Waste management",
        "Waste water management",
        "Pollution abatement",
        "Protection of biodiversity and landscape",
        "R&D Environmental protection",
        "Environmental protection n.e.c."
    ],
    "Housing and community amenities": [
        "Housing development",
        "Community development",
        "Water supply",
        "Street lighting",
        "R&D Housing and community amenities",
        "Housing and community amenities n.e.c."
    ],
    "Health": [
        "Medical products, appliances and equipment",
        "Outpatient services",
        "Hospital services",
        "Public health services",
        "R&D Health",
        "Health n.e.c."
    ],
    "Recreation, culture and religion": [
        "Recreational and sporting services",
        "Cultural services",
        "Broadcasting and publishing services",
        "Religious and other community services",
        "R&D Recreation, culture and religion",
        "Recreation, culture and religion n.e.c."
    ],
    "Education": [
        "Pre-primary and primary education",
        "Secondary education",
        "Post-secondary non-tertiary education",
        "Tertiary education",
        "Education not definable by level",
        "Subsidiary services to education",
        "R&D Education",
        "Education n.e.c."
    ],
    "Social protection": [
        "Sickness and disability",
        "Old age",
        "Survivors",
        "Family and children",
        "Unemployment",
        "Housing",
        "Social exclusion n.e.c.",
        "R&D Social protection",
        "Social protection n.e.c."
    ]
}
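# The mapping above constrains level-2 decoding in predict(): only the
# children of the predicted level-1 class are scored, e.g. (illustrative):
#   level1_to_level2_mapping["Defence"]
#   -> ["Military defence", "Civil defence", "Foreign military aid", ...]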
# Model names for level1 and level2
model_name_level1 = "peterkros/COFOG-bert2"
model_name_level2 = "peterkros/COFOG-bert-level2"
# Load models and tokenizers for both levels
model_level1 = AutoModelForSequenceClassification.from_pretrained(model_name_level1)
tokenizer_level1 = AutoTokenizer.from_pretrained(model_name_level1)
model_level2 = AutoModelForSequenceClassification.from_pretrained(model_name_level2)
tokenizer_level2 = AutoTokenizer.from_pretrained(model_name_level2)
# Load the label encoder
with open('label_encoder_level1.pkl', 'rb') as file:
    label_encoder_level1 = pickle.load(file)
with open('label_encoder_level2.pkl', 'rb') as file:
    label_encoder_level2 = pickle.load(file)
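# Note: the pickled encoders are assumed to be fitted
# sklearn.preprocessing.LabelEncoder instances aligned with each model's
# output head: transform() maps a label to its class index, and
# inverse_transform() maps an index back to its label.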
def predict(text):
    # Check if the input has at least two words
    if len(text.split()) < 2:
        return "Input must have at least two words."

    # Predict Level1
    inputs_level1 = tokenizer_level1(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs_level1 = model_level1(**inputs_level1)
    probs_level1 = torch.nn.functional.softmax(outputs_level1.logits, dim=-1)
    predicted_class_level1 = torch.argmax(probs_level1, dim=-1).item()
    predicted_label_level1 = label_encoder_level1.inverse_transform([predicted_class_level1])[0]

    # Predict Level2 (assuming the level2 model was trained on text plus the predicted level1 label)
    combined_input = text + " " + predicted_label_level1
    inputs_level2 = tokenizer_level2(combined_input, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs_level2 = model_level2(**inputs_level2)
    probs_level2 = torch.nn.functional.softmax(outputs_level2.logits, dim=-1)

    # Extract the probabilities for the candidate level2 categories
    level2_candidates = level1_to_level2_mapping.get(predicted_label_level1, [])
    candidate_indices = [label_encoder_level2.transform([candidate])[0] for candidate in level2_candidates if candidate in label_encoder_level2.classes_]

    # Filter the probabilities
    filtered_probs = probs_level2[0, candidate_indices]

    # Get the highest probability label from the filtered list
    if len(filtered_probs) > 0:
        highest_prob_index = torch.argmax(filtered_probs).item()
        predicted_class_level2 = candidate_indices[highest_prob_index]
        predicted_label_level2 = label_encoder_level2.inverse_transform([predicted_class_level2])[0]
    else:
        predicted_label_level2 = "n.e.c."

    combined_prediction = f"Level1: {predicted_label_level1} - Level2: {predicted_label_level2}"
    return combined_prediction
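
# Hypothetical usage (the actual output depends on the trained weights):
#   predict("teacher salaries for primary schools")
#   -> "Level1: Education - Level2: Pre-primary and primary education"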
def classify_csv(file_obj):
    # Read the CSV file
    df = pd.read_csv(file_obj)

    # Check if the 'text' column is in the CSV file
    if 'text' not in df.columns:
        return "There is no column named 'text' in the file."

    # Process the file if the 'text' column exists
    results = []
    for i in range(len(df)):
        # Combine the current line with the 5 preceding lines for context
        context_start = max(0, i - 5)
        context = " ".join(df['text'][context_start:i+1])

        # Truncate the context to fit within the model's max length
        inputs = tokenizer_level1(context, truncation=True, max_length=512, return_tensors="pt")

        # Extract the truncated text for prediction (dropping [CLS]/[SEP] markers)
        truncated_context = tokenizer_level1.decode(inputs['input_ids'][0], skip_special_tokens=True)

        # Make a prediction using the truncated context
        prediction = predict(truncated_context)
        results.append((df['text'][i], prediction))

    # Convert the results to a DataFrame with columns 'Budget Line' and 'Prediction'
    results_df = pd.DataFrame(results, columns=["Budget Line", "Prediction"])
    return results_df
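
# Expected upload format: a CSV with a column named "text", one budget line
# per row, e.g. (illustrative):
#   text
#   Teacher salaries for primary schools
#   Maintenance of rural roads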
# Define the markdown text with bullet points
markdown_text = """
- Trained on ~1500 rows of data with bert-base-uncased, English.
- Enter one budget line at a time, with a minimum of 2 words.
- Accuracy of the model is ~88%.
"""

markdown_text_file_upload = """
- Trained on ~1500 rows of data with bert-base-uncased, English.
- Upload a CSV ONLY, and name the column holding the budget line items **text**.
- Uses a RAG (Retrieval-Augmented Generation) style approach, feeding the preceding budget lines into the classifier as context.
- Accuracy of the model is ~88%.
"""
html_table = """
<h2 style="text-align: center;">COFOG Budget AutoClassification</h2>
<p style="text-align: justify; margin-left: 30px; margin-right: 30px;">
This classifier was developed utilizing the pre-trained BERT
(Bidirectional Encoder Representations from Transformers) model
with an uncased configuration, with over 1500 manually
labeled dataset comprising budget line items extracted from
various budgetary documents. To balance the data, additional data
was generated using GPT-4 where categories were not available
in budget documents. The model training was executed
on a Google Colab environment, specifically utilizing a Tesla T4 GPU.
The model is designed to predict the primary classification level
of the Classification of the Functions of Government (COFOG),
with the predictions from the first level serving as contextual
input for subsequent second-level classification. The project
is conducted with an exclusive focus on academic and research
objectives.<br>For batch prediction we integrated Retriever-Augmented Generator (RAG)
approach. This approach enriches the prediction process
by incorporating contextual information from up to 5 preceding
lines in the dataset, significantly enhancing the model's
ability to understand and classify each entry in the context
of related data.<br>Detailed metrics of the training process are as follows:
<code>TrainOutput(global_step=395, training_loss=1.1497593360611156,
metrics={'train_runtime': 650.0119, 'train_samples_per_second':
9.638, 'train_steps_per_second': 0.608, 'total_flos': 1648509163714560.0,
'train_loss': 1.1497593360611156, 'epoch': 5.0})</code>.
</p>
</div>
"""
# First interface for single line input
iface1 = gr.Interface(
    fn=predict,
    inputs=gr.components.Textbox(lines=1, placeholder="Enter Budget line here...", label="Budget Input"),
    outputs=gr.components.Label(label="Classification Output"),
    title="COFOG AutoClassification - Single Line",
    description=markdown_text,
    article=html_table,
    allow_flagging="manual",  # Enables flagging
    flagging_options=["Incorrect Level1", "Incorrect Level2"],
    flagging_callback=hf_writer,
)
# Second interface (for CSV file upload)
iface2 = gr.Interface(
    fn=classify_csv,
    inputs=gr.components.File(label="Upload CSV File"),
    outputs=gr.components.DataFrame(label="Classification Results"),
    description=markdown_text_file_upload,
    article=html_table,
    title="COFOG AutoClassification - Batch Classification"
)
# Combine the interfaces in a tabbed interface
tabbed_interface = gr.TabbedInterface(
    [iface1, iface2],
    ["Single Prediction", "Batch Prediction"]
)
# Run the interface
if __name__ == "__main__":
    tabbed_interface.launch()