import gradio as gr
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import pickle
from dotenv import load_dotenv
import os
import pandas as pd

# Load environment variables from .env file
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")
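
# Optional guard (a minimal sketch; assumes HF_TOKEN is provided as a Space secret
# or in a local .env file). The flagging callback below cannot push data without it.
if not HF_TOKEN:
    print("Warning: HF_TOKEN is not set; flagged examples will not be saved to the Hub.")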

# Name of dataset to save flagged data to
HF_dataset = "peterkros/COFOG-feedback"  # <-- Replace with your dataset repo ID

# Load the HuggingFaceDatasetSaver logger
hf_writer = gr.HuggingFaceDatasetSaver(HF_TOKEN, HF_dataset)

level1_to_level2_mapping = {
    "General public services": [
        "Executive and legislative organs, financial and fiscal affairs, external affairs",
        "Foreign economic aid",
        "General services",
        "Basic research",
        "R&D General public services",
        "General public services n.e.c.",
        "Public debt transactions",
        "Transfers of a general character between different levels of government"
    ],
    "Defence": [
        "Military defence",
        "Civil defence",
        "Foreign military aid",
        "R&D Defence",
        "Defence n.e.c."
    ],
    "Public order and safety": [
        "Police services",
        "Fire-protection services",
        "Law courts",
        "Prisons",
        "R&D Public order and safety",
        "Public order and safety n.e.c."
    ],
    "Economic affairs": [
        "General economic, commercial and labour affairs",
        "Agriculture, forestry, fishing and hunting",
        "Fuel and energy",
        "Mining, manufacturing and construction",
        "Transport",
        "Communication",
        "Other industries",
        "R&D Economic affairs",
        "Economic affairs n.e.c."
    ],
    "Environmental protection": [
        "Waste management",
        "Waste water management",
        "Pollution abatement",
        "Protection of biodiversity and landscape",
        "R&D Environmental protection",
        "Environmental protection n.e.c."
    ],
    "Housing and community amenities": [
        "Housing development",
        "Community development",
        "Water supply",
        "Street lighting",
        "R&D Housing and community amenities",
        "Housing and community amenities n.e.c."
    ],
    "Health": [
        "Medical products, appliances and equipment",
        "Outpatient services",
        "Hospital services",
        "Public health services",
        "R&D Health",
        "Health n.e.c."
    ],
    "Recreation, culture and religion": [
        "Recreational and sporting services",
        "Cultural services",
        "Broadcasting and publishing services",
        "Religious and other community services",
        "R&D Recreation, culture and religion",
        "Recreation, culture and religion n.e.c."
    ],
    "Education": [
        "Pre-primary and primary education",
        "Secondary education",
        "Post-secondary non-tertiary education",
        "Tertiary education",
        "Education not definable by level",
        "Subsidiary services to education",
        "R&D Education",
        "Education n.e.c."
    ],
    "Social protection": [
        "Sickness and disability",
        "Old age",
        "Survivors",
        "Family and children",
        "Unemployment",
        "Housing",
        "Social exclusion n.e.c.",
        "R&D Social protection",
        "Social protection n.e.c."
    ]
}

# Model names for level1 and level2
model_name_level1 = "peterkros/COFOG-bert2"
model_name_level2 = "peterkros/COFOG-bert-level2"

# Load models and tokenizers for both levels
model_level1 = AutoModelForSequenceClassification.from_pretrained(model_name_level1)
tokenizer_level1 = AutoTokenizer.from_pretrained(model_name_level1)
model_level2 = AutoModelForSequenceClassification.from_pretrained(model_name_level2)
tokenizer_level2 = AutoTokenizer.from_pretrained(model_name_level2)

# Load the label encoders for both levels
with open('label_encoder_level1.pkl', 'rb') as file:
    label_encoder_level1 = pickle.load(file)
with open('label_encoder_level2.pkl', 'rb') as file:
    label_encoder_level2 = pickle.load(file)
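
# Optional sanity check (a sketch; assumes the pickles are sklearn LabelEncoder objects,
# consistent with the .transform()/.inverse_transform()/.classes_ usage below): warn if
# any level-1 category in the mapping above is unknown to the level-1 encoder.
missing_level1 = set(level1_to_level2_mapping) - set(label_encoder_level1.classes_)
if missing_level1:
    print(f"Warning: level-1 labels missing from the encoder: {missing_level1}")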


def predict(text):
    """Classify a single budget line into COFOG Level1 and Level2 categories."""
    # Check if the input has at least two words
    if len(text.split()) < 2:
        return "Input must have at least two words."

    # Predict Level1
    inputs_level1 = tokenizer_level1(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs_level1 = model_level1(**inputs_level1)
    probs_level1 = torch.nn.functional.softmax(outputs_level1.logits, dim=-1)
    predicted_class_level1 = torch.argmax(probs_level1, dim=-1).item()
    predicted_label_level1 = label_encoder_level1.inverse_transform([predicted_class_level1])[0]

    # Predict Level2 (assuming the level2 model uses both the text and the predicted level1 label)
    combined_input = text + " " + predicted_label_level1
    inputs_level2 = tokenizer_level2(combined_input, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs_level2 = model_level2(**inputs_level2)
    probs_level2 = torch.nn.functional.softmax(outputs_level2.logits, dim=-1)

    # Restrict the Level2 prediction to the candidates allowed under the predicted Level1 category
    level2_candidates = level1_to_level2_mapping.get(predicted_label_level1, [])
    candidate_indices = [label_encoder_level2.transform([candidate])[0] for candidate in level2_candidates if candidate in label_encoder_level2.classes_]

    # Filter the probabilities
    filtered_probs = probs_level2[0, candidate_indices]

    # Get the highest-probability label from the filtered list
    if len(filtered_probs) > 0:
        highest_prob_index = torch.argmax(filtered_probs).item()
        predicted_class_level2 = candidate_indices[highest_prob_index]
        predicted_label_level2 = label_encoder_level2.inverse_transform([predicted_class_level2])[0]
    else:
        predicted_label_level2 = "n.e.c"

    combined_prediction = f"Level1: {predicted_label_level1} - Level2: {predicted_label_level2}"
    return combined_prediction
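
# Example usage (illustrative sketch only; the sample line below is made up and the
# exact labels returned depend on the trained models):
#   predict("Salaries of primary school teachers")
#   -> "Level1: Education - Level2: Pre-primary and primary education"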


def classify_csv(file_obj):
    """Classify every row of an uploaded CSV, using preceding rows as context."""
    # Read the CSV file
    df = pd.read_csv(file_obj)

    # Check if the 'text' column is in the CSV file
    if 'text' not in df.columns:
        return "There is no column named 'text' in the file."

    # Process the file if the 'text' column exists
    results = []
    for i in range(len(df)):
        # Combine the current line with up to 5 preceding lines for context
        # (cast to str so non-string cells do not break the join)
        context_start = max(0, i - 5)
        context = " ".join(df['text'][context_start:i + 1].astype(str))

        # Truncate the context to fit within the model's max length
        inputs = tokenizer_level1(context, truncation=True, max_length=512, return_tensors="pt")

        # Decode the truncated text for prediction, dropping the [CLS]/[SEP] special tokens
        truncated_context = tokenizer_level1.decode(inputs['input_ids'][0], skip_special_tokens=True)

        # Make a prediction using the truncated context
        prediction = predict(truncated_context)
        results.append((df['text'][i], prediction))

    # Convert the results to a DataFrame with columns 'Budget Line' and 'Prediction'
    results_df = pd.DataFrame(results, columns=["Budget Line", "Prediction"])
    return results_df
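
# Expected CSV layout (illustrative example; only a 'text' column is required and
# any other columns are ignored):
#
#   text
#   Salaries of primary school teachers
#   Purchase of medical equipment for district hospitals
#   Road maintenance and rehabilitation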

# Define the markdown text with bullet points
markdown_text = """
- Trained on ~1,500 rows of English data with bert-base-uncased.
- Enter one budget line at a time, with a minimum of two words.
- Accuracy of the model is ~88%.
"""

markdown_text_file_upload = """
- Trained on ~1,500 rows of English data with bert-base-uncased.
- Upload a CSV file ONLY, and name the column holding the budget line items **text**.
- Uses a RAG (Retrieval-Augmented Generation) style approach, feeding the preceding budget lines into the classifier as context.
- Accuracy of the model is ~88%.
"""
html_table = """ | |
<h2 style="text-align: center;">COFOG Budget AutoClassification</h2> | |
<p style="text-align: justify; margin-left: 30px; margin-right: 30px;"> | |
This classifier was developed utilizing the pre-trained BERT | |
(Bidirectional Encoder Representations from Transformers) model | |
with an uncased configuration, with over 1500 manually | |
labeled dataset comprising budget line items extracted from | |
various budgetary documents. To balance the data, additional data | |
was generated using GPT-4 where categories were not available | |
in budget documents. The model training was executed | |
on a Google Colab environment, specifically utilizing a Tesla T4 GPU. | |
The model is designed to predict the primary classification level | |
of the Classification of the Functions of Government (COFOG), | |
with the predictions from the first level serving as contextual | |
input for subsequent second-level classification. The project | |
is conducted with an exclusive focus on academic and research | |
objectives.<br>For batch prediction we integrated Retriever-Augmented Generator (RAG) | |
approach. This approach enriches the prediction process | |
by incorporating contextual information from up to 5 preceding | |
lines in the dataset, significantly enhancing the model's | |
ability to understand and classify each entry in the context | |
of related data.<br>Detailed metrics of the training process are as follows: | |
<code>TrainOutput(global_step=395, training_loss=1.1497593360611156, | |
metrics={'train_runtime': 650.0119, 'train_samples_per_second': | |
9.638, 'train_steps_per_second': 0.608, 'total_flos': 1648509163714560.0, | |
'train_loss': 1.1497593360611156, 'epoch': 5.0})</code>. | |
</p> | |
</div> | |
""" | |

# First interface for single-line input
iface1 = gr.Interface(
    fn=predict,
    inputs=gr.components.Textbox(lines=1, placeholder="Enter budget line here...", label="Budget Input"),
    outputs=gr.components.Label(label="Classification Output"),
    title="COFOG AutoClassification - Single Line",
    description=markdown_text,
    article=html_table,
    allow_flagging="manual",  # Enables flagging
    flagging_options=["Incorrect Level1", "Incorrect Level2"],
    flagging_callback=hf_writer,
)

# Second interface (for CSV file upload)
iface2 = gr.Interface(
    fn=classify_csv,
    inputs=gr.components.File(label="Upload CSV File"),
    outputs=gr.components.DataFrame(label="Classification Results"),
    description=markdown_text_file_upload,
    article=html_table,
    title="COFOG AutoClassification - Batch Classification"
)

# Combine the interfaces in a tabbed interface
tabbed_interface = gr.TabbedInterface(
    [iface1, iface2],
    ["Single Prediction", "Batch Prediction"]
)

# Run the interface
if __name__ == "__main__":
    tabbed_interface.launch()