peterkros committed on
Commit c0f6901 · 1 Parent(s): 8188dcd

Update app.py

Files changed (1)
app.py +54 -28
app.py CHANGED
@@ -3,33 +3,61 @@ from transformers import AutoModelForSequenceClassification, AutoTokenizer
  import torch
  import pickle

- # Load the model and tokenizer from Hugging Face Hub
- model_name = "peterkros/COFOG-bert2"
- model = AutoModelForSequenceClassification.from_pretrained(model_name)
- tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+ # Model names for level1 and level2
+ model_name_level1 = "peterkros/COFOG-bert2"
+ model_name_level2 = "peterkros/COFOG-bert-level2"
+
+ # Load models and tokenizers for both levels
+ model_level1 = AutoModelForSequenceClassification.from_pretrained(model_name_level1)
+ tokenizer_level1 = AutoTokenizer.from_pretrained(model_name_level1)
+
+ model_level2 = AutoModelForSequenceClassification.from_pretrained(model_name_level2)
+ tokenizer_level2 = AutoTokenizer.from_pretrained(model_name_level2)
+

  # Load the label encoder
- with open('label_encoder.pkl', 'rb') as file:
-     label_encoder = pickle.load(file)
+ with open('label_encoder_level1.pkl', 'rb') as file:
+     label_encoder_level1 = pickle.load(file)
+
+ with open('label_encoder_level2.pkl', 'rb') as file:
+     label_encoder_level2 = pickle.load(file)
+

  def predict(text):
-     inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
+     # Check if the input has at least two words
+     if len(text.split()) < 2:
+         return "Input must have at least two words."
+
+     # Predict Level1
+     inputs_level1 = tokenizer_level1(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
      with torch.no_grad():
-         outputs = model(**inputs)
-     probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
-     predicted_class = torch.argmax(probs, dim=-1).item()
-     predicted_label = label_encoder.inverse_transform([predicted_class])[0]
-     return predicted_label
+         outputs_level1 = model_level1(**inputs_level1)
+     probs_level1 = torch.nn.functional.softmax(outputs_level1.logits, dim=-1)
+     predicted_class_level1 = torch.argmax(probs_level1, dim=-1).item()
+     predicted_label_level1 = label_encoder_level1.inverse_transform([predicted_class_level1])[0]
+
+     # Predict Level2 (assuming level2 model uses both text and predicted level1 label)
+     combined_input = text + " " + predicted_label_level1
+     inputs_level2 = tokenizer_level2(combined_input, return_tensors="pt", padding=True, truncation=True, max_length=512)
+     with torch.no_grad():
+         outputs_level2 = model_level2(**inputs_level2)
+     probs_level2 = torch.nn.functional.softmax(outputs_level2.logits, dim=-1)
+     predicted_class_level2 = torch.argmax(probs_level2, dim=-1).item()
+     predicted_label_level2 = label_encoder_level2.inverse_transform([predicted_class_level2])[0]
+     combined_prediction = f"Level1: {predicted_label_level1} - Level2: {predicted_label_level2}"
+     return combined_prediction
+

  # Define the markdown text with bullet points
  markdown_text = """
- - Trained with ~1500 rows of data on bert-large-uncased, English.
- - Input one budget line per time.
+ - Trained with ~1500 rows of data on bert-base-uncased, English.
+ - Input one budget line per time with min 2 words.
  - Accuracy of the model is ~88%.
  """
  html_table = """
- <h2 style="text-align: center;">COFOG Budget Classification</h2>
- <p style="text-align: justify; margin-left: 20px; margin-right: 20px;">
+ <h2 style="text-align: center;">COFOG Budget AutoClassification</h2>
+ <p style="text-align: justify; margin-left: 30px; margin-right: 30px;">
  This classifier was developed utilizing the pre-trained BERT
  (Bidirectional Encoder Representations from Transformers) model
  with an uncased configuration, with over 1500 manually
@@ -37,18 +65,18 @@ html_table = """
  various budgetary documents. To balance the data, additional data
  was generated using GPT-4 where categories were not available
  in budget documents. The model training was executed
- on a Google Colab environment, specifically utilizing a Tesla T4 GPU.
- Detailed metrics of the training process are as follows:
- <code>TrainOutput(global_step=395, training_loss=1.1497593360611156,
- metrics={'train_runtime': 650.0119, 'train_samples_per_second':
- 9.638, 'train_steps_per_second': 0.608, 'total_flos': 1648509163714560.0,
- 'train_loss': 1.1497593360611156, 'epoch': 5.0})</code>. The model
- is designed to predict the primary classification level
+ on a Google Colab environment, specifically utilizing a Tesla T4 GPU.
+ The model is designed to predict the primary classification level
  of the Classification of the Functions of Government (COFOG),
  with the predictions from the first level serving as contextual
  input for subsequent second-level classification. The project
  is conducted with an exclusive focus on academic and research
- objectives.
+ objectives.
+ Detailed metrics of the training process are as follows:
+ <code>TrainOutput(global_step=395, training_loss=1.1497593360611156,
+ metrics={'train_runtime': 650.0119, 'train_samples_per_second':
+ 9.638, 'train_steps_per_second': 0.608, 'total_flos': 1648509163714560.0,
+ 'train_loss': 1.1497593360611156, 'epoch': 5.0})</code>.
  </p>
  <table style="margin-left: auto; margin-right: auto;">
  <tr>
@@ -95,7 +123,7 @@ iface = gr.Interface(
      fn=predict,
      inputs=gr.components.Textbox(lines=1, placeholder="Enter Budget line here...", label="Budget Input"),
      outputs=gr.components.Label(label="Classification Output"),
-     title="COFOG Level 1 Classification",
+     title="COFOG AutoClassification",
      description=markdown_text,
      article=html_table,
      allow_flagging="auto"  # Enables flagging
@@ -103,6 +131,4 @@

  # Run the interface
  if __name__ == "__main__":
-     iface.launch()
-
-
+     iface.launch()
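The updated app.py loads label_encoder_level1.pkl and label_encoder_level2.pkl from the Space's working directory; those pickle files are not part of this commit. Below is a minimal sketch of how such files could be produced, assuming they are scikit-learn LabelEncoder objects fitted on the level-1 and level-2 training labels. The file names match what app.py expects; the COFOG label strings are illustrative placeholders, not the model's actual classes.

```python
# Sketch only: create the pickled label encoders that app.py loads.
# Assumes scikit-learn LabelEncoder; the label lists are hypothetical placeholders.
import pickle
from sklearn.preprocessing import LabelEncoder

level1_labels = ["General public services", "Economic affairs", "Health"]   # hypothetical
level2_labels = ["Transport", "Hospital services", "Executive organs"]      # hypothetical

for path, labels in [("label_encoder_level1.pkl", level1_labels),
                     ("label_encoder_level2.pkl", level2_labels)]:
    encoder = LabelEncoder()
    encoder.fit(labels)  # assigns an integer class id to each label string
    with open(path, "wb") as f:
        pickle.dump(encoder, f)
```

With encoders saved this way, the inverse_transform calls in predict() map the argmax class index back to its label string, and the level-1 string is what gets appended to the input text before the level-2 pass.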