File size: 4,222 Bytes
32e8749
 
 
 
 
 
a7f33dd
 
32e8749
49e21b1
cebee93
 
fba8174
32e8749
412b9d9
32e8749
a0332ee
 
 
 
32e8749
 
 
 
 
2401d61
32e8749
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fba8174
49e21b1
fba8174
 
 
 
49e21b1
 
 
 
 
 
fba8174
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73e4fda
 
 
fba8174
 
cebee93
42ebb8e
73e4fda
7040da3
c056a91
fba8174
4397a91
 
108650c
73e4fda
4397a91
 
6ca040b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tf_keras as keras
import pandas as pd
from tensorflow.keras.models import load_model
import classifier_data_lib
import tokenization
import joblib
from deep_translator import GoogleTranslator
import sys
import os
import gradio as gr

# Fine-tuned classifier, loaded once when the module is executed.
# hub.KerasLayer is registered through custom_objects; presumably the saved
# model contains a KerasLayer that Keras cannot resolve by itself.
model = load_model('ISCO-Coder-BERT.h5', custom_objects={'KerasLayer': hub.KerasLayer})

# BERT layer from the hub. In the code visible here it is only used to read
# the vocabulary file and the lower-casing flag needed to build the tokenizer.
bert_layer = hub.KerasLayer("https://kaggle.com/models/tensorflow/bert/TensorFlow2/en-uncased-l-12-h-768-a-12/1",trainable=True)

# Tokenizer configured from the assets shipped with the BERT layer.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file,do_lower_case)

# Parameters
# Sequence length passed to convert_single_example and used as the static
# shape of every feature tensor.
max_seq_length = 128
# Placeholder label attached to inference examples.
# NOTE(review): presumably has to be a member of label_list - confirm.
dummy_label = 100
# Class labels read from the 'label_list' column; predictions index into this list.
label_list = list(pd.read_excel('label_list.xlsx')['label_list'])


def get_feature_new(text, max_seq_length, tokenizer, dummy_label):
    """Turn a single text tensor into BERT input features.

    Args:
        text: Eager string tensor; its value is decoded as UTF-8.
        max_seq_length: Sequence length forwarded to ``convert_single_example``.
        tokenizer: Tokenizer forwarded to ``convert_single_example``.
        dummy_label: Placeholder label stored on the example.

    Returns:
        The tuple ``(input_ids, input_mask, segment_ids)`` taken from the
        converted feature. The module-level ``label_list`` is used as the
        label vocabulary for the conversion.
    """
    decoded_text = text.numpy().decode('utf-8')
    single_example = classifier_data_lib.InputExample(
        guid=None,
        text_a=decoded_text,
        text_b=None,
        label=dummy_label,
    )
    converted = classifier_data_lib.convert_single_example(
        0, single_example, label_list, max_seq_length, tokenizer
    )
    return converted.input_ids, converted.input_mask, converted.segment_ids

def get_feature_map_new(text):
    """Map one text tensor to the dict of model inputs.

    ``get_feature_new`` is wrapped in ``tf.py_function`` and called with the
    module-level ``max_seq_length``, ``tokenizer`` and ``dummy_label``. Each
    of the three int32 outputs is given the static shape ``[max_seq_length]``.

    Args:
        text: String tensor holding one input text.

    Returns:
        Dict with the keys ``input_word_ids``, ``input_mask`` and
        ``input_type_ids``.
    """
    def _featurize(raw_text):
        return get_feature_new(raw_text, max_seq_length, tokenizer, dummy_label)

    word_ids, mask, type_ids = tf.py_function(
        _featurize,
        inp=[text],
        Tout=[tf.int32, tf.int32, tf.int32],
    )

    # py_function outputs carry no static shape, so it is set explicitly.
    static_shape = [max_seq_length]
    word_ids.set_shape(static_shape)
    mask.set_shape(static_shape)
    type_ids.set_shape(static_shape)

    return {
        'input_word_ids': word_ids,
        'input_mask': mask,
        'input_type_ids': type_ids,
    }

def preprocess_new_data(texts):
    """Build the batched ``tf.data`` pipeline used for prediction.

    Args:
        texts: Sequence of input strings.

    Returns:
        A dataset of feature dicts produced by ``get_feature_map_new``,
        batched in groups of 32 (the final partial batch is kept) and
        prefetched.
    """
    autotune = tf.data.experimental.AUTOTUNE
    return (
        tf.data.Dataset.from_tensor_slices((texts,))
        .map(get_feature_map_new, num_parallel_calls=autotune)
        .batch(32, drop_remainder=False)
        .prefetch(autotune)
    )
def launch(text_input):
    """Predict the ISCO-08 code for a free-text job title/description.

    The text is translated to English on a best-effort basis, run through the
    module-level ``model``, and the top-scoring class is mapped back to an
    ISCO code with the saved label encoder. The English title of that code is
    then looked up in the ISCO-08 structure workbook.

    Args:
        text_input: Job title and/or description, in any language.

    Returns:
        A three-line string with the predicted ISCO code, its probability
        (two decimals) and the matching ISCO title.
    """
    # Load the label encoder
    label_encoder = joblib.load('label_encoder.joblib')

    # Translation is best-effort: if the service fails or returns nothing,
    # the original text is classified as-is. Only Exception is caught so that
    # KeyboardInterrupt/SystemExit still propagate.
    try:
        translated = GoogleTranslator(source='auto', target='en').translate(text_input)
    except Exception as exc:
        print("Translation failed, using the original text: {}".format(exc))
    else:
        if translated:
            text_input = translated

    sample_example = [text_input]
    new_data_dataset = preprocess_new_data(sample_example)

    # Make predictions on the new data
    predictions = model.predict(new_data_dataset)

    # Decode the predictions
    predicted_classes = [label_list[np.argmax(pred)] for pred in predictions]

    # Calculate the highest probabilities
    highest_probabilities = [max(instance) for instance in predictions]

    # Decode labels using the label encoder
    decoded_labels = label_encoder.inverse_transform(predicted_classes)
    predicted_code = decoded_labels[0]

    # Compare against the scalar code: comparing the column with the whole
    # decoded_labels array depends on broadcasting and raises a length
    # mismatch error on current pandas versions.
    map_data = pd.read_excel("ISCO-08 EN Structure and definitions.xlsx")
    matching_titles = map_data.loc[map_data['ISCO 08 Code'] == predicted_code, 'Title EN']
    if matching_titles.empty:
        isco_description = "(no matching ISCO title found)"
    else:
        # Use the plain title; formatting the Series itself would leak its
        # index and the "Name: ..., dtype: ..." footer into the UI.
        isco_description = matching_titles.iloc[0]

    print("Most likely ISCO code is {} and probability is {}".format(decoded_labels,highest_probabilities))

    # Create descriptive text for the output
    result_text = f"Predicted ISCO Code: {predicted_code}\nProbability: {highest_probabilities[0]:.2f}\nISCO Description:{isco_description}"
    return result_text


# Gradio UI: one free-text input box and one text output box, both wired to
# launch(), which returns the formatted prediction string.
iface = gr.Interface(
    fn=launch,
    inputs=gr.Textbox(lines=2, placeholder="Enter job title in any language (e.g., Software Engineer) and description here (e.g., Develops and maintains software applications)..."),
    outputs=gr.Textbox(lines=2, placeholder="Predicted ISCO Code: <result> Probability: <result> ISCO Description: <result>")
)

# Start the Gradio app as soon as this module is executed.
iface.launch()