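"""Gradio app for a Hugging Face Space that predicts ISCO occupation codes
from free-text job descriptions using a fine-tuned BERT classifier."""
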
import zipfile
import sys
import os

#sys.path.append('https://huggingface.co/spaces/PradeepJha/ISCO-code-predictor-api/resolve/main/models.zip')
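
# `zipfile` is imported above but never used directly. If the model artifacts
# ship as a models.zip archive (a guess based on the commented-out URL above),
# they could be unpacked at startup like this:
if os.path.exists('models.zip'):
    with zipfile.ZipFile('models.zip') as archive:
        archive.extractall()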

# Check current directory and list files 
print("Current Directory:", os.getcwd())
print("Files in Directory:", os.listdir())

import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tf_keras as keras
import pandas as pd
# Load via tf_keras (Keras 2): on newer TF releases, tensorflow.keras resolves
# to Keras 3, which cannot deserialize hub.KerasLayer-based .h5 models
from tf_keras.models import load_model
import classifier_data_lib
import tokenization
import joblib
from deep_translator import GoogleTranslator

print("NumPy Version:", np.__version__)
print("TensorFlow Version:", tf.__version__)
print("TensorFlow Hub Version:", hub.__version__)
print("Keras (via TensorFlow) Version:", keras.__version__)
print("Pandas Version:", pd.__version__)
print("Joblib Version:", joblib.__version__)

import gradio as gr

# Load the fine-tuned classifier; hub.KerasLayer must be registered as a
# custom object so the saved BERT layer can be deserialized
model = load_model('ISCO-Coder-BERT.h5', custom_objects={'KerasLayer': hub.KerasLayer})

# The hub layer is loaded separately to recover the vocab file and casing
# flag needed to build a tokenizer that matches the model
bert_layer = hub.KerasLayer("https://kaggle.com/models/tensorflow/bert/TensorFlow2/en-uncased-l-12-h-768-a-12/1", trainable=True)

vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)
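# Illustrative only: the uncased tokenizer lower-cases and wordpieces input,
# e.g. tokenizer.tokenize("Data Scientist") -> ['data', 'scientist']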

# Parameters
max_seq_length = 128  # BERT input length in wordpiece tokens
dummy_label = 100     # valid placeholder required by convert_single_example; ignored at inference
label_list = list(pd.read_excel('label_list.xlsx')['label_list'])


# Define a function to preprocess the new data
def get_feature_new(text, max_seq_length, tokenizer, dummy_label):
    example = classifier_data_lib.InputExample(guid=None,
                                               text_a=text.numpy().decode('utf-8'),
                                               text_b=None,
                                               label=dummy_label)  # Use a valid dummy label
    feature = classifier_data_lib.convert_single_example(0, example, label_list, max_seq_length, tokenizer)
    return feature.input_ids, feature.input_mask, feature.segment_ids

def get_feature_map_new(text):
    # tf.py_function lets the Python tokenizer run inside the tf.data graph,
    # but it drops static shape information, hence the set_shape calls below
    input_ids, input_mask, segment_ids = tf.py_function(
        lambda text: get_feature_new(text, max_seq_length, tokenizer, dummy_label),
        inp=[text],
        Tout=[tf.int32, tf.int32, tf.int32]
    )
    input_ids.set_shape([max_seq_length])
    input_mask.set_shape([max_seq_length])
    segment_ids.set_shape([max_seq_length])

    # Keys must match the input names of the saved BERT model
    x = {'input_word_ids': input_ids,
         'input_mask': input_mask,
         'input_type_ids': segment_ids}

    return x

def preprocess_new_data(texts):
    dataset = tf.data.Dataset.from_tensor_slices((texts,))
    dataset = dataset.map(get_feature_map_new,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = dataset.batch(32, drop_remainder=False)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
    
    return dataset
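
# Illustrative sanity check (not executed here): each batch from the pipeline
# is a dict of three [batch_size, max_seq_length] int32 tensors keyed by the
# model's input names:
#   for batch in preprocess_new_data(["data scientist"]):
#       print({name: tensor.shape for name, tensor in batch.items()})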

def launch(text_input):
    # Load the label encoder
    label_encoder = joblib.load('label_encoder.joblib')

    # Translate non-English input to English (best effort); if translation
    # fails, fall back to the original text
    try:
        text_input = GoogleTranslator(source='auto', target='en').translate(text_input)
    except Exception:
        pass
        
    sample_example = [text_input]
    new_data_dataset = preprocess_new_data(sample_example)

    # Make predictions on the new data (the model was loaded at module level)
    predictions = model.predict(new_data_dataset)

    # Decode the predictions: each row of `predictions` is a probability
    # distribution over label_list
    predicted_classes = [label_list[np.argmax(pred)] for pred in predictions]

    # Highest probability per input, reported as a rough confidence
    highest_probabilities = [max(instance) for instance in predictions]

    # Map the encoded class ids back to the original ISCO codes
    decoded_labels = label_encoder.inverse_transform(predicted_classes)

    print("Most likely ISCO code is {} and probability is {}".format(decoded_labels,highest_probabilities))

    return [decoded_labels,highest_probabilities]
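
# Example (hypothetical input):
#   launch("designs and supervises construction of bridges")
#   -> [decoded ISCO code(s), highest predicted probability per input]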


iface = gr.Interface(
    fn=launch,
    inputs=gr.Textbox(lines=2, placeholder="Enter job title and description here..."),
    outputs="text"
)

iface.launch()