File size: 2,517 Bytes
8d30fa7
 
 
 
 
 
 
bae027f
8d30fa7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bae027f
8d30fa7
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# import gradio as gr

# gr.load("models/BAAI/bge-m3").launch()

import functools
import json
import os

import faiss
import gradio as gr
import numpy as np
from FlagEmbedding import BGEM3FlagModel

# Define a function to load the ISCO taxonomy
def load_isco_taxonomy(file_path: str) -> list:
    """Read a JSON Lines file and return its records as a list.

    Every non-blank line of the file is parsed as one JSON document. Blank or
    whitespace-only lines (for example a trailing empty line at the end of
    the file) are skipped instead of being handed to the JSON parser.

    Args:
        file_path: Path of the UTF-8 encoded ``.jsonl`` file to read.

    Returns:
        The parsed records, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # json.loads tolerates surrounding whitespace, so strip() is only
        # needed to detect lines that carry no content at all.
        return [json.loads(line) for line in file if line.strip()]

# Define a function to create a FAISS index
def create_faiss_index(isco_taxonomy, model_name='BAAI/bge-m3'):
    """Embed the taxonomy descriptions and persist a FAISS index for them.

    Two files are written using relative paths (i.e. into the current working
    directory):

    * ``isco_taxonomy.index`` - a ``faiss.IndexFlatL2`` holding one dense
      embedding per entry, computed from the entry's ``ESCO_DESCRIPTION``.
    * ``isco_taxonomy_mapping.json`` - maps each row position in the index to
      the original entry. JSON object keys are strings, so readers must look
      entries up with ``str(position)``.

    Args:
        isco_taxonomy: Sequence of dict-like entries, each providing an
            ``ESCO_DESCRIPTION`` key.
        model_name: Identifier passed to ``BGEM3FlagModel``.

    Raises:
        ValueError: If ``isco_taxonomy`` is empty.
        KeyError: If an entry has no ``ESCO_DESCRIPTION`` key.
    """
    # Fail early with a clear message: without entries there is no embedding
    # matrix to take the dimension from, and loading the model would be wasted.
    if not isco_taxonomy:
        raise ValueError("isco_taxonomy is empty; nothing to index")

    texts = [str(entry['ESCO_DESCRIPTION']) for entry in isco_taxonomy]

    model = BGEM3FlagModel(model_name, use_fp16=True)
    embeddings = model.encode(texts, batch_size=12, max_length=256)['dense_vecs']
    # The embeddings are cast to float32 before being added to the index.
    embeddings = np.array(embeddings).astype('float32')

    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    faiss.write_index(index, 'isco_taxonomy.index')

    with open('isco_taxonomy_mapping.json', 'w', encoding='utf-8') as f:
        json.dump({i: entry for i, entry in enumerate(isco_taxonomy)}, f)

# Define a function to retrieve and rerank using FAISS
def retrieve_and_rerank_faiss(job_duties, model_name="BAAI/bge-m3", top_k=4):
    """Return the taxonomy descriptions closest to a job-duties text.

    The text is embedded with the dense vectors of ``model_name`` and looked
    up in the FAISS index stored in ``isco_taxonomy.index``. When that index
    or its ``isco_taxonomy_mapping.json`` companion is missing, both are
    first built from ``isco_taxonomy.jsonl`` via ``create_faiss_index``.

    Args:
        job_duties: Free-text description to look up.
        model_name: Identifier passed to ``BGEM3FlagModel``.
        top_k: Maximum number of neighbours to return.

    Returns:
        A list of ``(ESCO_DESCRIPTION, distance)`` tuples in the order FAISS
        reports them. The list holds fewer than ``top_k`` items when the
        index contains fewer vectors than requested.
    """
    index_path = "isco_taxonomy.index"
    mapping_path = "isco_taxonomy_mapping.json"

    # Both artefacts are needed below; rebuild if either one is absent so a
    # half-written previous build does not leave the app permanently broken.
    if not (os.path.exists(index_path) and os.path.exists(mapping_path)):
        isco_taxonomy = load_isco_taxonomy('isco_taxonomy.jsonl')
        # Index the taxonomy with the same model used for the query so the
        # stored vectors and the query vector share one embedding space.
        create_faiss_index(isco_taxonomy, model_name=model_name)

    index = faiss.read_index(index_path)
    with open(mapping_path, "r", encoding="utf-8") as f:
        isco_taxonomy = json.load(f)

    model = _get_embedding_model(model_name)
    query_embedding = model.encode([job_duties], max_length=256)["dense_vecs"]
    query_embedding = np.array(query_embedding).astype("float32")
    distances, indices = index.search(query_embedding, top_k)

    # FAISS pads the label array with -1 when it finds fewer than top_k
    # neighbours; those slots have no entry in the mapping and are skipped.
    # Mapping keys are strings because the mapping was round-tripped via JSON.
    return [
        (isco_taxonomy[str(idx)]["ESCO_DESCRIPTION"], distance)
        for idx, distance in zip(indices[0], distances[0])
        if idx >= 0
    ]


@functools.lru_cache(maxsize=1)
def _get_embedding_model(model_name):
    """Build the embedding model once and reuse it for subsequent queries."""
    return BGEM3FlagModel(model_name, use_fp16=True)

# Optional one-time setup: build the index ahead of time (e.g. from a setup script).
# retrieve_and_rerank_faiss also attempts to build it on demand when isco_taxonomy.index is missing.
# isco_taxonomy = load_isco_taxonomy('isco_taxonomy.jsonl')
# create_faiss_index(isco_taxonomy)

# Gradio Interface
def gradio_interface(job_duties):
    """Format the retrieval results for ``job_duties`` as display strings.

    Calls ``retrieve_and_rerank_faiss`` with its default model and ``top_k``
    and returns one ``"Description: ..., Distance: ..."`` string per hit.
    """
    formatted = []
    for description, distance in retrieve_and_rerank_faiss(job_duties):
        formatted.append(f"Description: {description}, Distance: {distance}")
    return formatted

# Wire gradio_interface to a text-in / text-out UI and start serving it.
iface = gr.Interface(
    fn=gradio_interface,
    inputs="text",
    outputs="text",
    title="Job Duties to ISCO Descriptions",
)
iface.launch()