# import gradio as gr
# gr.load("models/BAAI/bge-m3").launch()

import functools
import json
import os

import faiss
import gradio as gr
import numpy as np
from FlagEmbedding import BGEM3FlagModel

# On-disk artefacts shared by the index builder and the query path.
TAXONOMY_PATH = 'isco_taxonomy.jsonl'
INDEX_PATH = 'isco_taxonomy.index'
MAPPING_PATH = 'isco_taxonomy_mapping.json'


@functools.lru_cache(maxsize=1)
def _get_model(model_name):
    """Return a BGEM3FlagModel for *model_name*, constructing it once per process.

    Constructing the model is repeated work that does not depend on the
    query, so the most recently used instance is kept and reused.
    """
    return BGEM3FlagModel(model_name, use_fp16=True)


# Define a function to load the ISCO taxonomy
def load_isco_taxonomy(file_path: str) -> list:
    """Load the ISCO taxonomy from a JSON Lines file.

    Args:
        file_path: Path to a UTF-8 text file holding one JSON document per line.

    Returns:
        The decoded JSON documents, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Blank lines (e.g. a trailing newline) carry no record; skip them
        # rather than letting json.loads fail on an empty string.
        return [json.loads(line) for line in file if line.strip()]


# Define a function to create a FAISS index
def create_faiss_index(isco_taxonomy, model_name='BAAI/bge-m3'):
    """Embed every taxonomy entry and persist a FAISS index plus an id mapping.

    Writes two files into the current working directory:
    ``isco_taxonomy.index`` (a flat L2 FAISS index over the dense embeddings
    of each entry's ``ESCO_DESCRIPTION``) and ``isco_taxonomy_mapping.json``
    (row position -> original entry).

    Args:
        isco_taxonomy: Sequence of entries, each indexable by
            ``'ESCO_DESCRIPTION'``.
        model_name: Name of the BGE-M3 model used to compute the embeddings.

    Raises:
        ValueError: If ``isco_taxonomy`` is empty.
    """
    if not isco_taxonomy:
        raise ValueError("isco_taxonomy is empty; nothing to index")

    model = _get_model(model_name)
    texts = [str(entry['ESCO_DESCRIPTION']) for entry in isco_taxonomy]
    embeddings = model.encode(texts, batch_size=12, max_length=256)['dense_vecs']
    embeddings = np.array(embeddings).astype('float32')

    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    faiss.write_index(index, INDEX_PATH)

    # JSON object keys are always strings, so the integer positions written
    # here come back as "0", "1", ... when the mapping is loaded again.
    with open(MAPPING_PATH, 'w', encoding='utf-8') as f:
        json.dump({i: entry for i, entry in enumerate(isco_taxonomy)}, f)


# Define a function to retrieve and rerank using FAISS
def retrieve_and_rerank_faiss(job_duties, model_name="BAAI/bge-m3", top_k=4):
    """Return the taxonomy descriptions closest to *job_duties*.

    If the index file or the mapping file is missing, both are rebuilt from
    ``isco_taxonomy.jsonl`` first, using the same ``model_name`` as the query
    so that index and query embeddings are comparable.

    Args:
        job_duties: Free-text query to embed.
        model_name: Name of the BGE-M3 model used to embed the query.
        top_k: Maximum number of neighbours to request from the index.

    Returns:
        A list of ``(ESCO_DESCRIPTION, distance)`` tuples in the order the
        FAISS search returned them. May hold fewer than ``top_k`` items when
        the index contains fewer entries.
    """
    # Both artefacts are needed; an index without its mapping is unusable.
    if not (os.path.exists(INDEX_PATH) and os.path.exists(MAPPING_PATH)):
        isco_taxonomy = load_isco_taxonomy(TAXONOMY_PATH)
        create_faiss_index(isco_taxonomy, model_name)

    index = faiss.read_index(INDEX_PATH)
    with open(MAPPING_PATH, "r", encoding="utf-8") as f:
        mapping = json.load(f)

    model = _get_model(model_name)
    query_embedding = model.encode([job_duties], max_length=256)["dense_vecs"]
    query_embedding = np.array(query_embedding).astype("float32")

    distances, indices = index.search(query_embedding, top_k)

    # FAISS pads missing neighbours with label -1; those slots have no
    # mapping entry and must be dropped.
    return [
        (mapping[str(idx)]["ESCO_DESCRIPTION"], dist)
        for idx, dist in zip(indices[0], distances[0])
        if idx >= 0
    ]


# Load data and create index (should be done once and then commented out or
# moved to a setup script)
# isco_taxonomy = load_isco_taxonomy('isco_taxonomy.jsonl')
# create_faiss_index(isco_taxonomy)


# Gradio Interface
def gradio_interface(job_duties):
    """Format the nearest taxonomy descriptions for display in the UI.

    Args:
        job_duties: Free-text query entered by the user.

    Returns:
        A list with one ``"Description: ..., Distance: ..."`` string per match.
    """
    results = retrieve_and_rerank_faiss(job_duties)
    return [f"Description: {desc}, Distance: {dist}" for desc, dist in results]


iface = gr.Interface(
    fn=gradio_interface,
    inputs="text",
    outputs="text",
    title="Job Duties to ISCO Descriptions",
)
iface.launch()