import gradio as gr from transformers import pipeline # from embedding_gen import load_skills_from_date, visualize3D import numpy as np import pickle # token_skill_classifier = pipeline(model="jjzha/jobbert_skill_extraction", aggregation_strategy="first") # token_knowledge_classifier = pipeline(model="jjzha/jobbert_knowledge_extraction") # token_knowledge_classifier = pipeline(model="Robzy/jobbert_knowledge_extraction") examples = [ "High proficiency in Python and AI/ML frameworks, i.e. Pytorch.", "Experience with Unreal and/or Unity and/or native IOS/Android 3D development", ] def aggregate_span(results): new_results = [] current_result = results[0] for result in results[1:]: if result["start"] == current_result["end"] + 1: current_result["word"] += " " + result["word"] current_result["end"] = result["end"] else: new_results.append(current_result) current_result = result new_results.append(current_result) return new_results # def ner(text): # output_knowledge = token_knowledge_classifier(text) # for result in output_knowledge: # if result.get("entity_group"): # result["entity"] = "Knowledge" # del result["entity_group"] # if len(output_knowledge) > 0: # output_knowledge = aggregate_span(output_knowledge) # return {"text": text, "entities": output_knowledge} ### Visualisation 3D import os def load_skills_from_date(base_folder, date): date_folder = os.path.join(base_folder, date) all_skills = set() # To ensure unique skills if os.path.exists(date_folder) and os.path.isdir(date_folder): for file_name in os.listdir(date_folder): file_path = os.path.join(date_folder, file_name) if file_name.endswith(".txt"): with open(file_path, 'r', encoding='utf-8') as f: all_skills.update(line.strip() for line in f if line.strip()) return list(all_skills) def visualize3D(reduced_embeddings, labels, skills, n_clusters, output_folder, date): fig = px.scatter_3d( x=reduced_embeddings[:, 0], y=reduced_embeddings[:, 1], z=reduced_embeddings[:, 2], color=labels, text=skills, title=f"KMeans Clustering with {n_clusters} Clusters ({date})" ) # Save the clustered plot # os.makedirs(output_folder, exist_ok=True) # plot_path = os.path.join(output_folder, f"{date}_3D_clustering.html") # fig.write_html(plot_path) # print(f"3D clustered plot saved at {plot_path}") # fig.show() return fig import plotly.express as px import numpy as np specific_date = "03-01-2024" # Example date folder to process skills = load_skills_from_date('./tags', specific_date) embeddings = np.load(f"./vectorstore/{specific_date}_embeddings.npy") with open(f"./vectorstore/{specific_date}_metadata.pkl", "rb") as f: metadata = pickle.load(f) labels, skills = metadata["labels"], metadata["skills"] fig = visualize3D(embeddings, labels, skills, n_clusters=5, output_folder="./plots", date=specific_date) fig.update_layout( height=900 ) with gr.Blocks() as demo: gr.Markdown("# 3D Visualization of Skills in ML Job Postings", elem_id="title") # gr.Markdown("Embedding visualisation of sought skills in ML job posting in Stockholm, Sweden on LinkedIn") gr.Plot(fig) demo.launch()