File size: 3,615 Bytes
c59a66a
 
acee5d9
762e05d
 
c59a66a
acee5d9
 
 
c59a66a
 
 
762e05d
 
c59a66a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
acee5d9
762e05d
c59a66a
acee5d9
 
 
 
 
c59a66a
acee5d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c59a66a
762e05d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c59a66a
762e05d
 
acee5d9
 
 
 
 
 
c59a66a
762e05d
 
 
c59a66a
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import gradio as gr
from transformers import pipeline
# from embedding_gen import load_skills_from_date, visualize3D
import numpy as np
import pickle

# token_skill_classifier = pipeline(model="jjzha/jobbert_skill_extraction", aggregation_strategy="first")
# token_knowledge_classifier = pipeline(model="jjzha/jobbert_knowledge_extraction")
# token_knowledge_classifier = pipeline(model="Robzy/jobbert_knowledge_extraction")


# Sample job-ad sentences (referenced only by the disabled NER interface).
examples = [
    "High proficiency in Python and AI/ML frameworks, i.e. Pytorch.",
    "Experience with Unreal and/or Unity and/or native IOS/Android 3D development",
]


def aggregate_span(results):
    """Merge adjacent token-level entities into multi-word spans.

    Two consecutive entries are merged when the second entry's ``"start"``
    equals the running entry's ``"end"`` plus one. The merged entry keeps the
    other fields of the first entry, joins the ``"word"`` values with a
    single space, and takes the ``"end"`` of the last merged entry.

    Args:
        results: Sequence of dicts, each providing ``"start"``, ``"end"`` and
            ``"word"`` keys (assumed to be in text order).

    Returns:
        A new list of merged dicts. Empty input yields an empty list, and the
        dicts passed in are left unmodified.
    """
    # Guard against empty input; indexing results[0] would raise otherwise.
    if not results:
        return []

    merged = []
    # Work on shallow copies so the caller's dicts are never mutated.
    current = dict(results[0])

    for result in results[1:]:
        if result["start"] == current["end"] + 1:
            current["word"] += " " + result["word"]
            current["end"] = result["end"]
        else:
            merged.append(current)
            current = dict(result)

    merged.append(current)
    return merged

# def ner(text):


#     output_knowledge = token_knowledge_classifier(text)
#     for result in output_knowledge:
#         if result.get("entity_group"):
#             result["entity"] = "Knowledge"
#             del result["entity_group"]

#     if len(output_knowledge) > 0:
#         output_knowledge = aggregate_span(output_knowledge)

#     return {"text": text, "entities": output_knowledge}

### Visualisation 3D

import os

def load_skills_from_date(base_folder, date):
    """Collect the unique skills listed in the ``.txt`` files of a date folder.

    Every non-blank line of every ``.txt`` file directly inside
    ``base_folder/date`` is treated as one skill; surrounding whitespace is
    stripped and duplicates are dropped.

    Args:
        base_folder: Directory that contains one sub-folder per date.
        date: Name of the sub-folder to read.

    Returns:
        A sorted list of unique skill strings. The list is empty when the
        date folder does not exist or is not a directory.
    """
    date_folder = os.path.join(base_folder, date)
    # isdir() is already False for missing paths, so no separate exists() check.
    if not os.path.isdir(date_folder):
        return []

    unique_skills = set()
    for file_name in os.listdir(date_folder):
        if not file_name.endswith(".txt"):
            continue
        file_path = os.path.join(date_folder, file_name)
        # A directory whose name ends in ".txt" cannot be opened as a file.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, "r", encoding="utf-8") as f:
            unique_skills.update(line.strip() for line in f if line.strip())

    # Sort so the result does not depend on set iteration order.
    return sorted(unique_skills)

def visualize3D(reduced_embeddings, labels, skills, n_clusters, output_folder, date):
    """Build a 3D scatter figure from the first three embedding columns.

    Args:
        reduced_embeddings: Array indexed as ``[:, i]``; columns 0, 1 and 2
            become the x, y and z coordinates.
        labels: Forwarded to Plotly as the ``color`` of each point.
        skills: Forwarded to Plotly as the ``text`` of each point.
        n_clusters: Only used in the figure title.
        output_folder: Currently unused; writing the plot to disk is disabled.
        date: Only used in the figure title.

    Returns:
        The figure object created by ``px.scatter_3d``.
    """
    x_coords = reduced_embeddings[:, 0]
    y_coords = reduced_embeddings[:, 1]
    z_coords = reduced_embeddings[:, 2]
    plot_title = f"KMeans Clustering with {n_clusters} Clusters ({date})"

    # Nothing is saved or shown here; the caller decides how to render it.
    return px.scatter_3d(
        x=x_coords,
        y=y_coords,
        z=z_coords,
        color=labels,
        text=skills,
        title=plot_title,
    )



import plotly.express as px
import numpy as np

specific_date = "03-01-2024"  # Example date folder to process

# Precomputed embeddings for the chosen date.
embeddings = np.load(f"./vectorstore/{specific_date}_embeddings.npy")

# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load metadata files that this project produced itself.
with open(f"./vectorstore/{specific_date}_metadata.pkl", "rb") as f:
    metadata = pickle.load(f)

# Labels and skills both come from the stored metadata, so the ./tags folder
# does not need to be scanned here.
labels, skills = metadata["labels"], metadata["skills"]

fig = visualize3D(
    embeddings,
    labels,
    skills,
    n_clusters=5,
    output_folder="./plots",
    date=specific_date,
)
# Make the figure tall enough to be usable inside the page.
fig.update_layout(height=900)

# Single-page Gradio app that displays the precomputed 3D skill figure.
demo = gr.Blocks()

# Disabled NER interface, kept for reference:
# gr.Interface(fn=ner,
#     inputs=gr.Textbox(placeholder="Enter sentence here..."),
#     outputs=["highlight"],
#     examples=examples,
#     title="In-demand skills in machine learning (ML) industry"
# )
# gr.Markdown("Embedding visualisation of sought skills in ML job posting in Stockholm, Sweden on LinkedIn")
with demo:
    gr.Plot(value=fig)

demo.launch()