File size: 5,205 Bytes
f8da2f0
 
 
 
 
 
762e05d
 
f8da2f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
762e05d
6431e51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
762e05d
 
6431e51
f8da2f0
 
 
 
f63b5f6
 
9cda09e
f8da2f0
762e05d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6431e51
762e05d
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import os
from sentence_transformers import SentenceTransformer
import numpy as np
import umap
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.cluster import KMeans
import pickle

# Step 1: Load skills from all files in a specific date folder
def load_skills_from_date(base_folder, date):
    date_folder = os.path.join(base_folder, date)
    all_skills = set()  # To ensure unique skills
    if os.path.exists(date_folder) and os.path.isdir(date_folder):
        for file_name in os.listdir(date_folder):
            file_path = os.path.join(date_folder, file_name)
            if file_name.endswith(".txt"):
                with open(file_path, 'r', encoding='utf-8') as f:
                    all_skills.update(line.strip() for line in f if line.strip())
    return list(all_skills)

# Step 2: Generate embeddings using a pretrained model
def generate_embeddings(skills, model_name="paraphrase-MiniLM-L3-v2"):
    model = SentenceTransformer(model_name)
    embeddings = model.encode(skills, convert_to_numpy=True)
    return embeddings

# Step 3: Reduce dimensionality using UMAP
def reduce_dimensions(embeddings, n_components=2):
    reducer = umap.UMAP(n_components=n_components, random_state=42)
    reduced_embeddings = reducer.fit_transform(embeddings)
    return reduced_embeddings

# Step 4: Visualize the reduced embeddings (2D)
def visualize_embeddings_2d(reduced_embeddings, skills, output_folder, date):
    plt.figure(figsize=(10, 8))
    plt.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], s=50, alpha=0.8)
    for i, skill in enumerate(skills):
        plt.text(reduced_embeddings[i, 0], reduced_embeddings[i, 1], skill, fontsize=9, alpha=0.75)
    plt.title(f"UMAP Projection of Skill Embeddings ({date})")
    plt.xlabel("UMAP Dimension 1")
    plt.ylabel("UMAP Dimension 2")
    
    # Save the plot
    os.makedirs(output_folder, exist_ok=True)
    plot_path = os.path.join(output_folder, f"{date}_2D_projection.png")
    plt.savefig(plot_path, format="png", dpi=300)
    print(f"2D plot saved at {plot_path}")
    
    plt.show()

# Step 5: Visualize the reduced embeddings (3D)
def visualize_embeddings_3d(reduced_embeddings, skills, output_folder, date):
    fig = px.scatter_3d(
        x=reduced_embeddings[:, 0],
        y=reduced_embeddings[:, 1],
        z=reduced_embeddings[:, 2],
        text=skills,
        title=f"3D UMAP Projection of Skill Embeddings ({date})"
    )
    
    # Save the plot
    os.makedirs(output_folder, exist_ok=True)
    plot_path = os.path.join(output_folder, f"{date}_3D_projection.html")
    fig.write_html(plot_path)
    print(f"3D plot saved at {plot_path}")
    
    fig.show()

def visualize3D(reduced_embeddings, labels, skills, n_clusters, output_folder, date):
    
    fig = px.scatter_3d(
        x=reduced_embeddings[:, 0],
        y=reduced_embeddings[:, 1],
        z=reduced_embeddings[:, 2],
        color=labels,
        text=skills,
        title=f"KMeans Clustering with {n_clusters} Clusters ({date})"
    )
    
    # Save the clustered plot
    os.makedirs(output_folder, exist_ok=True)
    plot_path = os.path.join(output_folder, f"{date}_3D_clustering.html")
    fig.write_html(plot_path)
    print(f"3D clustered plot saved at {plot_path}")
    
    # fig.show()
    return fig

# Main execution
base_folder = "./tags"
output_folder = "./plots"
specific_date = "03-01-2024"  # Example date folder to process
# Get today's date in the desired format
# specific_date = datetime.now().strftime("%d-%m-%Y")
n_clusters = 5

    # Main execution
    base_folder = "./tags"
    output_folder = "./plots"
    vector_store = "./vectorstore"
    specific_date = "03-01-2024"  # Example date folder to process
    n_clusters = 5

    # Load skills from the specified date folder
    skills = load_skills_from_date(base_folder, specific_date)
    if not skills:
        print(f"No skills found for the date: {specific_date}")
    else:
        print(f"Loaded {len(skills)} unique skills for the date: {specific_date}")
        
        # Generate embeddings
        embeddings = generate_embeddings(skills)
        
        # Reduce dimensions to 2D and visualize
        # reduced_embeddings_2d = reduce_dimensions(embeddings, n_components=2)
        # visualize_embeddings_2d(reduced_embeddings_2d, skills, output_folder, specific_date)
        
        # Reduce dimensions to 3D, cluster, and visualize
        reduced_embeddings_3d = reduce_dimensions(embeddings, n_components=3)
        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        labels = kmeans.fit_predict(reduced_embeddings_3d)
        visualize3D(reduced_embeddings_3d, labels, skills, n_clusters, output_folder, specific_date)

        # Save the reduced embeddings and metadata
        np.save(os.path.join(vector_store, f"{specific_date}_embeddings.npy"), reduced_embeddings_3d)
        with open(os.path.join(vector_store, f"{specific_date}_metadata.pkl"), 'wb') as f:
            pickle.dump({'labels': labels, 'skills': skills}, f)
        

        # Perform KMeans clustering and visualize in 3D
        # perform_kmeans_and_visualize(reduced_embeddings_3d, skills, n_clusters, output_folder, specific_date)