Aqsa-K committed on
Commit
f8da2f0
·
1 Parent(s): 9c3e55b

embedding and graphs added

Browse files
create_sample_skills.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generating sample folder structure and files with multiple skills per file
2
+
3
+ import os
4
+
5
# Base folder for the structure
base_folder = "tags"

# Sample data: dates and skills for each date.
# Each inner list becomes one numbered text file inside the date's folder.
sample_dates = ["03-01-2024", "04-01-2024", "05-01-2024"]
sample_skills = {
    "03-01-2024": [
        ["Python", "Machine Learning", "Data Analysis"],
        ["Python", "Deep Learning"],
        ["Data Science", "AI"],
    ],
    "04-01-2024": [
        ["Python", "AI", "Data Analysis"],
        ["Deep Learning", "Machine Learning"],
        ["AI", "Data Engineering"],
    ],
    "05-01-2024": [
        ["AI", "Machine Learning", "Python"],
        ["Data Science", "Deep Learning"],
        ["Python", "AI", "Cloud Computing"],
    ],
}


def generate_sample_files(root, skills_by_date, dates=None):
    """Create ``root/<date>/<n>.txt`` files, one skill per line.

    Args:
        root: Folder under which one sub-folder per date is created.
        skills_by_date: Mapping of date folder name to a list of skill lists;
            the n-th skill list (1-based) is written to ``<n>.txt``.
        dates: Optional iterable of date names to generate. Defaults to every
            key of ``skills_by_date`` so the two can never drift apart.

    Returns:
        List of the file paths written, in creation order.

    Raises:
        KeyError: If ``dates`` names a date missing from ``skills_by_date``.
    """
    if dates is None:
        dates = list(skills_by_date)

    written = []
    for date in dates:
        date_folder = os.path.join(root, date)
        os.makedirs(date_folder, exist_ok=True)

        for i, skills in enumerate(skills_by_date[date], start=1):
            file_path = os.path.join(date_folder, f"{i}.txt")
            with open(file_path, "w", encoding="utf-8") as f:
                f.write("\n".join(skills))
            written.append(file_path)
    return written


# Only generate files when run as a script, so the sample data and the
# generator can be imported without touching the file system.
if __name__ == "__main__":
    generate_sample_files(base_folder, sample_skills, sample_dates)
    print(f"Sample files with multiple skills per file have been generated in the '{base_folder}' folder.")
embedding_gen.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from sentence_transformers import SentenceTransformer
3
+ import numpy as np
4
+ import umap
5
+ import matplotlib.pyplot as plt
6
+ import plotly.express as px
7
+
8
+ # Step 1: Load skills from all files in a specific date folder
9
def load_skills_from_date(base_folder, date):
    """Collect the unique skills listed in the ``.txt`` files of one date folder.

    Each non-blank line of every ``.txt`` file is treated as one skill;
    surrounding whitespace is stripped.

    Args:
        base_folder: Directory containing one sub-folder per date.
        date: Name of the date sub-folder to read.

    Returns:
        Sorted list of unique skill names. Empty when the date folder does
        not exist or contains no skills.
    """
    date_folder = os.path.join(base_folder, date)
    if not os.path.isdir(date_folder):
        return []

    all_skills = set()  # To ensure unique skills
    for file_name in os.listdir(date_folder):
        file_path = os.path.join(date_folder, file_name)
        # Skip anything that is not a regular .txt file (e.g. a directory
        # whose name happens to end in ".txt").
        if not (file_name.endswith(".txt") and os.path.isfile(file_path)):
            continue
        with open(file_path, "r", encoding="utf-8") as f:
            all_skills.update(line.strip() for line in f if line.strip())

    # Set iteration order for strings changes between interpreter runs, so
    # sort to give callers a stable order for embeddings and plot labels.
    return sorted(all_skills)
19
+
20
+ # Step 2: Generate embeddings using a pretrained model
21
def generate_embeddings(skills, model_name="paraphrase-MiniLM-L3-v2"):
    """Encode the given skills with a pretrained sentence-transformer model.

    Args:
        skills: Skill strings to encode.
        model_name: Name passed to ``SentenceTransformer`` to select the model.

    Returns:
        The result of ``model.encode(skills, convert_to_numpy=True)``.
    """
    # The model is instantiated on every call.
    encoder = SentenceTransformer(model_name)
    return encoder.encode(skills, convert_to_numpy=True)
25
+
26
+ # Step 3: Reduce dimensionality using UMAP
27
def reduce_dimensions(embeddings, n_components=2):
    """Project embeddings down to ``n_components`` dimensions with UMAP.

    Args:
        embeddings: Embedding matrix handed to ``UMAP.fit_transform``.
        n_components: Number of output dimensions requested from UMAP.

    Returns:
        The reduced embeddings returned by ``fit_transform``.
    """
    # A fixed random_state is passed to UMAP on every call.
    projector = umap.UMAP(n_components=n_components, random_state=42)
    return projector.fit_transform(embeddings)
31
+
32
+ # Step 4: Visualize the reduced embeddings (2D)
33
def visualize_embeddings_2d(reduced_embeddings, skills, output_folder, date):
    """Draw a labelled 2D scatter plot of the reduced embeddings and save it.

    The figure is written to ``<output_folder>/<date>_2D_projection.png``
    (the folder is created if needed) and then displayed.

    Args:
        reduced_embeddings: Array indexed as ``[:, 0]`` / ``[:, 1]`` for the
            two plot coordinates.
        skills: Labels; the i-th label is drawn at the i-th row's position.
        output_folder: Directory that receives the PNG file.
        date: Date string used in the title and the file name.
    """
    xs = reduced_embeddings[:, 0]
    ys = reduced_embeddings[:, 1]

    plt.figure(figsize=(10, 8))
    plt.scatter(xs, ys, s=50, alpha=0.8)

    # Annotate every point with the skill it represents.
    for idx, label in enumerate(skills):
        plt.text(xs[idx], ys[idx], label, fontsize=9, alpha=0.75)

    plt.title(f"UMAP Projection of Skill Embeddings ({date})")
    plt.xlabel("UMAP Dimension 1")
    plt.ylabel("UMAP Dimension 2")

    # Persist the figure before showing it.
    os.makedirs(output_folder, exist_ok=True)
    target = os.path.join(output_folder, f"{date}_2D_projection.png")
    plt.savefig(target, format="png", dpi=300)
    print(f"2D plot saved at {target}")

    plt.show()
49
+
50
+ # Step 5: Visualize the reduced embeddings (3D)
51
def visualize_embeddings_3d(reduced_embeddings, skills, output_folder, date):
    """Build a labelled 3D scatter plot of the reduced embeddings and save it.

    The figure is written as HTML to
    ``<output_folder>/<date>_3D_projection.html`` (the folder is created if
    needed) and then displayed.

    Args:
        reduced_embeddings: Array indexed as ``[:, 0]``, ``[:, 1]`` and
            ``[:, 2]`` for the three plot coordinates.
        skills: Text labels passed to the scatter plot.
        output_folder: Directory that receives the HTML file.
        date: Date string used in the title and the file name.
    """
    x_coords = reduced_embeddings[:, 0]
    y_coords = reduced_embeddings[:, 1]
    z_coords = reduced_embeddings[:, 2]

    figure = px.scatter_3d(
        x=x_coords,
        y=y_coords,
        z=z_coords,
        text=skills,
        title=f"3D UMAP Projection of Skill Embeddings ({date})",
    )

    # Persist the figure before showing it.
    os.makedirs(output_folder, exist_ok=True)
    target = os.path.join(output_folder, f"{date}_3D_projection.html")
    figure.write_html(target)
    print(f"3D plot saved at {target}")

    figure.show()
67
+
68
# Main execution
base_folder = "./tags"
output_folder = "./plots"
specific_date = "03-01-2024"  # Example date folder to process


def main():
    """Embed the skills of ``specific_date`` and plot 2D and 3D UMAP projections.

    Reads skills from ``base_folder/specific_date``. When none are found a
    message is printed and nothing else happens; otherwise embeddings are
    generated once and reduced separately to two and three dimensions, with
    one plot saved to ``output_folder`` for each.
    """
    # Load skills from the specified date folder
    skills = load_skills_from_date(base_folder, specific_date)
    if not skills:
        print(f"No skills found for the date: {specific_date}")
        return

    print(f"Loaded {len(skills)} unique skills for the date: {specific_date}")

    # Generate embeddings
    embeddings = generate_embeddings(skills)

    # Reduce dimensions to 2D and visualize
    reduced_embeddings_2d = reduce_dimensions(embeddings, n_components=2)
    visualize_embeddings_2d(reduced_embeddings_2d, skills, output_folder, specific_date)

    # Reduce dimensions to 3D and visualize
    reduced_embeddings_3d = reduce_dimensions(embeddings, n_components=3)
    visualize_embeddings_3d(reduced_embeddings_3d, skills, output_folder, specific_date)


# Run the pipeline only when executed as a script, so the helper functions in
# this module can be imported without loading a model or opening plots.
if __name__ == "__main__":
    main()
plots/AI_trend.png ADDED
plots/Deep Learning_trend.png ADDED
plots/Python_trend.png ADDED
trend_graph.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ import matplotlib.pyplot as plt
4
+ from collections import Counter
5
+
6
# Path to the folder with date-wise subfolders
base_folder = "./tags"

# Directory to save the plots
output_folder = "./plots"

# Characters replaced with "_" when a skill name is used in a file name;
# skills such as "CI/CD" would otherwise point into a non-existent directory.
_UNSAFE_FILENAME_CHARS = '\\/:*?"<>|'


def collect_skill_counts(base_folder):
    """Count how often each skill appears in every date sub-folder.

    Each non-blank line of every ``.txt`` file is one skill occurrence;
    surrounding whitespace is stripped so "Python " and "Python" are counted
    together.

    Args:
        base_folder: Directory containing one sub-folder per date.

    Returns:
        Dict mapping each sub-folder name to a ``Counter`` of its skills.
        Empty when ``base_folder`` is not a directory.
    """
    date_skill_counts = {}
    if not os.path.isdir(base_folder):
        return date_skill_counts

    # NOTE(review): folder names are ordered as plain strings. For names like
    # "03-01-2024" this is only chronological within a limited range; the
    # date format (DD-MM vs MM-DD) is not visible here, so no date parsing is
    # applied. Confirm the format before sorting by real dates.
    for date_folder in sorted(os.listdir(base_folder)):
        folder_path = os.path.join(base_folder, date_folder)
        if not os.path.isdir(folder_path):
            continue

        skill_counter = Counter()
        for file_name in os.listdir(folder_path):
            if not file_name.endswith(".txt"):
                continue
            file_path = os.path.join(folder_path, file_name)
            with open(file_path, "r", encoding="utf-8") as file:
                skill_counter.update(line.strip() for line in file if line.strip())

        date_skill_counts[date_folder] = skill_counter
    return date_skill_counts


def build_count_frame(date_skill_counts):
    """Turn per-date skill counters into a dates-by-skills DataFrame.

    Args:
        date_skill_counts: Dict mapping a date name to a ``Counter`` of skills.

    Returns:
        DataFrame indexed by sorted date name with one column per skill
        (sorted by name); skills missing on a date are filled with 0.
    """
    all_dates = sorted(date_skill_counts)
    # Sorted columns keep the tie-breaking of ``nlargest`` stable across runs;
    # iterating a plain set of strings would not.
    all_skills = sorted({skill for counts in date_skill_counts.values() for skill in counts})
    data = {
        skill: [date_skill_counts[date].get(skill, 0) for date in all_dates]
        for skill in all_skills
    }
    return pd.DataFrame(data, index=all_dates)


def select_top_skills(df, n=3):
    """Return the names of the ``n`` skills with the highest total count."""
    if df.empty:
        return []
    total_counts = df.sum(axis=0)
    return list(total_counts.nlargest(n).index)


def skill_plot_path(output_folder, skill):
    """Return the PNG path for a skill's trend plot, with unsafe characters replaced."""
    safe_name = "".join("_" if ch in _UNSAFE_FILENAME_CHARS else ch for ch in skill)
    return os.path.join(output_folder, f"{safe_name}_trend.png")


def plot_skill_trend(df, skill, output_folder):
    """Plot one skill's count per date, save it as PNG, and display it.

    Args:
        df: DataFrame produced by ``build_count_frame``.
        skill: Column of ``df`` to plot.
        output_folder: Existing directory that receives the PNG file.

    Returns:
        Path of the saved plot.
    """
    fig = plt.figure(figsize=(8, 5))
    plt.plot(df.index, df[skill], marker="o", label=skill)

    # Add labels and legend
    plt.title(f"Trend of {skill} Over Time")
    plt.xlabel("Date")
    plt.ylabel("Count")
    plt.xticks(rotation=45)
    plt.legend(title="Skill")
    plt.grid()
    plt.tight_layout()

    # Save the plot
    plot_path = skill_plot_path(output_folder, skill)
    plt.savefig(plot_path, format="png", dpi=300)
    print(f"Saved plot for {skill} at {plot_path}")

    # Show the plot, then release it so figures do not pile up in the loop
    plt.show()
    plt.close(fig)
    return plot_path


def main():
    """Count skills per date, print the table, and plot the top 3 skills."""
    os.makedirs(output_folder, exist_ok=True)

    # Steps 1-2: Gather skill counts from the date folders
    date_skill_counts = collect_skill_counts(base_folder)

    # Step 3: Aggregate the data into a DataFrame
    df = build_count_frame(date_skill_counts)
    print(df)

    if df.empty:
        print(f"No skill data found in '{base_folder}'; nothing to plot.")
        return

    # Steps 4-5: Plot and save separate graphs for the top 3 skills
    for skill in select_top_skills(df, 3):
        plot_skill_trend(df, skill, output_folder)


if __name__ == "__main__":
    main()