Spaces:

Robzy
/

jobbert_knowledge_extraction

Sleeping

App Files Files Community

Aqsa-K committed on Jan 8

Commit

f8da2f0

1 Parent(s): 9c3e55b

embedding and graphs added

Browse files

Files changed (6) hide show

create_sample_skills.py +38 -0
embedding_gen.py +89 -0
plots/AI_trend.png +0 -0
plots/Deep Learning_trend.png +0 -0
plots/Python_trend.png +0 -0
trend_graph.py +67 -0

create_sample_skills.py ADDED Viewed

	@@ -0,0 +1,38 @@

+# Generating sample folder structure and files with multiple skills per file
+import os
+# Base folder for the structure
+base_folder = "tags"
+# Sample data: dates and skills for each date
+sample_dates = ["03-01-2024", "04-01-2024", "05-01-2024"]
+sample_skills = {
+    "03-01-2024": [
+        ["Python", "Machine Learning", "Data Analysis"],
+        ["Python", "Deep Learning"],
+        ["Data Science", "AI"]
+    ],
+    "04-01-2024": [
+        ["Python", "AI", "Data Analysis"],
+        ["Deep Learning", "Machine Learning"],
+        ["AI", "Data Engineering"]
+    ],
+    "05-01-2024": [
+        ["AI", "Machine Learning", "Python"],
+        ["Data Science", "Deep Learning"],
+        ["Python", "AI", "Cloud Computing"]
+    ]
+}
+# Create the folder structure and files
+for date in sample_dates:
+    date_folder = os.path.join(base_folder, date)
+    os.makedirs(date_folder, exist_ok=True)
+    for i, skills in enumerate(sample_skills[date], start=1):
+        file_path = os.path.join(date_folder, f"{i}.txt")
+        with open(file_path, "w", encoding="utf-8") as f:
+            f.write("\n".join(skills))
+print(f"Sample files with multiple skills per file have been generated in the '{base_folder}' folder.")

embedding_gen.py ADDED Viewed

	@@ -0,0 +1,89 @@

+import os
+from sentence_transformers import SentenceTransformer
+import numpy as np
+import umap
+import matplotlib.pyplot as plt
+import plotly.express as px
+# Step 1: Load skills from all files in a specific date folder
+def load_skills_from_date(base_folder, date):
+    date_folder = os.path.join(base_folder, date)
+    all_skills = set()  # To ensure unique skills
+    if os.path.exists(date_folder) and os.path.isdir(date_folder):
+        for file_name in os.listdir(date_folder):
+            file_path = os.path.join(date_folder, file_name)
+            if file_name.endswith(".txt"):
+                with open(file_path, 'r', encoding='utf-8') as f:
+                    all_skills.update(line.strip() for line in f if line.strip())
+    return list(all_skills)
+# Step 2: Generate embeddings using a pretrained model
+def generate_embeddings(skills, model_name="paraphrase-MiniLM-L3-v2"):
+    model = SentenceTransformer(model_name)
+    embeddings = model.encode(skills, convert_to_numpy=True)
+    return embeddings
+# Step 3: Reduce dimensionality using UMAP
+def reduce_dimensions(embeddings, n_components=2):
+    reducer = umap.UMAP(n_components=n_components, random_state=42)
+    reduced_embeddings = reducer.fit_transform(embeddings)
+    return reduced_embeddings
+# Step 4: Visualize the reduced embeddings (2D)
+def visualize_embeddings_2d(reduced_embeddings, skills, output_folder, date):
+    plt.figure(figsize=(10, 8))
+    plt.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], s=50, alpha=0.8)
+    for i, skill in enumerate(skills):
+        plt.text(reduced_embeddings[i, 0], reduced_embeddings[i, 1], skill, fontsize=9, alpha=0.75)
+    plt.title(f"UMAP Projection of Skill Embeddings ({date})")
+    plt.xlabel("UMAP Dimension 1")
+    plt.ylabel("UMAP Dimension 2")
+    # Save the plot
+    os.makedirs(output_folder, exist_ok=True)
+    plot_path = os.path.join(output_folder, f"{date}_2D_projection.png")
+    plt.savefig(plot_path, format="png", dpi=300)
+    print(f"2D plot saved at {plot_path}")
+    plt.show()
+# Step 5: Visualize the reduced embeddings (3D)
+def visualize_embeddings_3d(reduced_embeddings, skills, output_folder, date):
+    fig = px.scatter_3d(
+        x=reduced_embeddings[:, 0],
+        y=reduced_embeddings[:, 1],
+        z=reduced_embeddings[:, 2],
+        text=skills,
+        title=f"3D UMAP Projection of Skill Embeddings ({date})"
+    )
+    # Save the plot
+    os.makedirs(output_folder, exist_ok=True)
+    plot_path = os.path.join(output_folder, f"{date}_3D_projection.html")
+    fig.write_html(plot_path)
+    print(f"3D plot saved at {plot_path}")
+    fig.show()
+# Main execution
+base_folder = "./tags"
+output_folder = "./plots"
+specific_date = "03-01-2024"  # Example date folder to process
+# Load skills from the specified date folder
+skills = load_skills_from_date(base_folder, specific_date)
+if not skills:
+    print(f"No skills found for the date: {specific_date}")
+else:
+    print(f"Loaded {len(skills)} unique skills for the date: {specific_date}")
+    # Generate embeddings
+    embeddings = generate_embeddings(skills)
+    # Reduce dimensions to 2D and visualize
+    reduced_embeddings_2d = reduce_dimensions(embeddings, n_components=2)
+    visualize_embeddings_2d(reduced_embeddings_2d, skills, output_folder, specific_date)
+    # Reduce dimensions to 3D and visualize
+    reduced_embeddings_3d = reduce_dimensions(embeddings, n_components=3)
+    visualize_embeddings_3d(reduced_embeddings_3d, skills, output_folder, specific_date)

plots/AI_trend.png ADDED Viewed

plots/Deep Learning_trend.png ADDED Viewed

plots/Python_trend.png ADDED Viewed

trend_graph.py ADDED Viewed

	@@ -0,0 +1,67 @@

+import os
+import pandas as pd
+import matplotlib.pyplot as plt
+from collections import Counter
+# Path to the folder with date-wise subfolders
+base_folder = "./tags"
+# Directory to save the plots
+output_folder = "./plots"
+os.makedirs(output_folder, exist_ok=True)
+# Step 1: Initialize data structure to store skill counts
+date_skill_counts = {}
+# Step 2: Loop through the date folders
+for date_folder in sorted(os.listdir(base_folder)):
+    folder_path = os.path.join(base_folder, date_folder)
+    if os.path.isdir(folder_path):
+        # Initialize skill counter for the date
+        skill_counter = Counter()
+        # Loop through all files in the date folder
+        for file_name in os.listdir(folder_path):
+            file_path = os.path.join(folder_path, file_name)
+            if file_name.endswith(".txt"):
+                with open(file_path, "r", encoding="utf-8") as file:
+                    # Read skills from the file
+                    skills = file.read().strip().splitlines()
+                    skill_counter.update(skills)
+        # Save counts for the date
+        date_skill_counts[date_folder] = skill_counter
+# Step 3: Aggregate the data into a DataFrame
+all_dates = sorted(date_skill_counts.keys())
+all_skills = set(skill for counts in date_skill_counts.values() for skill in counts)
+data = {skill: [date_skill_counts[date].get(skill, 0) for date in all_dates] for skill in all_skills}
+df = pd.DataFrame(data, index=all_dates)
+print(df)
+# Step 4: Identify the top 3 skills
+total_counts = df.sum(axis=0)
+top_skills = total_counts.nlargest(3).index
+# Step 5: Plot and save separate graphs for the top 3 skills
+for skill in top_skills:
+    plt.figure(figsize=(8, 5))
+    plt.plot(df.index, df[skill], marker="o", label=skill)
+    # Add labels and legend
+    plt.title(f"Trend of {skill} Over Time")
+    plt.xlabel("Date")
+    plt.ylabel("Count")
+    plt.xticks(rotation=45)
+    plt.legend(title="Skill")
+    plt.grid()
+    plt.tight_layout()
+    # Save the plot
+    plot_path = os.path.join(output_folder, f"{skill}_trend.png")
+    plt.savefig(plot_path, format="png", dpi=300)
+    print(f"Saved plot for {skill} at {plot_path}")
+    # Show the plot
+    plt.show()