Spaces:

Robzy
/

jobbert_knowledge_extraction

Running

App Files Files Community

Robzy committed on Jan 8

Commit

762e05d

1 Parent(s): 6431e51

mismatch deletion

Browse files

Files changed (30) hide show

README.md +10 -1
__pycache__/embedding_gen.cpython-312.pyc +0 -0
app.py +34 -16
config.yaml +2 -2
data/tags-04-01-2025.jsonl +123 -0
demo-app.py +6 -0
demo.py +39 -0
embedding_gen.py +40 -27
filter-faults.py +33 -0
job-postings/{03-01-2024 → 03-01-2025}/1.txt +0 -0
job-postings/{03-01-2024 → 03-01-2025}/2.txt +0 -0
job-postings/{03-01-2024 → 03-01-2025}/3.txt +0 -0
job-postings/{04-01-2024 → 04-01-2025}/1.txt +0 -0
job-postings/{04-01-2024 → 04-01-2025}/2.txt +0 -0
job-postings/{04-01-2024 → 04-01-2025}/3.txt +0 -0
llm-tagging.py +53 -7
plots/03-01-2024_2D_projection.png +0 -0
plots/03-01-2024_3D_clustering.html +0 -0
plots/03-01-2024_3D_projection.html +0 -0
requirements.txt +3 -1
tag-posting.py +6 -1
tags/{03-01-2024 → 03-01-2025}/1.txt +0 -0
tags/{03-01-2024 → 03-01-2025}/2.txt +0 -0
tags/{03-01-2024 → 03-01-2025}/3.txt +0 -0
tags/{04-01-2024 → 04-01-2025}/1.txt +0 -0
tags/{04-01-2024 → 04-01-2025}/2.txt +0 -0
tags/{04-01-2024 → 04-01-2025}/3.txt +0 -0
train.py +9 -1
vectorstore/03-01-2024_embeddings.npy +0 -0
vectorstore/03-01-2024_metadata.pkl +0 -0

README.md CHANGED Viewed

@@ -31,7 +31,7 @@ Save all skills. Make a comprehensive overview by:
 1. Embed skills to a vector with an embedding model
 2. Perform clustering with HDBSCAN
 3. Visualize clustering with dimensionality reduction (UMAP)
 Inspiration: [link](https://dylancastillo.co/posts/clustering-documents-with-openai-langchain-hdbscan.html)
@@ -46,3 +46,12 @@ You should define your own project by writing at most one page description of th
 ### What to deliver
 You should deliver your project as a standalone serverless ML system. You should submit a URL for your service, a zip file containing your code, and a short report (two to three pages) about what you have done, the dataset, your method, your results, and how to run the code. I encourage you to have the README.md for your project in your GitHub repository as the report for your project.

 1. Embed skills to a vector with an embedding model
 2. Perform clustering with HDBSCAN
 3. Visualize clustering with dimensionality reduction (UMAP)
 Inspiration: [link](https://dylancastillo.co/posts/clustering-documents-with-openai-langchain-hdbscan.html)
 ### What to deliver
 You should deliver your project as a standalone serverless ML system. You should submit a URL for your service, a zip file containing your code, and a short report (two to three pages) about what you have done, the dataset, your method, your results, and how to run the code. I encourage you to have the README.md for your project in your GitHub repository as the report for your project.
+1. Scraping
+2. Tagging of job postings (JP)
+    - tag date
+3. Training
+4. Visualisation

__pycache__/embedding_gen.cpython-312.pyc ADDED Viewed

Binary file (6.41 kB). View file

app.py CHANGED Viewed

@@ -1,14 +1,16 @@
 import gradio as gr
 from transformers import pipeline
 token_skill_classifier = pipeline(model="jjzha/jobbert_skill_extraction", aggregation_strategy="first")
-token_knowledge_classifier = pipeline(model="jjzha/jobbert_knowledge_extraction", aggregation_strategy="first")
 examples = [
-        "Knowing Python is a plus",
-        "Recommend changes, develop and implement processes to ensure compliance with IFRS standards",
-        "Experience with Unreal and/or Unity and/or native IOS/Android 3D development and/or Web based 3D engines",
         ]
@@ -29,11 +31,7 @@ def aggregate_span(results):
     return new_results
 def ner(text):
-    output_skills = token_skill_classifier(text)
-    for result in output_skills:
-        if result.get("entity_group"):
-            result["entity"] = "Skill"
-            del result["entity_group"]
     output_knowledge = token_knowledge_classifier(text)
     for result in output_knowledge:
@@ -41,17 +39,37 @@ def ner(text):
             result["entity"] = "Knowledge"
             del result["entity_group"]
-    if len(output_skills) > 0:
-        output_skills = aggregate_span(output_skills)
     if len(output_knowledge) > 0:
         output_knowledge = aggregate_span(output_knowledge)
-    return {"text": text, "entities": output_skills}, {"text": text, "entities": output_knowledge}
-demo = gr.Interface(fn=ner,
-                    inputs=gr.Textbox(placeholder="Enter sentence here..."),
-                    outputs=["highlight", "highlight"],
-                    examples=examples)
 demo.launch()

 import gradio as gr
 from transformers import pipeline
+from embedding_gen import load_skills_from_date, visualize3D
+import numpy as np
+import pickle
 token_skill_classifier = pipeline(model="jjzha/jobbert_skill_extraction", aggregation_strategy="first")
+token_knowledge_classifier = pipeline(model="Robzy/jobbert_knowledge_extraction", aggregation_strategy="first")
 examples = [
+        "High proficiency in Python and AI/ML frameworks, i.e. Pytorch.",
+        "Experience with Unreal and/or Unity and/or native IOS/Android 3D development",
         ]
     return new_results
 def ner(text):
     output_knowledge = token_knowledge_classifier(text)
     for result in output_knowledge:
             result["entity"] = "Knowledge"
             del result["entity_group"]
     if len(output_knowledge) > 0:
         output_knowledge = aggregate_span(output_knowledge)
+    return {"text": text, "entities": output_knowledge}
+import plotly.express as px
+import numpy as np
+specific_date = "03-01-2024"  # Example date folder to process
+skills = load_skills_from_date('./tags', specific_date)
+embeddings = np.load(f"./vectorstore/{specific_date}_embeddings.npy")
+with open(f"./vectorstore/{specific_date}_metadata.pkl", "rb") as f:
+    metadata =   pickle.load(f)
+labels, skills = metadata["labels"], metadata["skills"]
+fig = visualize3D(embeddings, labels, skills, n_clusters=5, output_folder="./plots", date=specific_date)
+fig.update_layout(
+     height=900
+)
+with gr.Blocks() as demo:
+    gr.Interface(fn=ner,
+        inputs=gr.Textbox(placeholder="Enter sentence here..."),
+        outputs=["highlight"],
+        examples=examples,
+        title="In-demand skills in machine learning (ML) industry"
+    )
+    # gr.Markdown("Embedding visualisation of sought skills in ML job posting in Stockholm, Sweden on LinkedIn")
+    gr.Plot(fig)
 demo.launch()

config.yaml CHANGED Viewed

@@ -1,4 +1,4 @@
 training:
-    epochs: 3
-    batch_size: 16
     learning_rate: 0.00005

 training:
+    epochs: 2
+    batch_size: 32
     learning_rate: 0.00005

data/tags-04-01-2025.jsonl ADDED Viewed

	@@ -0,0 +1,123 @@

+{"tokens": ["About", "the", "job"], "tags_knowledge": ["O", "O", "O"]}
+{"tokens": ["R", "##ill", "##ion", "is", "seeking", "a", "skilled", "AI", "Engineer", "to", "join", "our", "innovative", "Data", "/", "AI", "team", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "B", "O", "O", "O", "O", "O", "B", "O", "B", "O", "O"]}
+{"tokens": ["In", "this", "role", ",", "you", "\u2019", "ll", "be", "pivotal", "in", "developing", "machine", "learning", "models", "that", "drive", "AI", "-", "powered", "AP", "(", "A", "##cco", "##unts", "Pay", "##able", ")", "automation", "and", "enhance", "our", "products", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "B", "B", "O", "O", "B", "O", "O", "B", "O", "B", "I", "I", "I", "I", "O", "B", "O", "O", "O", "O", "O"]}
+{"tokens": ["Together", ",", "we", "collaborate", "across", "the", "entire", "product", "life", "##cycle", "\u2014", "from", "brains", "##tor", "##ming", "and", "design", "to", "implementation", "\u2014", "unlock", "##ing", "AI", "\u2019", "s", "potential", "in", "AP", "automation", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "B", "I", "O", "O", "O", "O", "O", "O", "B", "O", "B", "O", "O", "O", "B", "O", "O", "O", "O", "B", "B", "O"]}
+{"tokens": ["If", "you", "\u2019", "re", "an", "experienced", "AI", "/", "M", "##L", "developer", "who", "th", "##rive", "##s", "in", "a", "dynamic", "environment", ",", "we", "\u2019", "d", "love", "to", "have", "you", "on", "board", "!"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "B", "O", "B", "I", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["Re", "##sp", "##ons", "##ibi", "##lities", ":"], "tags_knowledge": ["O", "O", "O", "O", "O", "O"]}
+{"tokens": ["The", "AI", "Engineer", "plays", "a", "key", "role", "in", "developing", "and", "implementing", "AI", "technologies", "into", "our", "Sa", "##a", "##S", "products", ",", "with", "a", "focus", "on", "technical", "execution", "rather", "than", "leadership", "."], "tags_knowledge": ["O", "B", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "B", "O", "O", "B", "I", "I", "O", "O", "O", "O", "O", "O", "B", "O", "O", "O", "O", "O"]}
+{"tokens": ["This", "role", "is", "critical", "in", "building", ",", "integrating", ",", "and", "op", "##ti", "##mizing", "AI", "/", "M", "##L", "solutions", "to", "meet", "product", "goals", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "O", "B", "I", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["Dev", "##elo", "##p", "and", "implement", "s", "##cal", "##able", "AI", "/", "M", "##L", "models", "that", "support", "product", "objectives", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "B", "O", "B", "I", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["Col", "##la", "##bor", "##ate", "closely", "with", "product", ",", "engineering", ",", "and", "data", "teams", "to", "integrate", "AI", "features", "into", "our", "products", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "B", "O", "O", "B", "O", "O", "O", "B", "O", "O", "O", "O", "O"]}
+{"tokens": ["Stay", "up", "-", "to", "-", "date", "with", "emerging", "AI", "technologies", "and", "contribute", "to", "experimentation", "and", "innovation", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "B", "B", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["B", "##uild", "and", "maintain", "effective", "AI", "/", "M", "##L", "pipeline", "##s", "and", "deployment", "infrastructure", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "B", "O", "B", "I", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["What", "You", "'", "ll", "Bring", ":"], "tags_knowledge": ["O", "O", "O", "O", "O", "O"]}
+{"tokens": ["2", "+", "years", "hands", "-", "on", "experience", "with", "putting", "self", "-", "developed", "machine", "learning", "solutions", "into", "a", "production", "environment", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "B", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["High", "pro", "##ficiency", "in", "Python", "and", "AI", "/", "M", "##L", "framework", "##s", ",", "i", ".", "e", ".", "P", "##yt", "##or", "##ch", "."], "tags_knowledge": ["O", "O", "O", "O", "B", "O", "B", "O", "B", "I", "O", "O", "O", "O", "O", "O", "O", "B", "I", "I", "I", "O"]}
+{"tokens": ["A", "curious", "minds", "##et", "with", "strong", "collaboration", "skills", "who", "is", "comfortable", "in", "environments", "without", "clear", "-", "cut", "processes", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["Master", "'", "s", "degree", "in", "engineering", "or", "similar", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "B", "O", "O", "O"]}
+{"tokens": ["Bonus", "skills", ":"], "tags_knowledge": ["O", "O", "O"]}
+{"tokens": ["Cloud", "Op", "##s", "and", "I", "##a", "##C", "tools", "such", "as", "Terra", "##form", "."], "tags_knowledge": ["B", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "I", "O"]}
+{"tokens": ["M", "##L", "##O", "##ps", "best", "practices", "and", "tools", "like", "Data", "##bri", "##cks", "."], "tags_knowledge": ["B", "I", "I", "I", "O", "O", "O", "O", "O", "B", "I", "I", "O"]}
+{"tokens": ["Knowledge", "of", "working", "with", "visually", "rich", "documents", "(", "V", "##RD", "##s", ")", "and", "genera", "##tive", "AI", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "O"]}
+{"tokens": ["Experience", "working", "with", "RA", "##G", ",", "LL", "##M", "evaluation", ",", "API", "-", "driven", "micro", "##ser", "##vice", "##s", ",", "cache", "management", "and", "production", "-", "level", "software", "."], "tags_knowledge": ["O", "O", "O", "B", "I", "O", "B", "I", "O", "O", "B", "O", "O", "B", "I", "I", "I", "O", "B", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["What", "we", "offer", ":"], "tags_knowledge": ["O", "O", "O", "O"]}
+{"tokens": ["Op", "##port", "##unity", "to", "work", "in", "a", "dynamic", "growth", "company"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["Talent", "##ed", "colleagues", "ready", "to", "support", "the", "success", "in", "your", "career", "path"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["Social", "events", "with", "your", "colleagues", "(", "breakfast", ",", "candy", "-", "time", ",", "after", "##work", "etc", ".", ")"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["A", "collection", "of", "different", "benefits", ",", "including", "a", "generous", "pension", "and", "insurance", "package"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["Hybrid", "working", "model", ",", "2", "days", "per", "week", "in", "the", "office"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["Come", "and", "enjoy", "our", "beautiful", "office", "in", "central", "Stockholm", "(", "on", "the", "14th", "floor", ",", "with", "amazing", "views", ")"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["The", "recruitment", "process", ":"], "tags_knowledge": ["O", "O", "O", "O"]}
+{"tokens": ["We", "review", "applications", "and", "invite", "for", "interviews", "continuously", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["A", "background", "check", "will", "be", "conducted", "on", "final", "candidates", ",", "pre", "-", "employment", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["About", "R", "##ill", "##ion", ":"], "tags_knowledge": ["O", "O", "O", "O", "O"]}
+{"tokens": ["We", "are", "a", "global", "company", "founded", "in", "Sweden", "with", "30", "years", "\u2019", "experience", "in", "the", "AP", "Auto", "##mation", "industry", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "I", "I", "O", "O"]}
+{"tokens": ["By", "removing", "the", "manual", "steps", "of", "in", "##vo", "##ice", "handling", ",", "we", "enable", "finance", "teams", "to", "save", "time", "and", "effort", ",", "reducing", "the", "possibility", "of", "human", "error", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["Because", "we", "\u2019", "re", "AP", "professionals", "ourselves", ",", "we", "understand", "how", "to", "give", "our", "customers", "everything", "they", "need", ",", "and", "nothing", "they", "don", "\u2019", "t", "."], "tags_knowledge": ["O", "O", "O", "O", "B", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["Together", "with", "our", "owners", "at", "Alto", "##r", ",", "we", "##\u00b4", "##re", "on", "a", "journey", "to", "expand", "in", "our", "home", "markets", ",", "while", "entering", "new", "territories", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["To", "complete", "our", "mission", ",", "we", "need", "more", "talented", "people", "!"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["R", "##ill", "##ion", "is", "an", "equal", "opportunity", "employer", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["About", "the", "job"], "tags_knowledge": ["O", "O", "O"]}
+{"tokens": ["Job", "Des", "##cription"], "tags_knowledge": ["O", "O", "O"]}
+{"tokens": ["We", "are", "on", "the", "journey", "to", "transform", "our", "digital", "capabilities", ",", "bringing", "core", "business", "processes", ",", "people", ",", "data", "&", "technology", "together", "-", "an", "enable", "##r", "for", "I", "##KE", "##A", "to", "become", "an", "even", "better", "home", "fur", "##nish", "##ing", "retailer", "in", "the", "future", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "B", "B", "O", "O", "B", "B", "B", "O", "O", "O", "O", "O", "B", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["If", "that", "sounds", "like", "you", ",", "come", "and", "join", "us", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["Together", "we", "can", "do", "great", "things", "!"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["Do", "you", "want", "to", "be", "part", "of", "making", "it", "happen", "?"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["Then", "keep", "on", "reading", "."], "tags_knowledge": ["O", "O", "O", "O", "O"]}
+{"tokens": ["The", "Team"], "tags_knowledge": ["O", "O"]}
+{"tokens": ["The", "Spa", "##tial", "Computing", "Team", "drives", "the", "digital", "innovation", "work", "in", "the", "spatial", "computing", "area", "for", "all", "I", "##KE", "##A", "companies", "."], "tags_knowledge": ["O", "B", "I", "B", "I", "O", "O", "B", "B", "O", "O", "O", "B", "B", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["We", "are", "looking", "for", "a", "Research", "Engineer", "with", "knowledge", "of", "Spa", "##tial", "Computing", "/", "X", "##R", "Development", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "I", "B", "O", "B", "I", "B", "O"]}
+{"tokens": ["In", "this", "position", "you", "will", "use", "your", "technical", "expertise", "to", "find", ",", "explore", ",", "evaluate", "and", "transfer", "innovation", "insights", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "B", "B", "O", "O", "O", "O", "O", "O", "O", "O", "B", "B", "O"]}
+{"tokens": ["You", "are", "someone", "with", "an", "innovative", "mind", "and", "lateral", "thinking", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["Someone", "who", "thinks", "virtual", "worlds", "are", "cool", "but", "helping", "real", "people", "is", "even", "better", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["You", "know", "why", "robots", "and", "pie", "go", "well", "together", "and", "why", "point", "clouds", "on", "a", "sunny", "day", "don", "'", "t", "matter", "at", "all", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "I", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["Your", "Main", "Re", "##sp", "##ons", "##ibi", "##lities", "Will", "Be", "To"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["Ex", "##p", "##lore", "and", "evaluate", "new", "technology", "and", "its", "possibility", "to", "improve", "I", "##KE", "##A", "customer", "and", "co", "-", "worker", "experience"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "B", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["Dev", "##elo", "##p", "real", "-", "time", "3D", "applications", "to", "serve", "as", "inspiring", "proof", "of", "concepts"], "tags_knowledge": ["O", "O", "O", "B", "I", "I", "B", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["Col", "##la", "##bor", "##ate", "with", "external", "experts", "and", "inn", "##ova", "##tors", "in", "exploration", "##s", "of", "technical", "solutions"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "O", "O", "B", "O"]}
+{"tokens": ["Col", "##lect", "and", "share", "expertise", "with", "I", "##KE", "##A", "stakeholders"], "tags_knowledge": ["O", "O", "O", "O", "B", "O", "O", "O", "O", "O"]}
+{"tokens": ["About", "You"], "tags_knowledge": ["O", "O"]}
+{"tokens": ["The", "person", "we", "'", "re", "looking", "for", "is", "someone", "passionate", "about", "the", "future", "of", "3D", "graphics", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "B", "B", "O"]}
+{"tokens": ["A", "person", "who", "wants", "to", "use", "game", "engine", "for", "more", "than", "just", "for", "games", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "B", "B", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["Someone", "who", "likes", "the", "idea", "of", "conceptual", "##izing", "the", "u", "##topia", "##n", "future", "of", "the", "digital", "-", "human", "interfaces", ",", "to", "interact", "with", "people", "in", "a", "more", "natural", "way", "than", "ever", "before", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "I", "B", "B", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["Someone", "who", "thinks", "Swedish", "meat", "##balls", "have", "the", "potential", "to", "taste", "even", "better", "in", "mixed", "reality", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "B", "O"]}
+{"tokens": ["We", "'", "re", "looking", "for", "a", "person", "who", "can", "break", "down", "high", "-", "level", "concepts", "like", "these", "and", "explore", "them", "single", "##hand", "##edly", "or", "as", "part", "of", "a", "team", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["To", "be", "successful", "in", "this", "role", ",", "the", "following", "knowledge", ",", "skills", "and", "experiences", "would", "be", "valuable", ":"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["Understanding", "of", "graphics", "pipeline", "##s", "related", "to", "real", "-", "time", "3D", "environments", "."], "tags_knowledge": ["O", "O", "B", "B", "O", "O", "O", "B", "I", "I", "B", "B", "O"]}
+{"tokens": ["Passion", "for", "how", "sound", "and", "audio", "design", "and", "ha", "##ptic", "##s", "can", "be", "used", "to", "el", "##eva", "##te", "im", "##mers", "##ive", "experiences"], "tags_knowledge": ["O", "O", "O", "B", "O", "B", "B", "O", "B", "I", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B"]}
+{"tokens": ["Experience", "with", "Un", "##real", "and", "/", "or", "Unity", "and", "/", "or", "native", "I", "##OS", "/", "Android", "3D", "development", "and", "/", "or", "Web", "based", "3D", "engines"], "tags_knowledge": ["O", "O", "B", "I", "O", "O", "O", "B", "O", "O", "O", "O", "O", "O", "O", "O", "B", "B", "O", "O", "O", "B", "O", "B", "B"]}
+{"tokens": ["Experience", "with", "mobile", "application", "development", "and", "deployment", "."], "tags_knowledge": ["O", "O", "B", "B", "B", "O", "B", "O"]}
+{"tokens": ["Programming", "skills", "building", "applications", "communicating", "with", "back", "-", "ends", "and", "building", "applications", "of", "interactive", "worlds", "using", "game", "engines", "and", "3D", "Graphic", "##s", "."], "tags_knowledge": ["B", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "B", "O", "B", "B", "B", "O"]}
+{"tokens": ["Good", "knowledge", "of", "at", "least", "2", "different", "core", "programming", "languages", "such", "as", "C", ",", "C", "#", ",", "Python", ",", "C", "+", "+", "or", "Java", "##Script"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "B", "B", "O", "O", "B", "O", "B", "I", "O", "B", "O", "B", "I", "O", "B", "B", "I"]}
+{"tokens": ["You", "'", "ll", "have", "a", "passion", "for", "sharing", "the", "knowledge", "you", "'", "ve", "acquired", ",", "with", "the", "ability", "to", "communicate", "with", "both", "technical", "and", "non", "-", "technical", "stakeholders", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "O", "O", "O", "O", "B", "O"]}
+{"tokens": ["A", "##bility", "to", "formula", "##te", "new", "ideas", "surrounding", "technological", "innovations", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "B", "B", "O"]}
+{"tokens": ["A", "##bility", "to", "discuss", "problems", "with", "program", "code", "examples", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "B", "B", "O", "O"]}
+{"tokens": ["Strong", "collaboration", "skills", ",", "with", "experience", "developing", "solutions", "alongside", "other", "teammates", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["Additional", "Information"], "tags_knowledge": ["O", "O"]}
+{"tokens": ["This", "role", "is", "full", "-", "time", "(", "40", "hours", "per", "week", ")", "and", "based", "in", "\u00c4", "##lm", "##hul", "##t", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["This", "role", "sits", "in", "the", "Range", "Operations", "and", "reports", "to", "Innovation", "Manager", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "B", "B", "O", "O", "O", "B", "B", "O"]}
+{"tokens": ["At", "I", "##KE", "##A", ",", "we", "are", "looking", "for", "people", "who", "believe", "everyone", "deserves", "a", "seat", "at", "the", "table", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["You", "\u2019", "re", "welcome", "no", "matter", "where", "you", "come", "from", ",", "what", "you", "believe", ",", "and", "what", "you", "look", "like", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["We", "don", "\u2019", "t", "even", "care", "how", "you", "have", "furnished", "your", "home", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["We", "\u2019", "re", "interested", "in", "you", "simply", "because", "you", "\u2019", "re", "you", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["Even", "if", "your", "experience", "doesn", "\u2019", "t", "al", "##ign", "perfectly", "with", "every", "qualification", "in", "the", "job", "description", ",", "we", "encourage", "you", "to", "apply", "anyway", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["We", "believe", "that", "people", "\u2019", "s", "different", "perspectives", ",", "backgrounds", ",", "and", "personalities", "make", "us", "better", "at", "understanding", "our", "customers", "dreams", "and", "needs", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["At", "I", "##KE", "##A", ",", "we", "\u2019", "re", "all", "on", "the", "same", "project", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["If", "you", "have", "a", "special", "need", "that", "requires", "accommodation", "in", "the", "recruitment", "process", ",", "just", "let", "us", "know", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["Please", "note", "that", "due", "to", "the", "upcoming", "holiday", "season", "in", "December", ",", "our", "recruitment", "process", "may", "take", "a", "bit", "longer", "than", "usual", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["We", "will", "do", "our", "best", "to", "keep", "all", "candidates", "updated", "on", "their", "status", "before", "any", "breaks", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["Thank", "you", "for", "your", "patience", "during", "this", "time", "!"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["Interest", "##ed", "?"], "tags_knowledge": ["O", "O", "O"]}
+{"tokens": ["Sub", "##mit", "your", "C", "##V", "and", "let", "us", "know", "why", "you", "would", "be", "a", "good", "fit", "for", "this", "role", ",", "in", "English", ",", "by", "7th", "of", "January", "202", "##5", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["About", "the", "job"], "tags_knowledge": ["O", "O", "O"]}
+{"tokens": ["Defence", "projects", ",", "unlocked", "."], "tags_knowledge": ["B", "B", "O", "O", "O"]}
+{"tokens": ["At", "Defence", "."], "tags_knowledge": ["O", "B", "O"]}
+{"tokens": ["Works", ",", "we", "match", "you", "with", "the", "industry", "'", "s", "top", "R", "&", "D", "projects", "and", "help", "you", "deliver", "impact", "for", "a", "safer", "tomorrow", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "B", "O", "O", "O", "B", "I", "I", "B", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["With", "Defence", "."], "tags_knowledge": ["O", "B", "O"]}
+{"tokens": ["Works", ",", "you", "will", "get", "access", "to", "interesting", "projects", "without", "consuming", "time", "on", "sales", "and", "project", "hunting", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "B", "O", "O", "O", "O", "O", "O", "B", "O", "O"]}
+{"tokens": ["Our", "platform", "allows", "us", "to", "keep", "our", "organization", "structure", "thin", ",", "which", "means", "less", "internal", "costs", ",", "more", "value", "to", "customers", ",", "and", "higher", "rates", "for", "you", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["As", "an", "Art", "##ific", "##ial", "Intelligence", "Engineer", "you", "will", "join", "our", "trusted", "network", "to", "work", "on", "cutting", "-", "edge", "R", "&", "D", "projects", "in", "the", "defense", "sector", ",", "contributing", "to", "innovative", "solutions", "that", "enhance", "safety", "and", "security", "for", "a", "safer", "tomorrow", "."], "tags_knowledge": ["O", "O", "B", "I", "I", "I", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "I", "I", "B", "O", "O", "B", "B", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["You", "will", "bring", "deep", "expertise", "in", "machine", "learning", "and", "artificial", "intelligence", ",", "ideal", "##ly", "with", "prior", "experience", "in", "developing", "autonomous", "products", ",", "radar", "systems", ",", "or", "applications", "that", "heavily", "leverage", "AI", "capabilities", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "B", "I", "O", "B", "I", "O", "O", "O", "O", "O", "O", "O", "O", "B", "O", "O", "B", "O", "O", "O", "O", "O", "O", "O", "B", "O", "O"]}
+{"tokens": ["The", "ideal", "candidate", "will", "have", "a", "proven", "track", "record", "of", "building", "secure", ",", "high", "-", "performing", "solutions", "designed", "to", "meet", "the", "defense", "industry", "'", "s", "rigorous", "standards", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "B", "O", "O", "O", "O", "O"]}
+{"tokens": ["Your", "presence", "can", "be", "in", "Northern", "Europe", "(", "Finland", ",", "Sweden", ",", "Norway", ",", "Denmark", ")", ",", "United", "Kingdom", ",", "or", "Germany", "and", "you", "are", "eligible", "to", "pass", "security", "-", "clearance", "before", "starting", "on", "a", "project", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["Please", "notice", "that", "consultant", "##s", "might", "need", "to", "pass", "security", "clearance", "before", "starting", "on", "a", "project", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["Re", "##sp", "##ons", "##ibi", "##lities", ":"], "tags_knowledge": ["O", "O", "O", "O", "O", "O"]}
+{"tokens": ["Design", ",", "implement", ",", "and", "op", "##ti", "##mize", "AI", "models", "and", "algorithms", "for", "defense", "sector", "applications", ",", "including", "autonomous", "systems", ",", "radar", "##s", ",", "and", "AI", "-", "driven", "decision", "-", "making", "tools", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "B", "O", "O", "O", "O", "B", "B", "O", "O", "O", "B", "O", "O", "B", "O", "O", "O", "B", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["Dev", "##elo", "##p", "secure", ",", "high", "-", "performance", "software", "solutions", "tailored", "to", "meet", "the", "string", "##ent", "requirements", "of", "defense", "industry", "projects", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "B", "B", "O"]}
+{"tokens": ["I", "##dent", "##ify", "and", "address", "v", "##ul", "##ner", "##abi", "##lities", "in", "AI", "systems", ",", "ensuring", "robust", "c", "##y", "##bers", "##ec", "##urity", "measures", "are", "integrated", "throughout", "the", "development", "life", "##cycle", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["Col", "##la", "##bor", "##ate", "with", "cross", "-", "functional", "teams", "to", "al", "##ign", "AI", "capabilities", "with", "overall", "system", "architecture", "##s", ",", "ensuring", "sea", "##m", "##less", "integration", "and", "performance", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["Con", "##duct", "rigorous", "testing", ",", "valid", "##ation", ",", "and", "documentation", "of", "AI", "solutions", "to", "ensure", "reliability", ",", "s", "##cal", "##ability", ",", "and", "compliance", "with", "industry", "standards", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["Qualification", "##s", ":"], "tags_knowledge": ["O", "O", "O"]}
+{"tokens": ["Master", "'", "s", "or", "Ph", ".", "D", ".", "in", "Computer", "Science", ",", "Machine", "Learning", ",", "or", "related", "field"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "I", "O", "B", "I", "O", "O", "O", "O"]}
+{"tokens": ["Strong", "skills", "in", "Pat", "##tern", "Re", "##cognition", ",", "N", "##eur", "##al", "Networks", ",", "and", "Al", "##gor", "##ith", "##ms", "."], "tags_knowledge": ["O", "O", "O", "B", "I", "I", "I", "O", "B", "I", "I", "I", "O", "O", "B", "I", "I", "I", "O"]}
+{"tokens": ["Experience", "from", "developing", "products", "which", "rely", "on", "AI", "/", "M", "##L", "capabilities", "such", "as", "autonomous", "systems", ",", "radar", "technologies", ",", "or", "other", "AI", "-", "re", "##lian", "##t", "defense", "applications", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "B", "O", "B", "I", "O", "O", "O", "B", "O", "O", "B", "O", "O", "O", "O", "B", "O", "O", "O", "O", "B", "O", "O"]}
+{"tokens": ["F", "##ami", "##lia", "##rity", "with", "secure", "system", "design", ",", "c", "##y", "##bers", "##ec", "##urity", "principles", ",", "and", "compliance", "with", "defense", "-", "specific", "regulations", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "O", "O", "O", "O"]}
+{"tokens": ["Additional", "Re", "##quire", "##ments", ":"], "tags_knowledge": ["O", "O", "O", "O", "O"]}
+{"tokens": ["Security", "certification", "##s", "or", "training", "(", "e", ".", "g", ".", ",", "C", "##IS", "##SP", ",", "CE", "##H", ")", "are", "highly", "desirable", "."], "tags_knowledge": ["B", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "I", "I", "O", "B", "I", "O", "O", "O", "O", "O"]}
+{"tokens": ["Next", "steps", ":"], "tags_knowledge": ["O", "O", "O"]}
+{"tokens": ["Press", "\"", "A", "##pp", "##ly", "\""], "tags_knowledge": ["O", "O", "O", "O", "O", "O"]}
+{"tokens": ["Sub", "##mit", "your", "resume", "on", "Has", "##hl", "##ist", "platform", "(", "takes", "5", "minutes", ")", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "B", "I", "I", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["Has", "##hl", "##ist", "is", "our", "parental", "company", "which", "infrastructure", "we", "are", "utilizing", "to", "manage", "the", "profiles"], "tags_knowledge": ["B", "I", "I", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["We", "will", "review", "your", "profile", "and", "if", "your", "profile", "matches", "our", "criteria"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["We", "call", "you", "and", "make", "sure", "that", "your", "expectations", "and", "career", "ambitions", "are", "aligned", "with", "our", "offer", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
+{"tokens": ["You", "will", "get", "accepted", "to", "our", "trusted", "network", "of", "partners", "and", "our", "clients", "and", "receive", "offers", "from", "different", "clients", "who", "are", "looking", "for", "your", "expertise"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}

demo-app.py ADDED Viewed

	@@ -0,0 +1,6 @@

+import plotly.express as px
+import numpy as np
# Smoke test: scatter ten random integer points (coordinates in [0, 10)) in 3D.
X = np.random.randint(0, 10, (10, 3))
x_vals, y_vals, z_vals = X.T  # one row of X.T per coordinate axis
fig = px.scatter_3d(x=x_vals, y=y_vals, z=z_vals)
fig.show()

demo.py ADDED Viewed

	@@ -0,0 +1,39 @@

+import gradio as gr
+import plotly.graph_objects as go
+import numpy as np
def create_3d_plot(x_range, y_range):
    """Build a 3D surface figure of z = sin(sqrt(x^2 + y^2)).

    The surface is sampled on a 100 x 100 grid spanning
    [-x_range, x_range] along x and [-y_range, y_range] along y.

    Args:
        x_range: Half-width of the sampled interval on the x axis.
        y_range: Half-width of the sampled interval on the y axis.

    Returns:
        The plotly figure holding the surface trace.
    """
    # Sample both axes and expand them into a 2D grid.
    x_axis = np.linspace(-x_range, x_range, 100)
    y_axis = np.linspace(-y_range, y_range, 100)
    grid_x, grid_y = np.meshgrid(x_axis, y_axis)

    # Radially symmetric ripple around the origin.
    radius = np.sqrt(grid_x**2 + grid_y**2)
    height_map = np.sin(radius)

    surface = go.Surface(z=height_map, x=grid_x, y=grid_y)
    axis_titles = dict(
        xaxis_title='X Axis',
        yaxis_title='Y Axis',
        zaxis_title='Z Axis'
    )

    fig = go.Figure(data=[surface])
    fig.update_layout(
        scene=axis_titles,
        margin=dict(l=0, r=0, b=0, t=0),
        height=1000
    )
    return fig
# Gradio interface: two range sliders driving a 3D surface plot.
with gr.Blocks() as demo:
    gr.Markdown("## Interactive 3D Plot with Gradio and Plotly")

    with gr.Row():
        x_slider = gr.Slider(minimum=1, maximum=10, step=1, value=5, label="X Range")
        y_slider = gr.Slider(minimum=1, maximum=10, step=1, value=5, label="Y Range")

    plot_output = gr.Plot(label="3D Surface Plot")

    # Moving either slider redraws the plot from the current value of both.
    for slider in (x_slider, y_slider):
        slider.change(create_3d_plot, inputs=[x_slider, y_slider], outputs=plot_output)

# Launch the app
demo.launch()

embedding_gen.py CHANGED Viewed

@@ -4,6 +4,8 @@ import numpy as np
 import umap
 import matplotlib.pyplot as plt
 import plotly.express as px
 # Step 1: Load skills from all files in a specific date folder
 def load_skills_from_date(base_folder, date):
@@ -65,9 +67,7 @@ def visualize_embeddings_3d(reduced_embeddings, skills, output_folder, date):
     fig.show()
-def perform_kmeans_and_visualize(reduced_embeddings, skills, n_clusters, output_folder, date):
-    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
-    labels = kmeans.fit_predict(reduced_embeddings)
     fig = px.scatter_3d(
         x=reduced_embeddings[:, 0],
@@ -84,30 +84,43 @@ def perform_kmeans_and_visualize(reduced_embeddings, skills, n_clusters, output_
     fig.write_html(plot_path)
     print(f"3D clustered plot saved at {plot_path}")
-    fig.show()
-# Main execution
-base_folder = "./tags"
-output_folder = "./plots"
-specific_date = "03-01-2024"  # Example date folder to process
-# Load skills from the specified date folder
-skills = load_skills_from_date(base_folder, specific_date)
-if not skills:
-    print(f"No skills found for the date: {specific_date}")
-else:
-    print(f"Loaded {len(skills)} unique skills for the date: {specific_date}")
-    # Generate embeddings
-    embeddings = generate_embeddings(skills)
-    # Reduce dimensions to 2D and visualize
-    reduced_embeddings_2d = reduce_dimensions(embeddings, n_components=2)
-    visualize_embeddings_2d(reduced_embeddings_2d, skills, output_folder, specific_date)
-    # Reduce dimensions to 3D and visualize
-    reduced_embeddings_3d = reduce_dimensions(embeddings, n_components=3)
-    visualize_embeddings_3d(reduced_embeddings_3d, skills, output_folder, specific_date)
-    # Perform KMeans clustering and visualize in 3D
-    perform_kmeans_and_visualize(reduced_embeddings_3d, skills, n_clusters, output_folder, specific_date)

 import umap
 import matplotlib.pyplot as plt
 import plotly.express as px
+from sklearn.cluster import KMeans
+import pickle
 # Step 1: Load skills from all files in a specific date folder
 def load_skills_from_date(base_folder, date):
     fig.show()
+def visualize3D(reduced_embeddings, labels, skills, n_clusters, output_folder, date):
     fig = px.scatter_3d(
         x=reduced_embeddings[:, 0],
     fig.write_html(plot_path)
     print(f"3D clustered plot saved at {plot_path}")
+    # fig.show()
+    return fig
if __name__ == "__main__":
    # Pipeline: load skills -> embed -> reduce to 3D -> cluster -> plot -> persist.
    base_folder = "./tags"
    output_folder = "./plots"
    vector_store = "./vectorstore"
    # The tag folders were renamed from DD-MM-2024 to DD-MM-2025
    # (e.g. ./tags/03-01-2025); the old "03-01-2024" value no longer
    # matches any folder.
    specific_date = "03-01-2025"
    n_clusters = 5

    # Load skills from the specified date folder
    skills = load_skills_from_date(base_folder, specific_date)

    if not skills:
        print(f"No skills found for the date: {specific_date}")
    else:
        print(f"Loaded {len(skills)} unique skills for the date: {specific_date}")

        # Generate embeddings
        embeddings = generate_embeddings(skills)

        # Optional 2D projection, currently disabled:
        # reduced_embeddings_2d = reduce_dimensions(embeddings, n_components=2)
        # visualize_embeddings_2d(reduced_embeddings_2d, skills, output_folder, specific_date)

        # Reduce dimensions to 3D, cluster, and visualize
        reduced_embeddings_3d = reduce_dimensions(embeddings, n_components=3)

        # KMeans raises when asked for more clusters than there are samples,
        # so cap the cluster count for small skill sets.
        n_clusters = min(n_clusters, len(reduced_embeddings_3d))
        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        labels = kmeans.fit_predict(reduced_embeddings_3d)

        visualize3D(reduced_embeddings_3d, labels, skills, n_clusters, output_folder, specific_date)

        # Save the reduced embeddings and metadata. The target directory is
        # created first because np.save/open do not create missing parents.
        os.makedirs(vector_store, exist_ok=True)
        np.save(os.path.join(vector_store, f"{specific_date}_embeddings.npy"), reduced_embeddings_3d)
        with open(os.path.join(vector_store, f"{specific_date}_metadata.pkl"), 'wb') as f:
            pickle.dump({'labels': labels, 'skills': skills}, f)

filter-faults.py ADDED Viewed

	@@ -0,0 +1,33 @@

+import json
def count_mismatch(file_path):
    """Count JSONL records whose token and tag lists differ in length.

    Every non-blank line of the file must be a JSON object with the keys
    ``tokens`` and ``tags_knowledge``. Blank lines are ignored rather than
    handed to the JSON parser.

    Args:
        file_path: Path to the JSON Lines file to inspect.

    Returns:
        The number of records where ``len(tokens) != len(tags_knowledge)``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``tokens`` or ``tags_knowledge``.
    """
    mismatches = 0
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                continue  # a stray empty line is not a record
            data = json.loads(line)
            if len(data['tokens']) != len(data['tags_knowledge']):
                mismatches += 1
    return mismatches
def delete_mismatched_lines(file_path):
    """Rewrite a JSONL file in place, keeping only length-consistent records.

    A record is kept when its ``tokens`` and ``tags_knowledge`` lists have
    the same length. Blank lines are dropped. Kept lines are written back
    unchanged and in their original order.

    All lines are parsed *before* the file is reopened for writing, so a
    malformed line raises while the file on disk is still untouched.

    Args:
        file_path: Path to the JSON Lines file to clean.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``tokens`` or ``tags_knowledge``.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    kept = []
    for line in lines:
        if not line.strip():
            continue  # a stray empty line is not a record
        data = json.loads(line)
        if len(data['tokens']) == len(data['tags_knowledge']):
            kept.append(line)

    # Only now truncate the file: everything above succeeded.
    with open(file_path, 'w', encoding='utf-8') as file:
        file.writelines(kept)
if __name__ == "__main__":
    # Clean the tagged dataset: drop records whose token/tag lengths disagree
    # and report how many were removed. Nothing is rewritten or printed when
    # the file is already consistent.
    file_path = 'data/tags-04-01-2025.jsonl'

    count = count_mismatch(file_path)
    if count > 0:
        delete_mismatched_lines(file_path)
        print(f"Deleted {count} mismatched lines.")

job-postings/{03-01-2024 → 03-01-2025}/1.txt RENAMED Viewed

File without changes

job-postings/{03-01-2024 → 03-01-2025}/2.txt RENAMED Viewed

File without changes

job-postings/{03-01-2024 → 03-01-2025}/3.txt RENAMED Viewed

File without changes

job-postings/{04-01-2024 → 04-01-2025}/1.txt RENAMED Viewed

File without changes

job-postings/{04-01-2024 → 04-01-2025}/2.txt RENAMED Viewed

File without changes

job-postings/{04-01-2024 → 04-01-2025}/3.txt RENAMED Viewed

File without changes

llm-tagging.py CHANGED Viewed

@@ -16,6 +16,10 @@ from tabulate import tabulate
 import spacy
 import re
 import json
 load_dotenv(".env")
 nlp = spacy.load("en_core_web_sm")
@@ -29,8 +33,9 @@ def split_text_recursively(text):
 def tokenize_to_sent(path):
-    # Read the file
     with open(path, 'r') as file:
         text = file.read()
@@ -47,6 +52,8 @@ def tokenize_to_sent(path):
         doc = nlp(line)
         for sent in doc.sents:
             sents.append(sent.text)
     return sents
@@ -92,14 +99,23 @@ prompt = PromptTemplate(
                        "knowledge_definition": knowledge_definition},
 )
-def extract_tags(text: str, tokenize = True) -> Results:
     if tokenize:
-        tokens = [tokenizer.tokenize(t) for t in text]
     prompt_and_model = prompt | model
     output = prompt_and_model.invoke({"input": tokens})
     output = parser.invoke(output)
     return tokens, output
@@ -111,13 +127,43 @@ def tag_posting(job_path, output_path):
     # LLM-based tag extraction
     tokens, output = extract_tags(sents, tokenize=True)
-    with open("./data/data.jsonl", "w") as file:
         for entry in output['results']:
             json.dump(entry, file)
             file.write("\n")
 if __name__ == "__main__":
-    job_path = './job-postings/03-01-2024/1.txt'
-    output_path = './data/data.json'
-    tag_posting(job_path, output_path)

 import spacy
 import re
 import json
+from datetime import datetime
+from tqdm import tqdm
+import time
 load_dotenv(".env")
 nlp = spacy.load("en_core_web_sm")
 def tokenize_to_sent(path):
+    print(f"Tokenizing {path} to sentences...")
+    # Read the file
     with open(path, 'r') as file:
         text = file.read()
         doc = nlp(line)
         for sent in doc.sents:
             sents.append(sent.text)
+    print(f"Tokenization completed. {len(sents)} sentences found.")
     return sents
                        "knowledge_definition": knowledge_definition},
 )
def extract_tags(sents: "list[str]", tokenize=True) -> tuple:
    """Run the LLM tagging chain over the sentences of one job posting.

    Args:
        sents: Sentences of the posting. When ``tokenize`` is false the
            items are passed to the model as they are, so they are expected
            to be tokenized already.
        tokenize: If true, each sentence is first split with the
            module-level ``tokenizer``.

    Returns:
        A ``(tokens, output)`` tuple: the model input and the parsed model
        output produced by ``parser``.
    """
    print("Extracting tags...")
    start_time = time.time()

    if tokenize:
        print(f"Tokenizing {len(sents)} sentences...")
        tokens = [tokenizer.tokenize(t) for t in sents]
    else:
        # Previously `tokens` was left unbound on this path, so calling with
        # tokenize=False failed with a NameError before reaching the model.
        tokens = sents

    prompt_and_model = prompt | model
    output = prompt_and_model.invoke({"input": tokens})
    output = parser.invoke(output)

    time_taken = time.time() - start_time
    print(f"Tags extracted in {time_taken} seconds.")
    return tokens, output
     # LLM-based tag extraction
     tokens, output = extract_tags(sents, tokenize=True)
+    with open(output_path, "w") as file:
         for entry in output['results']:
             json.dump(entry, file)
             file.write("\n")
def tag_all_today(date="04-01-2025", max_jobs=3):
    """Tag the job postings stored under ``./job-postings/<date>``.

    Each posting is split into sentences and tagged by the LLM; the entries
    of ``output['results']`` are appended as JSON lines to
    ``./data/tags-<date>.jsonl``.

    Args:
        date: Folder name (``DD-MM-YYYY``) to process. Defaults to the
            snapshot date this function was previously hard-coded to; pass
            ``datetime.today().strftime('%d-%m-%Y')`` to process the
            current day's postings.
        max_jobs: Maximum number of postings to tag (the earlier hard-coded
            cap was three). ``None`` removes the cap.
    """
    jobs_dir = f'./job-postings/{date}'
    output_path = f'./data/tags-{date}.jsonl'

    # os.listdir returns entries in arbitrary order; sorting makes the capped
    # subset the same on every run.
    jobs = sorted(os.listdir(jobs_dir))
    if max_jobs is not None:
        jobs = jobs[:max_jobs]

    for job in tqdm(jobs, desc="Tagging job postings"):
        job_path = f'{jobs_dir}/{job}'

        # Reading & sentence tokenization
        sents = tokenize_to_sent(job_path)

        # LLM-based tag extraction
        _, output = extract_tags(sents, tokenize=True)

        # Opened in append mode once per posting: entries already written are
        # kept if a later posting fails. Re-running adds to the same file.
        with open(output_path, "a") as file:
            for entry in output['results']:
                json.dump(entry, file)
                file.write("\n")

    print(f"Tagging completed. Output saved to {output_path}")


if __name__ == "__main__":
    tag_all_today()

plots/03-01-2024_2D_projection.png ADDED Viewed

plots/03-01-2024_3D_clustering.html ADDED Viewed

The diff for this file is too large to render. See raw diff

plots/03-01-2024_3D_projection.html ADDED Viewed

The diff for this file is too large to render. See raw diff

requirements.txt CHANGED Viewed

@@ -6,4 +6,6 @@ idna
 langchain_openai
 python-dotenv
 torch
-spacy

 langchain_openai
 python-dotenv
 torch
+spacy
+umap-learn
+plotly

tag-posting.py CHANGED Viewed

@@ -215,6 +215,10 @@ def backfill():
             print(f"Saved skills to: {tag_path}")
 if __name__ == '__main__':
     # Backfill
@@ -224,4 +228,5 @@ if __name__ == '__main__':
     # path = './job-postings/03-01-2024/2.txt'
     # sents = parse_post(path)
     # skills = extract_skills(sents)
-    # skills_save('./tags/03-01-2024/2.txt',skills)

             print(f"Saved skills to: {tag_path}")
def tag_date():
    """Placeholder with no implementation yet; calling it does nothing."""
    return None
 if __name__ == '__main__':
     # Backfill
     # path = './job-postings/03-01-2024/2.txt'
     # sents = parse_post(path)
     # skills = extract_skills(sents)
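+    # skills_save('./tags/03-01-2024/2.txt',skills)
+    # NOTE: a RAPID_API_KEY value had been pasted onto the end of the line above; it is redacted here. Revoke that key and read it from the environment instead of committing it.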
+1

tags/{03-01-2024 → 03-01-2025}/1.txt RENAMED Viewed

File without changes

tags/{03-01-2024 → 03-01-2025}/2.txt RENAMED Viewed

File without changes

tags/{03-01-2024 → 03-01-2025}/3.txt RENAMED Viewed

File without changes

tags/{04-01-2024 → 04-01-2025}/1.txt RENAMED Viewed

File without changes

tags/{04-01-2024 → 04-01-2025}/2.txt RENAMED Viewed

File without changes

tags/{04-01-2024 → 04-01-2025}/3.txt RENAMED Viewed

File without changes

train.py CHANGED Viewed

@@ -157,6 +157,14 @@ def train(json_path: str):
     # Log the artifact to W&B
     wandb.log_artifact(artifact)
 if __name__ == "__main__":
-    train(json_path="./data/data.jsonl")

     # Log the artifact to W&B
     wandb.log_artifact(artifact)
def train_today(date="04-01-2025"):
    """Train on the tagged dataset ``data/tags-<date>.jsonl``.

    The path is resolved against the current working directory and handed
    to ``train``.

    Args:
        date: Date stamp (``DD-MM-YYYY``) of the tag file to train on.
            Defaults to the snapshot date this function was previously
            hard-coded to; the earlier ``datetime.today()`` value was
            overwritten before use and has been removed.
    """
    json_path = os.path.join(os.getcwd(), f'data/tags-{date}.jsonl')
    print(f"Training on {json_path}")
    train(json_path=json_path)


if __name__ == "__main__":
    train_today()

vectorstore/03-01-2024_embeddings.npy ADDED Viewed

Binary file (860 Bytes). View file

vectorstore/03-01-2024_metadata.pkl ADDED Viewed

Binary file (1.28 kB). View file