Spaces:

forestav
/

jobsai

Running

App Files Files Community

forestav committed on Jan 8

Commit

ba6957d

1 Parent(s): af91fc8

update

Browse files

Files changed (8) hide show

feature_pipeline.ipynb +282 -0
feedback.db +0 -0
gradioapp.py +40 -46
modeltrain.py +0 -102
pinecone_handler.py +2 -1
settings.py +2 -1
app.py → streamlit_app.py +0 -0
training_pipeline.ipynb +641 -0

feature_pipeline.ipynb ADDED Viewed

	@@ -0,0 +1,282 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import hopsworks\n",
+    "import os\n",
+    "import re\n",
+    "from dotenv import load_dotenv"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "2025-01-08 19:51:38,754 INFO: Closing external client and cleaning up certificates.\n",
+      "Connection closed.\n",
+      "2025-01-08 19:51:38,758 INFO: Initializing external client\n",
+      "2025-01-08 19:51:38,758 INFO: Base URL: https://c.app.hopsworks.ai:443\n",
+      "2025-01-08 19:51:39,828 INFO: Python Engine initialized.\n",
+      "\n",
+      "Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1158296\n"
+     ]
+    }
+   ],
+   "source": [
+    "load_dotenv()\n",
+    "\n",
+    "api_key = os.getenv(\"HOPSWORKS_API_KEY\")\n",
+    "project = hopsworks.login(project=\"orestavf\", api_key_value=api_key)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fs = project.get_feature_store()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Retrieve feature groups\n",
+    "feedback_fg = fs.get_feature_group(name=\"job_feedback\", version=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.93s) \n"
+     ]
+    }
+   ],
+   "source": [
+    "feedback_df = feedback_fg.read()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>job_id</th>\n",
+       "      <th>resume_text</th>\n",
+       "      <th>job_headline</th>\n",
+       "      <th>job_occupation</th>\n",
+       "      <th>job_description</th>\n",
+       "      <th>is_relevant</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>29321628</td>\n",
+       "      <td>Filip Orestav  \\nTransformatorvägen 6, Sollent...</td>\n",
+       "      <td>Junior Projektadmin till talangprogram på AFRY...</td>\n",
+       "      <td>Projektledare, bygg och anläggning</td>\n",
+       "      <td>Vill du kickstarta din karriär hos en av Sveri...</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "     job_id                                        resume_text  \\\n",
+       "0  29321628  Filip Orestav  \\nTransformatorvägen 6, Sollent...   \n",
+       "\n",
+       "                                        job_headline  \\\n",
+       "0  Junior Projektadmin till talangprogram på AFRY...   \n",
+       "\n",
+       "                       job_occupation  \\\n",
+       "0  Projektledare, bygg och anläggning   \n",
+       "\n",
+       "                                     job_description  is_relevant  \n",
+       "0  Vill du kickstarta din karriär hos en av Sveri...         True  "
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "feedback_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Columns to preprocess\n",
+    "columns_to_process = ['resume_text', 'job_headline', 'job_occupation', 'job_description']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Define preprocessing functions\n",
+    "def preprocess_text(text):\n",
+    "    \"\"\"Normalize free text: lowercase, replace punctuation with spaces, collapse whitespace.\n",
+    "\n",
+    "    Punctuation and symbols are replaced by a space instead of being deleted, so\n",
+    "    neighbouring words such as 'bygg/anläggning' or 'python,sql' stay separate tokens.\n",
+    "    All Unicode letters and digits are kept, so accented words such as 'idé' survive.\n",
+    "\n",
+    "    Args:\n",
+    "        text: A dataframe cell value; anything that is not a str is returned unchanged.\n",
+    "\n",
+    "    Returns:\n",
+    "        The cleaned string, or the original value for non-string input (e.g. None/NaN).\n",
+    "    \"\"\"\n",
+    "    if not isinstance(text, str):\n",
+    "        return text\n",
+    "    text = text.lower()\n",
+    "    # \\w is Unicode-aware for str patterns; it also matches '_', which is stripped explicitly.\n",
+    "    text = re.sub(r\"[^\\w\\s]|_\", \" \", text)\n",
+    "    # Collapse the whitespace runs created above and trim both ends.\n",
+    "    return re.sub(r\"\\s+\", \" \", text).strip()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "2025-01-08 18:38:35,968 WARNING: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.\n",
+      "\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>job_id</th>\n",
+       "      <th>resume_text</th>\n",
+       "      <th>job_headline</th>\n",
+       "      <th>job_occupation</th>\n",
+       "      <th>job_description</th>\n",
+       "      <th>is_relevant</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>29321628</td>\n",
+       "      <td>filip orestav transformatorvägen 6 sollentuna ...</td>\n",
+       "      <td>junior projektadmin till talangprogram på afry...</td>\n",
+       "      <td>projektledare bygg och anläggning</td>\n",
+       "      <td>vill du kickstarta din karriär hos en av sveri...</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "     job_id                                        resume_text  \\\n",
+       "0  29321628  filip orestav transformatorvägen 6 sollentuna ...   \n",
+       "\n",
+       "                                        job_headline  \\\n",
+       "0  junior projektadmin till talangprogram på afry...   \n",
+       "\n",
+       "                      job_occupation  \\\n",
+       "0  projektledare bygg och anläggning   \n",
+       "\n",
+       "                                     job_description  is_relevant  \n",
+       "0  vill du kickstarta din karriär hos en av sveri...         True  "
+      ]
+     },
+     "execution_count": 28,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Apply preprocessing column by column. Series.map works on every pandas version,\n",
+    "# whereas DataFrame.applymap is deprecated (FutureWarning) in favour of DataFrame.map.\n",
+    "feedback_df[columns_to_process] = feedback_df[columns_to_process].apply(\n",
+    "    lambda column: column.map(preprocess_text)\n",
+    ")\n",
+    "\n",
+    "# Display processed dataframe\n",
+    "feedback_df.head()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

feedback.db DELETED Viewed

Binary file (61.4 kB)

gradioapp.py CHANGED Viewed

@@ -7,50 +7,43 @@ from pinecone_handler import PineconeHandler
 from datetime import datetime
 import sqlite3
 import threading
 class Database:
-    def __init__(self, db_name="feedback.db"):
-        self.db_name = db_name
-        self.thread_local = threading.local()
-        self._create_tables()
-    def get_connection(self):
-        if not hasattr(self.thread_local, "connection"):
-            self.thread_local.connection = sqlite3.connect(self.db_name)
-        return self.thread_local.connection
-    def _create_tables(self):
-        conn = sqlite3.connect(self.db_name)
-        cursor = conn.cursor()
-        cursor.execute('''
-        CREATE TABLE IF NOT EXISTS feedback (
-            id INTEGER PRIMARY KEY AUTOINCREMENT,
-            job_id TEXT,
-            resume_text TEXT,
-            job_headline TEXT,
-            job_occupation TEXT,
-            job_description TEXT,
-            is_relevant BOOLEAN,
-            timestamp DATETIME DEFAULT CURRENT_TIMESTAMP
         )
-        ''')
-        conn.commit()
-        conn.close()
-    def save_feedback(self, job_id: str, resume_text: str, headline: str,
-                     occupation: str, description: str, is_relevant: bool):
-        conn = self.get_connection()
-        cursor = conn.cursor()
-        try:
-            cursor.execute('''
-            INSERT INTO feedback
-            (job_id, resume_text, job_headline, job_occupation, job_description, is_relevant)
-            VALUES (?, ?, ?, ?, ?, ?)
-            ''', (job_id, resume_text, headline, occupation, description, is_relevant))
-            conn.commit()
-        except Exception as e:
-            conn.rollback()
-            raise e
 def extract_text(file) -> Optional[str]:
     """Extract text from uploaded resume file"""
@@ -121,12 +114,12 @@ class JobMatcher:
         try:
             # Find the job in current results by Pinecone ID
             job = next((job for job in self.current_results if job['id'] == pinecone_id), None)
             if not job:
                 return "Error: Job not found"
             metadata = job['metadata']
             self.db.save_feedback(
                 job_id=pinecone_id,  # Use Pinecone's ID
                 resume_text=self.current_resume_text,
@@ -135,10 +128,11 @@ class JobMatcher:
                 description=metadata['description'],
                 is_relevant=is_relevant
             )
-            return f"✓ Feedback saved for '{metadata['headline']}'"
         except Exception as e:
             return f"Error saving feedback: {str(e)}"
 def create_interface():
     matcher = JobMatcher()
@@ -258,4 +252,4 @@ def create_interface():
 if __name__ == "__main__":
     interface = create_interface()
-    interface.launch()

 from datetime import datetime
 import sqlite3
 import threading
+import hopsworks
+import pandas as pd
+import os
+from dotenv import load_dotenv
+load_dotenv()
 class Database:
+    # Stores user feedback on job matches in the Hopsworks feature group
+    # "job_feedback" (version 1).
+    def __init__(self):
+        # Initialize Hopsworks: log in, fetch the feature store, then get or
+        # create the feedback feature group.
+        project = "orestavf"
+        # os.getenv returns None when HOPSWORKS_API_KEY is unset; that value is
+        # passed to hopsworks.login as-is.
+        # NOTE(review): confirm how login behaves without a key when running headless.
+        api_key = os.getenv("HOPSWORKS_API_KEY")
+        self.project = hopsworks.login(project=project, api_key_value=api_key)
+        self.fs = self.project.get_feature_store()
+        # NOTE(review): the primary key is job_id alone, so feedback given on the
+        # same job for different resumes presumably shares one key -- confirm
+        # whether later rows replace earlier ones.
+        self.feedback_fg = self.fs.get_or_create_feature_group(
+            name="job_feedback",
+            version=1,
+            primary_key=["job_id"],
+            description="Feature group for storing user feedback on job matches.",
+            online_enabled=True
         )
+    # Inserts one feedback record into the feature group and prints a
+    # confirmation. There is no error handling here, so exceptions raised by
+    # insert() propagate to the caller.
+    def save_feedback(self, job_id: str, resume_text: str, headline: str,
+                    occupation: str, description: str, is_relevant: bool):
+        # Prepare feedback data as a one-row pandas DataFrame; the keys are the
+        # column names sent to the feature group.
+        feedback_data = pd.DataFrame([{
+            "job_id": job_id,
+            "resume_text": resume_text,
+            "job_headline": headline,
+            "job_occupation": occupation,
+            "job_description": description,
+            "is_relevant": is_relevant,
+            #"timestamp": datetime.now()
+        }])
+        self.feedback_fg.insert(feedback_data)
+        print(f"Feedback saved to Hopsworks for job ID: {job_id}")
 def extract_text(file) -> Optional[str]:
     """Extract text from uploaded resume file"""
         try:
             # Find the job in current results by Pinecone ID
             job = next((job for job in self.current_results if job['id'] == pinecone_id), None)
             if not job:
                 return "Error: Job not found"
             metadata = job['metadata']
             self.db.save_feedback(
                 job_id=pinecone_id,  # Use Pinecone's ID
                 resume_text=self.current_resume_text,
                 description=metadata['description'],
                 is_relevant=is_relevant
             )
+            return f"\u2713 Feedback saved for '{metadata['headline']}'"
         except Exception as e:
             return f"Error saving feedback: {str(e)}"
 def create_interface():
     matcher = JobMatcher()
 if __name__ == "__main__":
     interface = create_interface()
+    interface.launch(debug=True)

modeltrain.py DELETED Viewed

@@ -1,102 +0,0 @@
-#!/usr/bin/env python3
-import os
-from torch.utils.data import DataLoader
-from sentence_transformers import SentenceTransformer, InputExample, losses
-# If you want to push to the HF Hub/Spaces programmatically:
-#   pip install huggingface_hub
-# from huggingface_hub import HfApi, HfFolder
-def main():
-    #--------------------------------------------------------------------------
-    # 1. (Optional) Setup your Hugging Face auth
-    #--------------------------------------------------------------------------
-    # If you need to log into your HF account, you can do:
-    #   hf_token = os.getenv("HF_TOKEN")  # or read from a config file
-    #   HfFolder.save_token(hf_token)
-    #   api = HfApi()
-    #
-    # Then set something like:
-    #   repo_id = "KolumbusLindh/my-weekly-model"
-    #
-    # Alternatively, you can push manually later via huggingface-cli.
-    #--------------------------------------------------------------------------
-    # 2. Placeholder training data
-    #--------------------------------------------------------------------------
-    # Suppose each tuple is: (CV_text, liked_job_text, disliked_job_text).
-    # In a real scenario, you'd gather user feedback from your database.
-    train_data = [
-        ("My CV #1", "Job #1 that user liked", "Job #1 that user disliked"),
-        ("My CV #2", "Job #2 that user liked", "Job #2 that user disliked"),
-        # ...
-    ]
-    #--------------------------------------------------------------------------
-    # 3. Convert data into Sentence Transformers InputExamples
-    #--------------------------------------------------------------------------
-    train_examples = []
-    for (cv_text, liked_job_text, disliked_job_text) in train_data:
-        example = InputExample(
-            texts=[cv_text, liked_job_text, disliked_job_text]
-            # TripletLoss expects exactly 3 texts: anchor, positive, negative
-        )
-        train_examples.append(example)
-    #--------------------------------------------------------------------------
-    # 4. Load the base model
-    #--------------------------------------------------------------------------
-    model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
-    model = SentenceTransformer(model_name)
-    #--------------------------------------------------------------------------
-    # 5. Prepare DataLoader & define the Triplet Loss
-    #--------------------------------------------------------------------------
-    # A typical margin is 0.5–1.0. Feel free to adjust it.
-    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=8)
-    train_loss = losses.TripletLoss(
-        model=model,
-        distance_metric=losses.TripletDistanceMetric.COSINE,
-        margin=0.5
-    )
-    #--------------------------------------------------------------------------
-    # 6. Fine-tune (fit) the model
-    #--------------------------------------------------------------------------
-    # Just 1 epoch here for demo. In practice, tune #epochs/batch_size, etc.
-    num_epochs = 1
-    warmup_steps = int(len(train_dataloader) * num_epochs * 0.1)  # ~10% warmup
-    model.fit(
-        train_objectives=[(train_dataloader, train_loss)],
-        epochs=num_epochs,
-        warmup_steps=warmup_steps,
-        show_progress_bar=True
-    )
-    #--------------------------------------------------------------------------
-    # 7. Save model locally
-    #--------------------------------------------------------------------------
-    local_output_path = "my_finetuned_model"
-    model.save(local_output_path)
-    print(f"Model fine-tuned and saved locally to: {local_output_path}")
-    #--------------------------------------------------------------------------
-    # 8. (Optional) Push to your Hugging Face Space
-    #--------------------------------------------------------------------------
-    # If you want to push automatically:
-    #
-    #   model.push_to_hub(repo_id=repo_id, commit_message="Weekly model update")
-    #
-    # Or if you have a Space at e.g. https://huggingface.co/spaces/KolumbusLindh/<some-name>,
-    # you’d create a repo on HF, then push to that repo. Typically one uses
-    # huggingface-cli or the huggingface_hub methods for that:
-    #
-    #   api.create_repo(repo_id=repo_id, repo_type="model", private=False)
-    #   model.push_to_hub(repo_id=repo_id)
-    #
-    #   # If it's a Space, you might need to store your model in the "models" folder
-    #   # or however your Gradio app is set up to load it.
-if __name__ == "__main__":
-    main()

pinecone_handler.py CHANGED Viewed

@@ -52,7 +52,8 @@ class PineconeHandler:
             self.index = self.pc.Index(PINECONE_INDEX_NAME)
         #self.model = SentenceTransformer('all-MiniLM-L6-v2')
-        self.model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
         log.info(f"Initialized connection to Pinecone index '{PINECONE_INDEX_NAME}'")
     def _create_embedding(self, ad: Dict[str, Any]) -> List[float]:

             self.index = self.pc.Index(PINECONE_INDEX_NAME)
         #self.model = SentenceTransformer('all-MiniLM-L6-v2')
+        #self.model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
+        self.model = SentenceTransformer('forestav/job_matching_sentence_transformer')
         log.info(f"Initialized connection to Pinecone index '{PINECONE_INDEX_NAME}'")
     def _create_embedding(self, ad: Dict[str, Any]) -> List[float]:

settings.py CHANGED Viewed

@@ -2,7 +2,8 @@ import logging
 PINECONE_ENVIRONMENT = "gcp-starter"
 #PINECONE_INDEX_NAME = "jobads-index"
-PINECONE_INDEX_NAME = "jobsai-multilingual-small"
 DB_TABLE_NAME = 'jobads'
 DB_FILE_NAME = 'jobads_database_20220127.db'

 PINECONE_ENVIRONMENT = "gcp-starter"
 #PINECONE_INDEX_NAME = "jobads-index"
+#PINECONE_INDEX_NAME = "jobsai-multilingual-small"
+PINECONE_INDEX_NAME = "jobads-finetuned-small"
 DB_TABLE_NAME = 'jobads'
 DB_FILE_NAME = 'jobads_database_20220127.db'

app.py → streamlit_app.py RENAMED Viewed

File without changes

training_pipeline.ipynb ADDED Viewed

	@@ -0,0 +1,641 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import hopsworks\n",
+    "from sentence_transformers import SentenceTransformer, InputExample, losses\n",
+    "from torch.utils.data import DataLoader\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from dotenv import load_dotenv\n",
+    "import os"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "2025-01-08 19:52:22,417 INFO: Closing external client and cleaning up certificates.\n",
+      "Connection closed.\n",
+      "2025-01-08 19:52:22,421 INFO: Initializing external client\n",
+      "2025-01-08 19:52:22,421 INFO: Base URL: https://c.app.hopsworks.ai:443\n",
+      "2025-01-08 19:52:23,548 INFO: Python Engine initialized.\n",
+      "\n",
+      "Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1158296\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Initialize Hopsworks connection\n",
+    "load_dotenv()\n",
+    "\n",
+    "api_key = os.getenv(\"HOPSWORKS_API_KEY\")\n",
+    "project = hopsworks.login(project=\"orestavf\", api_key_value=api_key)\n",
+    "fs = project.get_feature_store()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.84s) \n"
+     ]
+    }
+   ],
+   "source": [
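+    "# Load user feedback from the job_feedback feature group (v1). This is the text as stored by the app; the cleaning done in feature_pipeline.ipynb is not written back to this group.\n",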
+    "feedback_fg = fs.get_feature_group(name=\"job_feedback\", version=1)\n",
+    "feedback_df = feedback_fg.read()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Split into train and validation sets\n",
+    "train_df, val_df = train_test_split(feedback_df, test_size=0.2, random_state=42)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Prepare data for SentenceTransformer\n",
+    "def prepare_examples(df):\n",
+    "    \"\"\"Build one InputExample per feedback row.\n",
+    "\n",
+    "    Each example pairs resume_text with job_description and uses is_relevant,\n",
+    "    cast to float, as its label.\n",
+    "    \"\"\"\n",
+    "    return [\n",
+    "        InputExample(\n",
+    "            texts=[record[\"resume_text\"], record[\"job_description\"]],\n",
+    "            label=float(record[\"is_relevant\"]),  # float label for the loss\n",
+    "        )\n",
+    "        for _, record in df.iterrows()\n",
+    "    ]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_examples = prepare_examples(train_df)\n",
+    "val_examples = prepare_examples(val_df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "2025-01-08 19:25:05,476 INFO: Use pytorch device_name: cpu\n",
+      "2025-01-08 19:25:05,477 INFO: Load pretrained SentenceTransformer: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Load pretrained SentenceTransformer\n",
+    "model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Define DataLoader\n",
+    "train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)\n",
+    "val_dataloader = DataLoader(val_examples, shuffle=False, batch_size=16)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Define loss\n",
+    "train_loss = losses.CosineSimilarityLoss(model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Configure training\n",
+    "num_epochs = 3\n",
+    "warmup_steps = int(len(train_dataloader) * num_epochs * 0.1)  # 10% of training as warmup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "65a11878fdad456a94ae2e4d44e403a3",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/3 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'train_runtime': 5.2094, 'train_samples_per_second': 2.879, 'train_steps_per_second': 0.576, 'train_loss': 0.27454523245493573, 'epoch': 3.0}\n",
+      "2025-01-08 19:25:14,162 INFO: Save model to ./finetuned_model\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "7bc7a5e2e56e4abe8bbf47e5ed251d6a",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "a4218c62846f43c7be217513f8fd86de",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# Train the model\n",
+    "model.fit(\n",
+    "    train_objectives=[(train_dataloader, train_loss)],\n",
+    "    evaluator=None,  # Add an evaluator if needed\n",
+    "    epochs=num_epochs,\n",
+    "    warmup_steps=warmup_steps,\n",
+    "    output_path=\"./finetuned_model\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save the trained model locally\n",
+    "#model.save(\"./finetuned_model\")\n",
+    "#print(\"Model finetuned and saved locally!\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from hsml.schema import Schema\n",
+    "from hsml.model_schema import ModelSchema"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Define the Model Schema\n",
+    "# Draw ONE seeded row and take both the input and the label from it, so the\n",
+    "# registered input example and output example describe the same feedback record\n",
+    "# and stay the same across re-runs.\n",
+    "schema_sample = train_df.sample(1, random_state=42)\n",
+    "X_train_sample = schema_sample[[\"resume_text\", \"job_description\"]].values  # Input example\n",
+    "y_train_sample = schema_sample[\"is_relevant\"].values  # Output example"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "input_schema = Schema(X_train_sample)\n",
+    "output_schema = Schema(y_train_sample)\n",
+    "model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Get Model Registry\n",
+    "mr = project.get_model_registry()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Register the model in the Model Registry\n",
+    "job_matching_model = mr.python.create_model(\n",
+    "    name=\"job_matching_sentence_transformer\",\n",
+    "    #metrics=metrics,\n",
+    "    model_schema=model_schema,\n",
+    "    input_example=X_train_sample,\n",
+    "    description=\"Finetuned SentenceTransformer for job matching\",\n",
+    "    version=1\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "78d7674e395848acb4586dd4dff1fee8",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/6 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "ff6231d9469840229d992abb28a5740b",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Uploading: 0.000%|          | 0/727 elapsed<00:00 remaining<?"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "53650ea045a24041a44626336d9a9c9b",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Uploading: 0.000%|          | 0/212 elapsed<00:00 remaining<?"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "21e688c69c7a46a4bb7485d1d088c887",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Uploading: 0.000%|          | 0/470637416 elapsed<00:00 remaining<?"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "c68dc00bc2ae4804b73780d0f23e21ba",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Uploading: 0.000%|          | 0/242 elapsed<00:00 remaining<?"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "bb846d6b71b746749d59c0bf25779f61",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Uploading: 0.000%|          | 0/21034 elapsed<00:00 remaining<?"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "afda6046a57c42dca546be6defd13f11",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Uploading: 0.000%|          | 0/56 elapsed<00:00 remaining<?"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "df4c630a40954b9784a2a631be4ed4e3",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Uploading: 0.000%|          | 0/1015 elapsed<00:00 remaining<?"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "4c6430695b7e43c0815fe278328e0448",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Uploading: 0.000%|          | 0/17082987 elapsed<00:00 remaining<?"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "a1aa72dd0b344a26b22d8839aa12c4fb",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Uploading: 0.000%|          | 0/1512 elapsed<00:00 remaining<?"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "2b599ffc95d4499b80673d873b6dfe7a",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Uploading: 0.000%|          | 0/14763260 elapsed<00:00 remaining<?"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "fa11d5018fcb4ab094dac6c27128bcf5",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Uploading: 0.000%|          | 0/305 elapsed<00:00 remaining<?"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "7e37354a957d45b88b7a25007b3c7889",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Uploading: 0.000%|          | 0/6678 elapsed<00:00 remaining<?"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "f4e0c3e6d64b457bb01e5a0ac2162433",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Uploading: 0.000%|          | 0/216 elapsed<00:00 remaining<?"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Model created, explore it at https://c.app.hopsworks.ai:443/p/1158296/models/job_matching_sentence_transformer/1\n",
+      "Model registered in Hopsworks Model Registry!\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Upload the contents of ./finetuned_model to the Hopsworks Model Registry\n",
+    "job_matching_model.save(\"./finetuned_model\")\n",
+    "print(\"Model registered in Hopsworks Model Registry!\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "2025-01-08 19:44:05,458 INFO: Save model to C:\\Users\\Filip\\AppData\\Local\\Temp\\tmpa217ndkp\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "05f6bc89a66a4202b9f3b4b4fadee783",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "c1688de9db8d46b6ba410bcdcff839fd",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "c496a24b0d4044ffbbd9278a9f56996d",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "unigram.json:   0%|          | 0.00/14.8M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d4f0e6b478aa4a8dad8c798761289a8f",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "e2a88564cf8b438bad836c593d4d78f6",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "b302ff2e4cf74982bccdcf9d3a221240",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "'https://huggingface.co/forestav/job_matching_sentence_transformer/commit/7168a70785fae3fee6f5576b40a7556072ba31a2'"
+      ]
+     },
+     "execution_count": 22,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Push the model to the Hugging Face Hub\n",
+    "model.push_to_hub(\"forestav/job_matching_sentence_transformer\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}