# Fine-tune a multilingual SentenceTransformer on resume/job relevance feedback.
#
# Cell boundaries of the original notebook are kept as `# %%` markers.
# Captured cell outputs (log lines, progress widgets) are not reproduced here.

# %% Imports
import hopsworks
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from dotenv import load_dotenv
import os

# %% Configuration: every value a reader might tune lives here
HOPSWORKS_PROJECT = "orestavf"
FEEDBACK_FG_NAME = "job_feedback"
FEEDBACK_FG_VERSION = 1
VAL_FRACTION = 0.2        # share of feedback rows held out for validation
SPLIT_SEED = 42           # random_state for the train/validation split
BASE_MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
BATCH_SIZE = 16
WARMUP_FRACTION = 0.1     # 10% of training as warmup

# %% Initialize Hopsworks connection
load_dotenv()

# os.getenv returns None when HOPSWORKS_API_KEY is not set; the value is passed
# to hopsworks.login unchanged.
api_key = os.getenv("HOPSWORKS_API_KEY")
project = hopsworks.login(project=HOPSWORKS_PROJECT, api_key_value=api_key)
fs = project.get_feature_store()

# %% Load preprocessed data
feedback_fg = fs.get_feature_group(name=FEEDBACK_FG_NAME, version=FEEDBACK_FG_VERSION)
feedback_df = feedback_fg.read()

# %% Split into train and validation sets
train_df, val_df = train_test_split(
    feedback_df, test_size=VAL_FRACTION, random_state=SPLIT_SEED
)


# %% Prepare data for SentenceTransformer
def prepare_examples(df):
    """Convert feedback rows into SentenceTransformer ``InputExample`` pairs.

    Args:
        df: DataFrame providing the columns ``resume_text``,
            ``job_description`` and ``is_relevant``.

    Returns:
        list[InputExample]: one example per row, in row order. ``texts`` is
        ``[resume_text, job_description]`` and ``label`` is
        ``float(is_relevant)`` (converted to float for loss calculation).

    Raises:
        KeyError: if one of the three columns is missing from ``df``.
    """
    # Iterate the three columns in lockstep instead of building a Series per
    # row with iterrows(); only these columns are ever read.
    return [
        InputExample(texts=[resume, job], label=float(relevant))
        for resume, job, relevant in zip(
            df["resume_text"], df["job_description"], df["is_relevant"]
        )
    ]


# %% Build example lists
train_examples = prepare_examples(train_df)
val_examples = prepare_examples(val_df)

# %% Load pretrained SentenceTransformer
model = SentenceTransformer(BASE_MODEL_NAME)

# %% Define DataLoader
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=BATCH_SIZE)
# NOTE(review): no collate_fn is set here. Confirm how val_dataloader is
# consumed downstream; torch's default collate is not expected to batch
# InputExample objects if this loader is iterated directly.
val_dataloader = DataLoader(val_examples, shuffle=False, batch_size=BATCH_SIZE)

# %% Define loss
train_loss = losses.CosineSimilarityLoss(model)

# %% Configure training
num_epochs = 3
warmup_steps = int(len(train_dataloader) * num_epochs * WARMUP_FRACTION)  # 10% of training as warmup

# NOTE(review): the stored notebook is cut off inside the output of the next
# cell (execution_count 11); that cell's source is not visible here and is
# therefore not reconstructed.