forestav committed
Commit ba6957d · 1 Parent(s): af91fc8
feature_pipeline.ipynb ADDED
@@ -0,0 +1,282 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 3,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import hopsworks\n",
10
+ "import os\n",
11
+ "import re\n",
12
+ "from dotenv import load_dotenv"
13
+ ]
14
+ },
15
+ {
16
+ "cell_type": "code",
17
+ "execution_count": 5,
18
+ "metadata": {},
19
+ "outputs": [
20
+ {
21
+ "name": "stdout",
22
+ "output_type": "stream",
23
+ "text": [
24
+ "2025-01-08 19:51:38,754 INFO: Closing external client and cleaning up certificates.\n",
25
+ "Connection closed.\n",
26
+ "2025-01-08 19:51:38,758 INFO: Initializing external client\n",
27
+ "2025-01-08 19:51:38,758 INFO: Base URL: https://c.app.hopsworks.ai:443\n",
28
+ "2025-01-08 19:51:39,828 INFO: Python Engine initialized.\n",
29
+ "\n",
30
+ "Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1158296\n"
31
+ ]
32
+ }
33
+ ],
34
+ "source": [
35
+ "load_dotenv()\n",
36
+ "\n",
37
+ "api_key = os.getenv(\"HOPSWORKS_API_KEY\")\n",
38
+ "project = hopsworks.login(project=\"orestavf\", api_key_value=api_key)"
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "execution_count": 5,
44
+ "metadata": {},
45
+ "outputs": [],
46
+ "source": [
47
+ "fs = project.get_feature_store()"
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "code",
52
+ "execution_count": 6,
53
+ "metadata": {},
54
+ "outputs": [],
55
+ "source": [
56
+ "# Retrieve feature groups\n",
57
+ "feedback_fg = fs.get_feature_group(name=\"job_feedback\", version=1)"
58
+ ]
59
+ },
60
+ {
61
+ "cell_type": "code",
62
+ "execution_count": 24,
63
+ "metadata": {},
64
+ "outputs": [
65
+ {
66
+ "name": "stdout",
67
+ "output_type": "stream",
68
+ "text": [
69
+ "Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.93s) \n"
70
+ ]
71
+ }
72
+ ],
73
+ "source": [
74
+ "feedback_df = feedback_fg.read()"
75
+ ]
76
+ },
77
+ {
78
+ "cell_type": "code",
79
+ "execution_count": 14,
80
+ "metadata": {},
81
+ "outputs": [
82
+ {
83
+ "data": {
84
+ "text/html": [
85
+ "<div>\n",
86
+ "<style scoped>\n",
87
+ " .dataframe tbody tr th:only-of-type {\n",
88
+ " vertical-align: middle;\n",
89
+ " }\n",
90
+ "\n",
91
+ " .dataframe tbody tr th {\n",
92
+ " vertical-align: top;\n",
93
+ " }\n",
94
+ "\n",
95
+ " .dataframe thead th {\n",
96
+ " text-align: right;\n",
97
+ " }\n",
98
+ "</style>\n",
99
+ "<table border=\"1\" class=\"dataframe\">\n",
100
+ " <thead>\n",
101
+ " <tr style=\"text-align: right;\">\n",
102
+ " <th></th>\n",
103
+ " <th>job_id</th>\n",
104
+ " <th>resume_text</th>\n",
105
+ " <th>job_headline</th>\n",
106
+ " <th>job_occupation</th>\n",
107
+ " <th>job_description</th>\n",
108
+ " <th>is_relevant</th>\n",
109
+ " </tr>\n",
110
+ " </thead>\n",
111
+ " <tbody>\n",
112
+ " <tr>\n",
113
+ " <th>0</th>\n",
114
+ " <td>29321628</td>\n",
115
+ " <td>Filip Orestav \\nTransformatorvägen 6, Sollent...</td>\n",
116
+ " <td>Junior Projektadmin till talangprogram på AFRY...</td>\n",
117
+ " <td>Projektledare, bygg och anläggning</td>\n",
118
+ " <td>Vill du kickstarta din karriär hos en av Sveri...</td>\n",
119
+ " <td>True</td>\n",
120
+ " </tr>\n",
121
+ " </tbody>\n",
122
+ "</table>\n",
123
+ "</div>"
124
+ ],
125
+ "text/plain": [
126
+ " job_id resume_text \\\n",
127
+ "0 29321628 Filip Orestav \\nTransformatorvägen 6, Sollent... \n",
128
+ "\n",
129
+ " job_headline \\\n",
130
+ "0 Junior Projektadmin till talangprogram på AFRY... \n",
131
+ "\n",
132
+ " job_occupation \\\n",
133
+ "0 Projektledare, bygg och anläggning \n",
134
+ "\n",
135
+ " job_description is_relevant \n",
136
+ "0 Vill du kickstarta din karriär hos en av Sveri... True "
137
+ ]
138
+ },
139
+ "execution_count": 14,
140
+ "metadata": {},
141
+ "output_type": "execute_result"
142
+ }
143
+ ],
144
+ "source": [
145
+ "feedback_df.head()"
146
+ ]
147
+ },
148
+ {
149
+ "cell_type": "code",
150
+ "execution_count": 25,
151
+ "metadata": {},
152
+ "outputs": [],
153
+ "source": [
154
+ "# Columns to preprocess\n",
155
+ "columns_to_process = ['resume_text', 'job_headline', 'job_occupation', 'job_description']"
156
+ ]
157
+ },
158
+ {
159
+ "cell_type": "code",
160
+ "execution_count": 26,
161
+ "metadata": {},
162
+ "outputs": [],
163
+ "source": [
164
+ "# Define preprocessing functions\n",
165
+ "def preprocess_text(text):\n",
166
+ " if isinstance(text, str):\n",
167
+ " # Lowercase\n",
168
+ " text = text.lower()\n",
169
+ " # Remove special characters (preserving letters, numbers, and spaces)\n",
170
+ " text = re.sub(r\"[^a-zåäöA-Z0-9\\s]\", \"\", text)\n",
171
+ " # Remove extra spaces\n",
172
+ " text = re.sub(r\"\\s+\", \" \", text)\n",
173
+ " return text.strip() # Strip leading/trailing spaces\n",
174
+ " return text"
175
+ ]
176
+ },
177
+ {
178
+ "cell_type": "code",
179
+ "execution_count": 28,
180
+ "metadata": {},
181
+ "outputs": [
182
+ {
183
+ "name": "stdout",
184
+ "output_type": "stream",
185
+ "text": [
186
+ "2025-01-08 18:38:35,968 WARNING: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.\n",
187
+ "\n"
188
+ ]
189
+ },
190
+ {
191
+ "data": {
192
+ "text/html": [
193
+ "<div>\n",
194
+ "<style scoped>\n",
195
+ " .dataframe tbody tr th:only-of-type {\n",
196
+ " vertical-align: middle;\n",
197
+ " }\n",
198
+ "\n",
199
+ " .dataframe tbody tr th {\n",
200
+ " vertical-align: top;\n",
201
+ " }\n",
202
+ "\n",
203
+ " .dataframe thead th {\n",
204
+ " text-align: right;\n",
205
+ " }\n",
206
+ "</style>\n",
207
+ "<table border=\"1\" class=\"dataframe\">\n",
208
+ " <thead>\n",
209
+ " <tr style=\"text-align: right;\">\n",
210
+ " <th></th>\n",
211
+ " <th>job_id</th>\n",
212
+ " <th>resume_text</th>\n",
213
+ " <th>job_headline</th>\n",
214
+ " <th>job_occupation</th>\n",
215
+ " <th>job_description</th>\n",
216
+ " <th>is_relevant</th>\n",
217
+ " </tr>\n",
218
+ " </thead>\n",
219
+ " <tbody>\n",
220
+ " <tr>\n",
221
+ " <th>0</th>\n",
222
+ " <td>29321628</td>\n",
223
+ " <td>filip orestav transformatorvägen 6 sollentuna ...</td>\n",
224
+ " <td>junior projektadmin till talangprogram på afry...</td>\n",
225
+ " <td>projektledare bygg och anläggning</td>\n",
226
+ " <td>vill du kickstarta din karriär hos en av sveri...</td>\n",
227
+ " <td>True</td>\n",
228
+ " </tr>\n",
229
+ " </tbody>\n",
230
+ "</table>\n",
231
+ "</div>"
232
+ ],
233
+ "text/plain": [
234
+ " job_id resume_text \\\n",
235
+ "0 29321628 filip orestav transformatorvägen 6 sollentuna ... \n",
236
+ "\n",
237
+ " job_headline \\\n",
238
+ "0 junior projektadmin till talangprogram på afry... \n",
239
+ "\n",
240
+ " job_occupation \\\n",
241
+ "0 projektledare bygg och anläggning \n",
242
+ "\n",
243
+ " job_description is_relevant \n",
244
+ "0 vill du kickstarta din karriär hos en av sveri... True "
245
+ ]
246
+ },
247
+ "execution_count": 28,
248
+ "metadata": {},
249
+ "output_type": "execute_result"
250
+ }
251
+ ],
252
+ "source": [
253
+ "# Apply preprocessing\n",
254
+ "feedback_df[columns_to_process] = feedback_df[columns_to_process].applymap(preprocess_text)\n",
255
+ "\n",
256
+ "# Display processed dataframe\n",
257
+ "feedback_df.head()"
258
+ ]
259
+ }
260
+ ],
261
+ "metadata": {
262
+ "kernelspec": {
263
+ "display_name": "venv",
264
+ "language": "python",
265
+ "name": "python3"
266
+ },
267
+ "language_info": {
268
+ "codemirror_mode": {
269
+ "name": "ipython",
270
+ "version": 3
271
+ },
272
+ "file_extension": ".py",
273
+ "mimetype": "text/x-python",
274
+ "name": "python",
275
+ "nbconvert_exporter": "python",
276
+ "pygments_lexer": "ipython3",
277
+ "version": "3.12.2"
278
+ }
279
+ },
280
+ "nbformat": 4,
281
+ "nbformat_minor": 2
282
+ }
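Note: the preprocessing cell above logs a FutureWarning because DataFrame.applymap is deprecated in recent pandas releases. A minimal sketch of the same elementwise cleanup using DataFrame.map (assuming pandas 2.1 or later is installed) could look like the following; preprocess_text and columns_to_process are the definitions from the notebook, not new logic.

import re
import pandas as pd

def preprocess_text(text):
    if isinstance(text, str):
        text = text.lower()                              # lowercase
        text = re.sub(r"[^a-zåäöA-Z0-9\s]", "", text)    # drop special characters
        text = re.sub(r"\s+", " ", text)                 # collapse repeated whitespace
        return text.strip()                              # strip leading/trailing spaces
    return text

columns_to_process = ['resume_text', 'job_headline', 'job_occupation', 'job_description']

# DataFrame.map applies the function elementwise, replacing the deprecated applymap
feedback_df[columns_to_process] = feedback_df[columns_to_process].map(preprocess_text)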
feedback.db DELETED
Binary file (61.4 kB)
 
gradioapp.py CHANGED
@@ -7,50 +7,43 @@ from pinecone_handler import PineconeHandler
 from datetime import datetime
 import sqlite3
 import threading
+import hopsworks
+import pandas as pd
+import os
+from dotenv import load_dotenv
+
+load_dotenv()
 
 class Database:
-    def __init__(self, db_name="feedback.db"):
-        self.db_name = db_name
-        self.thread_local = threading.local()
-        self._create_tables()
-
-    def get_connection(self):
-        if not hasattr(self.thread_local, "connection"):
-            self.thread_local.connection = sqlite3.connect(self.db_name)
-        return self.thread_local.connection
-
-    def _create_tables(self):
-        conn = sqlite3.connect(self.db_name)
-        cursor = conn.cursor()
-        cursor.execute('''
-            CREATE TABLE IF NOT EXISTS feedback (
-                id INTEGER PRIMARY KEY AUTOINCREMENT,
-                job_id TEXT,
-                resume_text TEXT,
-                job_headline TEXT,
-                job_occupation TEXT,
-                job_description TEXT,
-                is_relevant BOOLEAN,
-                timestamp DATETIME DEFAULT CURRENT_TIMESTAMP
+    def __init__(self):
+        # Initialize Hopsworks
+        project = "orestavf"
+        api_key = os.getenv("HOPSWORKS_API_KEY")
+        self.project = hopsworks.login(project=project, api_key_value=api_key)
+        self.fs = self.project.get_feature_store()
+        self.feedback_fg = self.fs.get_or_create_feature_group(
+            name="job_feedback",
+            version=1,
+            primary_key=["job_id"],
+            description="Feature group for storing user feedback on job matches.",
+            online_enabled=True
         )
-        ''')
-        conn.commit()
-        conn.close()
-
-    def save_feedback(self, job_id: str, resume_text: str, headline: str,
-                      occupation: str, description: str, is_relevant: bool):
-        conn = self.get_connection()
-        cursor = conn.cursor()
-        try:
-            cursor.execute('''
-                INSERT INTO feedback
-                (job_id, resume_text, job_headline, job_occupation, job_description, is_relevant)
-                VALUES (?, ?, ?, ?, ?, ?)
-            ''', (job_id, resume_text, headline, occupation, description, is_relevant))
-            conn.commit()
-        except Exception as e:
-            conn.rollback()
-            raise e
+
+    def save_feedback(self, job_id: str, resume_text: str, headline: str,
+                      occupation: str, description: str, is_relevant: bool):
+        # Prepare feedback data as a pandas DataFrame
+        feedback_data = pd.DataFrame([{
+            "job_id": job_id,
+            "resume_text": resume_text,
+            "job_headline": headline,
+            "job_occupation": occupation,
+            "job_description": description,
+            "is_relevant": is_relevant,
+            #"timestamp": datetime.now()
+        }])
+
+        self.feedback_fg.insert(feedback_data)
+        print(f"Feedback saved to Hopsworks for job ID: {job_id}")
 
 def extract_text(file) -> Optional[str]:
     """Extract text from uploaded resume file"""
@@ -121,12 +114,12 @@ class JobMatcher:
         try:
             # Find the job in current results by Pinecone ID
             job = next((job for job in self.current_results if job['id'] == pinecone_id), None)
-
+
             if not job:
                 return "Error: Job not found"
-
+
             metadata = job['metadata']
-
+
             self.db.save_feedback(
                 job_id=pinecone_id,  # Use Pinecone's ID
                 resume_text=self.current_resume_text,
@@ -135,10 +128,11 @@ class JobMatcher:
                 description=metadata['description'],
                 is_relevant=is_relevant
             )
-            return f" Feedback saved for '{metadata['headline']}'"
+            return f"\u2713 Feedback saved for '{metadata['headline']}'"
         except Exception as e:
             return f"Error saving feedback: {str(e)}"
 
+
 def create_interface():
     matcher = JobMatcher()
 
@@ -258,4 +252,4 @@ def create_interface():
 
 if __name__ == "__main__":
     interface = create_interface()
-    interface.launch()
+    interface.launch(debug=True)
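A minimal usage sketch for the rewritten Hopsworks-backed Database class above. It assumes a .env file providing HOPSWORKS_API_KEY (as in the diff) and reuses the example values from the feedback row shown in feature_pipeline.ipynb; it is an illustration, not part of the commit.

from gradioapp import Database

db = Database()  # logs into the "orestavf" project and gets or creates the job_feedback feature group
db.save_feedback(
    job_id="29321628",
    resume_text="Filip Orestav ...",
    headline="Junior Projektadmin till talangprogram på AFRY",
    occupation="Projektledare, bygg och anläggning",
    description="Vill du kickstarta din karriär ...",
    is_relevant=True,  # stored as a single-row DataFrame in the online-enabled feature group
)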
modeltrain.py DELETED
@@ -1,102 +0,0 @@
-#!/usr/bin/env python3
-
-import os
-from torch.utils.data import DataLoader
-from sentence_transformers import SentenceTransformer, InputExample, losses
-# If you want to push to the HF Hub/Spaces programmatically:
-# pip install huggingface_hub
-# from huggingface_hub import HfApi, HfFolder
-
-def main():
-    #--------------------------------------------------------------------------
-    # 1. (Optional) Setup your Hugging Face auth
-    #--------------------------------------------------------------------------
-    # If you need to log into your HF account, you can do:
-    # hf_token = os.getenv("HF_TOKEN")  # or read from a config file
-    # HfFolder.save_token(hf_token)
-    # api = HfApi()
-    #
-    # Then set something like:
-    # repo_id = "KolumbusLindh/my-weekly-model"
-    #
-    # Alternatively, you can push manually later via huggingface-cli.
-
-    #--------------------------------------------------------------------------
-    # 2. Placeholder training data
-    #--------------------------------------------------------------------------
-    # Suppose each tuple is: (CV_text, liked_job_text, disliked_job_text).
-    # In a real scenario, you'd gather user feedback from your database.
-    train_data = [
-        ("My CV #1", "Job #1 that user liked", "Job #1 that user disliked"),
-        ("My CV #2", "Job #2 that user liked", "Job #2 that user disliked"),
-        # ...
-    ]
-
-    #--------------------------------------------------------------------------
-    # 3. Convert data into Sentence Transformers InputExamples
-    #--------------------------------------------------------------------------
-    train_examples = []
-    for (cv_text, liked_job_text, disliked_job_text) in train_data:
-        example = InputExample(
-            texts=[cv_text, liked_job_text, disliked_job_text]
-            # TripletLoss expects exactly 3 texts: anchor, positive, negative
-        )
-        train_examples.append(example)
-
-    #--------------------------------------------------------------------------
-    # 4. Load the base model
-    #--------------------------------------------------------------------------
-    model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
-    model = SentenceTransformer(model_name)
-
-    #--------------------------------------------------------------------------
-    # 5. Prepare DataLoader & define the Triplet Loss
-    #--------------------------------------------------------------------------
-    # A typical margin is 0.5–1.0. Feel free to adjust it.
-    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=8)
-    train_loss = losses.TripletLoss(
-        model=model,
-        distance_metric=losses.TripletDistanceMetric.COSINE,
-        margin=0.5
-    )
-
-    #--------------------------------------------------------------------------
-    # 6. Fine-tune (fit) the model
-    #--------------------------------------------------------------------------
-    # Just 1 epoch here for demo. In practice, tune #epochs/batch_size, etc.
-    num_epochs = 1
-    warmup_steps = int(len(train_dataloader) * num_epochs * 0.1)  # ~10% warmup
-
-    model.fit(
-        train_objectives=[(train_dataloader, train_loss)],
-        epochs=num_epochs,
-        warmup_steps=warmup_steps,
-        show_progress_bar=True
-    )
-
-    #--------------------------------------------------------------------------
-    # 7. Save model locally
-    #--------------------------------------------------------------------------
-    local_output_path = "my_finetuned_model"
-    model.save(local_output_path)
-    print(f"Model fine-tuned and saved locally to: {local_output_path}")
-
-    #--------------------------------------------------------------------------
-    # 8. (Optional) Push to your Hugging Face Space
-    #--------------------------------------------------------------------------
-    # If you want to push automatically:
-    #
-    # model.push_to_hub(repo_id=repo_id, commit_message="Weekly model update")
-    #
-    # Or if you have a Space at e.g. https://huggingface.co/spaces/KolumbusLindh/<some-name>,
-    # you'd create a repo on HF, then push to that repo. Typically one uses
-    # huggingface-cli or the huggingface_hub methods for that:
-    #
-    # api.create_repo(repo_id=repo_id, repo_type="model", private=False)
-    # model.push_to_hub(repo_id=repo_id)
-    #
-    # # If it's a Space, you might need to store your model in the "models" folder
-    # # or however your Gradio app is set up to load it.
-
-if __name__ == "__main__":
-    main()
 
pinecone_handler.py CHANGED
@@ -52,7 +52,8 @@ class PineconeHandler:
         self.index = self.pc.Index(PINECONE_INDEX_NAME)
 
         #self.model = SentenceTransformer('all-MiniLM-L6-v2')
-        self.model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
+        #self.model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
+        self.model = SentenceTransformer('forestav/job_matching_sentence_transformer')
         log.info(f"Initialized connection to Pinecone index '{PINECONE_INDEX_NAME}'")
 
     def _create_embedding(self, ad: Dict[str, Any]) -> List[float]:
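A quick sanity-check sketch for the model swap above, assuming the Hugging Face repo forestav/job_matching_sentence_transformer is publicly downloadable: load the fine-tuned encoder and confirm its embedding dimension matches the Pinecone index configured in settings.py.

from sentence_transformers import SentenceTransformer

model = SentenceTransformer('forestav/job_matching_sentence_transformer')
vector = model.encode("projektledare bygg och anläggning")
print(vector.shape)  # must equal the dimension of the jobads-finetuned-small index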
settings.py CHANGED
@@ -2,7 +2,8 @@ import logging
 
 PINECONE_ENVIRONMENT = "gcp-starter"
 #PINECONE_INDEX_NAME = "jobads-index"
-PINECONE_INDEX_NAME = "jobsai-multilingual-small"
+#PINECONE_INDEX_NAME = "jobsai-multilingual-small"
+PINECONE_INDEX_NAME = "jobads-finetuned-small"
 
 DB_TABLE_NAME = 'jobads'
 DB_FILE_NAME = 'jobads_database_20220127.db'
app.py → streamlit_app.py RENAMED
File without changes
training_pipeline.ipynb ADDED
@@ -0,0 +1,641 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 23,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import hopsworks\n",
10
+ "from sentence_transformers import SentenceTransformer, InputExample, losses\n",
11
+ "from torch.utils.data import DataLoader\n",
12
+ "from sklearn.model_selection import train_test_split\n",
13
+ "from dotenv import load_dotenv\n",
14
+ "import os"
15
+ ]
16
+ },
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": 24,
20
+ "metadata": {},
21
+ "outputs": [
22
+ {
23
+ "name": "stdout",
24
+ "output_type": "stream",
25
+ "text": [
26
+ "2025-01-08 19:52:22,417 INFO: Closing external client and cleaning up certificates.\n",
27
+ "Connection closed.\n",
28
+ "2025-01-08 19:52:22,421 INFO: Initializing external client\n",
29
+ "2025-01-08 19:52:22,421 INFO: Base URL: https://c.app.hopsworks.ai:443\n",
30
+ "2025-01-08 19:52:23,548 INFO: Python Engine initialized.\n",
31
+ "\n",
32
+ "Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1158296\n"
33
+ ]
34
+ }
35
+ ],
36
+ "source": [
37
+ "# Initialize Hopsworks connection\n",
38
+ "load_dotenv()\n",
39
+ "\n",
40
+ "api_key = os.getenv(\"HOPSWORKS_API_KEY\")\n",
41
+ "project = hopsworks.login(project=\"orestavf\", api_key_value=api_key)\n",
42
+ "fs = project.get_feature_store()\n"
43
+ ]
44
+ },
45
+ {
46
+ "cell_type": "code",
47
+ "execution_count": 3,
48
+ "metadata": {},
49
+ "outputs": [
50
+ {
51
+ "name": "stdout",
52
+ "output_type": "stream",
53
+ "text": [
54
+ "Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.84s) \n"
55
+ ]
56
+ }
57
+ ],
58
+ "source": [
59
+ "# Load preprocessed data\n",
60
+ "feedback_fg = fs.get_feature_group(name=\"job_feedback\", version=1)\n",
61
+ "feedback_df = feedback_fg.read()"
62
+ ]
63
+ },
64
+ {
65
+ "cell_type": "code",
66
+ "execution_count": 4,
67
+ "metadata": {},
68
+ "outputs": [],
69
+ "source": [
70
+ "# Split into train and validation sets\n",
71
+ "train_df, val_df = train_test_split(feedback_df, test_size=0.2, random_state=42)"
72
+ ]
73
+ },
74
+ {
75
+ "cell_type": "code",
76
+ "execution_count": 5,
77
+ "metadata": {},
78
+ "outputs": [],
79
+ "source": [
80
+ "# Prepare data for SentenceTransformer\n",
81
+ "def prepare_examples(df):\n",
82
+ " examples = []\n",
83
+ " for _, row in df.iterrows():\n",
84
+ " examples.append(\n",
85
+ " InputExample(\n",
86
+ " texts=[row[\"resume_text\"], row[\"job_description\"]],\n",
87
+ " label=float(row[\"is_relevant\"]) # Convert to float for loss calculation\n",
88
+ " )\n",
89
+ " )\n",
90
+ " return examples"
91
+ ]
92
+ },
93
+ {
94
+ "cell_type": "code",
95
+ "execution_count": 6,
96
+ "metadata": {},
97
+ "outputs": [],
98
+ "source": [
99
+ "train_examples = prepare_examples(train_df)\n",
100
+ "val_examples = prepare_examples(val_df)"
101
+ ]
102
+ },
103
+ {
104
+ "cell_type": "code",
105
+ "execution_count": 7,
106
+ "metadata": {},
107
+ "outputs": [
108
+ {
109
+ "name": "stdout",
110
+ "output_type": "stream",
111
+ "text": [
112
+ "2025-01-08 19:25:05,476 INFO: Use pytorch device_name: cpu\n",
113
+ "2025-01-08 19:25:05,477 INFO: Load pretrained SentenceTransformer: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2\n"
114
+ ]
115
+ }
116
+ ],
117
+ "source": [
118
+ "# Load pretrained SentenceTransformer\n",
119
+ "model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')"
120
+ ]
121
+ },
122
+ {
123
+ "cell_type": "code",
124
+ "execution_count": 8,
125
+ "metadata": {},
126
+ "outputs": [],
127
+ "source": [
128
+ "# Define DataLoader\n",
129
+ "train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)\n",
130
+ "val_dataloader = DataLoader(val_examples, shuffle=False, batch_size=16)"
131
+ ]
132
+ },
133
+ {
134
+ "cell_type": "code",
135
+ "execution_count": 9,
136
+ "metadata": {},
137
+ "outputs": [],
138
+ "source": [
139
+ "# Define loss\n",
140
+ "train_loss = losses.CosineSimilarityLoss(model)"
141
+ ]
142
+ },
143
+ {
144
+ "cell_type": "code",
145
+ "execution_count": 10,
146
+ "metadata": {},
147
+ "outputs": [],
148
+ "source": [
149
+ "# Configure training\n",
150
+ "num_epochs = 3\n",
151
+ "warmup_steps = int(len(train_dataloader) * num_epochs * 0.1) # 10% of training as warmup"
152
+ ]
153
+ },
154
+ {
155
+ "cell_type": "code",
156
+ "execution_count": 11,
157
+ "metadata": {},
158
+ "outputs": [
159
+ {
160
+ "data": {
161
+ "application/vnd.jupyter.widget-view+json": {
162
+ "model_id": "65a11878fdad456a94ae2e4d44e403a3",
163
+ "version_major": 2,
164
+ "version_minor": 0
165
+ },
166
+ "text/plain": [
167
+ " 0%| | 0/3 [00:00<?, ?it/s]"
168
+ ]
169
+ },
170
+ "metadata": {},
171
+ "output_type": "display_data"
172
+ },
173
+ {
174
+ "name": "stdout",
175
+ "output_type": "stream",
176
+ "text": [
177
+ "{'train_runtime': 5.2094, 'train_samples_per_second': 2.879, 'train_steps_per_second': 0.576, 'train_loss': 0.27454523245493573, 'epoch': 3.0}\n",
178
+ "2025-01-08 19:25:14,162 INFO: Save model to ./finetuned_model\n"
179
+ ]
180
+ },
181
+ {
182
+ "data": {
183
+ "application/vnd.jupyter.widget-view+json": {
184
+ "model_id": "7bc7a5e2e56e4abe8bbf47e5ed251d6a",
185
+ "version_major": 2,
186
+ "version_minor": 0
187
+ },
188
+ "text/plain": [
189
+ "Computing widget examples: 0%| | 0/1 [00:00<?, ?example/s]"
190
+ ]
191
+ },
192
+ "metadata": {},
193
+ "output_type": "display_data"
194
+ },
195
+ {
196
+ "data": {
197
+ "application/vnd.jupyter.widget-view+json": {
198
+ "model_id": "a4218c62846f43c7be217513f8fd86de",
199
+ "version_major": 2,
200
+ "version_minor": 0
201
+ },
202
+ "text/plain": [
203
+ "Computing widget examples: 0%| | 0/1 [00:00<?, ?example/s]"
204
+ ]
205
+ },
206
+ "metadata": {},
207
+ "output_type": "display_data"
208
+ }
209
+ ],
210
+ "source": [
211
+ "# Train the model\n",
212
+ "model.fit(\n",
213
+ " train_objectives=[(train_dataloader, train_loss)],\n",
214
+ " evaluator=None, # Add an evaluator if needed\n",
215
+ " epochs=num_epochs,\n",
216
+ " warmup_steps=warmup_steps,\n",
217
+ " output_path=\"./finetuned_model\"\n",
218
+ ")"
219
+ ]
220
+ },
221
+ {
222
+ "cell_type": "code",
223
+ "execution_count": null,
224
+ "metadata": {},
225
+ "outputs": [],
226
+ "source": [
227
+ "# Save the trained model locally\n",
228
+ "#model.save(\"./finetuned_model\")\n",
229
+ "#print(\"Model finetuned and saved locally!\")"
230
+ ]
231
+ },
232
+ {
233
+ "cell_type": "code",
234
+ "execution_count": 12,
235
+ "metadata": {},
236
+ "outputs": [],
237
+ "source": [
238
+ "from hsml.schema import Schema\n",
239
+ "from hsml.model_schema import ModelSchema"
240
+ ]
241
+ },
242
+ {
243
+ "cell_type": "code",
244
+ "execution_count": 13,
245
+ "metadata": {},
246
+ "outputs": [],
247
+ "source": [
248
+ "# Define the Model Schema\n",
249
+ "X_train_sample = train_df[[\"resume_text\", \"job_description\"]].sample(1).values # Input example\n",
250
+ "y_train_sample = train_df[\"is_relevant\"].sample(1).values # Output example"
251
+ ]
252
+ },
253
+ {
254
+ "cell_type": "code",
255
+ "execution_count": 14,
256
+ "metadata": {},
257
+ "outputs": [],
258
+ "source": [
259
+ "input_schema = Schema(X_train_sample)\n",
260
+ "output_schema = Schema(y_train_sample)\n",
261
+ "model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)"
262
+ ]
263
+ },
264
+ {
265
+ "cell_type": "code",
266
+ "execution_count": 15,
267
+ "metadata": {},
268
+ "outputs": [],
269
+ "source": [
270
+ "# Get Model Registry\n",
271
+ "mr = project.get_model_registry()"
272
+ ]
273
+ },
274
+ {
275
+ "cell_type": "code",
276
+ "execution_count": 19,
277
+ "metadata": {},
278
+ "outputs": [],
279
+ "source": [
280
+ "# Register the model in the Model Registry\n",
281
+ "job_matching_model = mr.python.create_model(\n",
282
+ " name=\"job_matching_sentence_transformer\",\n",
283
+ " #metrics=metrics,\n",
284
+ " model_schema=model_schema,\n",
285
+ " input_example=X_train_sample,\n",
286
+ " description=\"Finetuned SentenceTransformer for job matching\",\n",
287
+ " version=1\n",
288
+ ")"
289
+ ]
290
+ },
291
+ {
292
+ "cell_type": "code",
293
+ "execution_count": 20,
294
+ "metadata": {},
295
+ "outputs": [
296
+ {
297
+ "data": {
298
+ "application/vnd.jupyter.widget-view+json": {
299
+ "model_id": "78d7674e395848acb4586dd4dff1fee8",
300
+ "version_major": 2,
301
+ "version_minor": 0
302
+ },
303
+ "text/plain": [
304
+ " 0%| | 0/6 [00:00<?, ?it/s]"
305
+ ]
306
+ },
307
+ "metadata": {},
308
+ "output_type": "display_data"
309
+ },
310
+ {
311
+ "data": {
312
+ "application/vnd.jupyter.widget-view+json": {
313
+ "model_id": "ff6231d9469840229d992abb28a5740b",
314
+ "version_major": 2,
315
+ "version_minor": 0
316
+ },
317
+ "text/plain": [
318
+ "Uploading: 0.000%| | 0/727 elapsed<00:00 remaining<?"
319
+ ]
320
+ },
321
+ "metadata": {},
322
+ "output_type": "display_data"
323
+ },
324
+ {
325
+ "data": {
326
+ "application/vnd.jupyter.widget-view+json": {
327
+ "model_id": "53650ea045a24041a44626336d9a9c9b",
328
+ "version_major": 2,
329
+ "version_minor": 0
330
+ },
331
+ "text/plain": [
332
+ "Uploading: 0.000%| | 0/212 elapsed<00:00 remaining<?"
333
+ ]
334
+ },
335
+ "metadata": {},
336
+ "output_type": "display_data"
337
+ },
338
+ {
339
+ "data": {
340
+ "application/vnd.jupyter.widget-view+json": {
341
+ "model_id": "21e688c69c7a46a4bb7485d1d088c887",
342
+ "version_major": 2,
343
+ "version_minor": 0
344
+ },
345
+ "text/plain": [
346
+ "Uploading: 0.000%| | 0/470637416 elapsed<00:00 remaining<?"
347
+ ]
348
+ },
349
+ "metadata": {},
350
+ "output_type": "display_data"
351
+ },
352
+ {
353
+ "data": {
354
+ "application/vnd.jupyter.widget-view+json": {
355
+ "model_id": "c68dc00bc2ae4804b73780d0f23e21ba",
356
+ "version_major": 2,
357
+ "version_minor": 0
358
+ },
359
+ "text/plain": [
360
+ "Uploading: 0.000%| | 0/242 elapsed<00:00 remaining<?"
361
+ ]
362
+ },
363
+ "metadata": {},
364
+ "output_type": "display_data"
365
+ },
366
+ {
367
+ "data": {
368
+ "application/vnd.jupyter.widget-view+json": {
369
+ "model_id": "bb846d6b71b746749d59c0bf25779f61",
370
+ "version_major": 2,
371
+ "version_minor": 0
372
+ },
373
+ "text/plain": [
374
+ "Uploading: 0.000%| | 0/21034 elapsed<00:00 remaining<?"
375
+ ]
376
+ },
377
+ "metadata": {},
378
+ "output_type": "display_data"
379
+ },
380
+ {
381
+ "data": {
382
+ "application/vnd.jupyter.widget-view+json": {
383
+ "model_id": "afda6046a57c42dca546be6defd13f11",
384
+ "version_major": 2,
385
+ "version_minor": 0
386
+ },
387
+ "text/plain": [
388
+ "Uploading: 0.000%| | 0/56 elapsed<00:00 remaining<?"
389
+ ]
390
+ },
391
+ "metadata": {},
392
+ "output_type": "display_data"
393
+ },
394
+ {
395
+ "data": {
396
+ "application/vnd.jupyter.widget-view+json": {
397
+ "model_id": "df4c630a40954b9784a2a631be4ed4e3",
398
+ "version_major": 2,
399
+ "version_minor": 0
400
+ },
401
+ "text/plain": [
402
+ "Uploading: 0.000%| | 0/1015 elapsed<00:00 remaining<?"
403
+ ]
404
+ },
405
+ "metadata": {},
406
+ "output_type": "display_data"
407
+ },
408
+ {
409
+ "data": {
410
+ "application/vnd.jupyter.widget-view+json": {
411
+ "model_id": "4c6430695b7e43c0815fe278328e0448",
412
+ "version_major": 2,
413
+ "version_minor": 0
414
+ },
415
+ "text/plain": [
416
+ "Uploading: 0.000%| | 0/17082987 elapsed<00:00 remaining<?"
417
+ ]
418
+ },
419
+ "metadata": {},
420
+ "output_type": "display_data"
421
+ },
422
+ {
423
+ "data": {
424
+ "application/vnd.jupyter.widget-view+json": {
425
+ "model_id": "a1aa72dd0b344a26b22d8839aa12c4fb",
426
+ "version_major": 2,
427
+ "version_minor": 0
428
+ },
429
+ "text/plain": [
430
+ "Uploading: 0.000%| | 0/1512 elapsed<00:00 remaining<?"
431
+ ]
432
+ },
433
+ "metadata": {},
434
+ "output_type": "display_data"
435
+ },
436
+ {
437
+ "data": {
438
+ "application/vnd.jupyter.widget-view+json": {
439
+ "model_id": "2b599ffc95d4499b80673d873b6dfe7a",
440
+ "version_major": 2,
441
+ "version_minor": 0
442
+ },
443
+ "text/plain": [
444
+ "Uploading: 0.000%| | 0/14763260 elapsed<00:00 remaining<?"
445
+ ]
446
+ },
447
+ "metadata": {},
448
+ "output_type": "display_data"
449
+ },
450
+ {
451
+ "data": {
452
+ "application/vnd.jupyter.widget-view+json": {
453
+ "model_id": "fa11d5018fcb4ab094dac6c27128bcf5",
454
+ "version_major": 2,
455
+ "version_minor": 0
456
+ },
457
+ "text/plain": [
458
+ "Uploading: 0.000%| | 0/305 elapsed<00:00 remaining<?"
459
+ ]
460
+ },
461
+ "metadata": {},
462
+ "output_type": "display_data"
463
+ },
464
+ {
465
+ "data": {
466
+ "application/vnd.jupyter.widget-view+json": {
467
+ "model_id": "7e37354a957d45b88b7a25007b3c7889",
468
+ "version_major": 2,
469
+ "version_minor": 0
470
+ },
471
+ "text/plain": [
472
+ "Uploading: 0.000%| | 0/6678 elapsed<00:00 remaining<?"
473
+ ]
474
+ },
475
+ "metadata": {},
476
+ "output_type": "display_data"
477
+ },
478
+ {
479
+ "data": {
480
+ "application/vnd.jupyter.widget-view+json": {
481
+ "model_id": "f4e0c3e6d64b457bb01e5a0ac2162433",
482
+ "version_major": 2,
483
+ "version_minor": 0
484
+ },
485
+ "text/plain": [
486
+ "Uploading: 0.000%| | 0/216 elapsed<00:00 remaining<?"
487
+ ]
488
+ },
489
+ "metadata": {},
490
+ "output_type": "display_data"
491
+ },
492
+ {
493
+ "name": "stdout",
494
+ "output_type": "stream",
495
+ "text": [
496
+ "Model created, explore it at https://c.app.hopsworks.ai:443/p/1158296/models/job_matching_sentence_transformer/1\n",
497
+ "Model registered in Hopsworks Model Registry!\n"
498
+ ]
499
+ }
500
+ ],
501
+ "source": [
502
+ "# Save model artifacts to the Model Registry\n",
503
+ "job_matching_model.save(\"./finetuned_model\")\n",
504
+ "print(\"Model registered in Hopsworks Model Registry!\")"
505
+ ]
506
+ },
507
+ {
508
+ "cell_type": "code",
509
+ "execution_count": 22,
510
+ "metadata": {},
511
+ "outputs": [
512
+ {
513
+ "name": "stdout",
514
+ "output_type": "stream",
515
+ "text": [
516
+ "2025-01-08 19:44:05,458 INFO: Save model to C:\\Users\\Filip\\AppData\\Local\\Temp\\tmpa217ndkp\n"
517
+ ]
518
+ },
519
+ {
520
+ "data": {
521
+ "application/vnd.jupyter.widget-view+json": {
522
+ "model_id": "05f6bc89a66a4202b9f3b4b4fadee783",
523
+ "version_major": 2,
524
+ "version_minor": 0
525
+ },
526
+ "text/plain": [
527
+ "Computing widget examples: 0%| | 0/1 [00:00<?, ?example/s]"
528
+ ]
529
+ },
530
+ "metadata": {},
531
+ "output_type": "display_data"
532
+ },
533
+ {
534
+ "data": {
535
+ "application/vnd.jupyter.widget-view+json": {
536
+ "model_id": "c1688de9db8d46b6ba410bcdcff839fd",
537
+ "version_major": 2,
538
+ "version_minor": 0
539
+ },
540
+ "text/plain": [
541
+ "Computing widget examples: 0%| | 0/1 [00:00<?, ?example/s]"
542
+ ]
543
+ },
544
+ "metadata": {},
545
+ "output_type": "display_data"
546
+ },
547
+ {
548
+ "data": {
549
+ "application/vnd.jupyter.widget-view+json": {
550
+ "model_id": "c496a24b0d4044ffbbd9278a9f56996d",
551
+ "version_major": 2,
552
+ "version_minor": 0
553
+ },
554
+ "text/plain": [
555
+ "unigram.json: 0%| | 0.00/14.8M [00:00<?, ?B/s]"
556
+ ]
557
+ },
558
+ "metadata": {},
559
+ "output_type": "display_data"
560
+ },
561
+ {
562
+ "data": {
563
+ "application/vnd.jupyter.widget-view+json": {
564
+ "model_id": "d4f0e6b478aa4a8dad8c798761289a8f",
565
+ "version_major": 2,
566
+ "version_minor": 0
567
+ },
568
+ "text/plain": [
569
+ "model.safetensors: 0%| | 0.00/471M [00:00<?, ?B/s]"
570
+ ]
571
+ },
572
+ "metadata": {},
573
+ "output_type": "display_data"
574
+ },
575
+ {
576
+ "data": {
577
+ "application/vnd.jupyter.widget-view+json": {
578
+ "model_id": "e2a88564cf8b438bad836c593d4d78f6",
579
+ "version_major": 2,
580
+ "version_minor": 0
581
+ },
582
+ "text/plain": [
583
+ "Upload 3 LFS files: 0%| | 0/3 [00:00<?, ?it/s]"
584
+ ]
585
+ },
586
+ "metadata": {},
587
+ "output_type": "display_data"
588
+ },
589
+ {
590
+ "data": {
591
+ "application/vnd.jupyter.widget-view+json": {
592
+ "model_id": "b302ff2e4cf74982bccdcf9d3a221240",
593
+ "version_major": 2,
594
+ "version_minor": 0
595
+ },
596
+ "text/plain": [
597
+ "tokenizer.json: 0%| | 0.00/17.1M [00:00<?, ?B/s]"
598
+ ]
599
+ },
600
+ "metadata": {},
601
+ "output_type": "display_data"
602
+ },
603
+ {
604
+ "data": {
605
+ "text/plain": [
606
+ "'https://huggingface.co/forestav/job_matching_sentence_transformer/commit/7168a70785fae3fee6f5576b40a7556072ba31a2'"
607
+ ]
608
+ },
609
+ "execution_count": 22,
610
+ "metadata": {},
611
+ "output_type": "execute_result"
612
+ }
613
+ ],
614
+ "source": [
615
+ "# Push the model to huggingface\n",
616
+ "model.push_to_hub(\"forestav/job_matching_sentence_transformer\")"
617
+ ]
618
+ }
619
+ ],
620
+ "metadata": {
621
+ "kernelspec": {
622
+ "display_name": "venv",
623
+ "language": "python",
624
+ "name": "python3"
625
+ },
626
+ "language_info": {
627
+ "codemirror_mode": {
628
+ "name": "ipython",
629
+ "version": 3
630
+ },
631
+ "file_extension": ".py",
632
+ "mimetype": "text/x-python",
633
+ "name": "python",
634
+ "nbconvert_exporter": "python",
635
+ "pygments_lexer": "ipython3",
636
+ "version": "3.12.2"
637
+ }
638
+ },
639
+ "nbformat": 4,
640
+ "nbformat_minor": 2
641
+ }
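In the training notebook above, model.fit() is called with evaluator=None. A hedged sketch of wiring the validation split in as an evaluator, using EmbeddingSimilarityEvaluator from sentence-transformers (val_examples is the list built by prepare_examples(val_df) in the notebook); treat the evaluation_steps value as an assumption, not a tuned setting.

from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# Each InputExample holds [resume_text, job_description] with a 0/1 relevance label,
# so the evaluator correlates cosine similarity with the is_relevant feedback.
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(val_examples, name="job-matching-val")

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    evaluation_steps=len(train_dataloader),  # evaluate roughly once per epoch
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    output_path="./finetuned_model"
)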