bobox committed
Commit 519f6b0 · verified · 1 Parent(s): 545ecaa

Training in progress, step 9823, checkpoint

last-checkpoint/1_Pooling/config.json ADDED
@@ -0,0 +1,10 @@
+ {
+ "word_embedding_dimension": 768,
+ "pooling_mode_cls_token": false,
+ "pooling_mode_mean_tokens": true,
+ "pooling_mode_max_tokens": false,
+ "pooling_mode_mean_sqrt_len_tokens": false,
+ "pooling_mode_weightedmean_tokens": false,
+ "pooling_mode_lasttoken": false,
+ "include_prompt": true
+ }
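
This pooling configuration selects plain mean pooling over token embeddings (`pooling_mode_mean_tokens: true`). As a rough illustration of what that setting does, here is a minimal sketch of masked mean pooling; the `mean_pool` helper is illustrative and not part of the checkpoint.

```python
import torch

def mean_pool(token_embeddings: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    # token_embeddings: (batch, seq_len, 768); attention_mask: (batch, seq_len)
    mask = attention_mask.unsqueeze(-1).float()      # (batch, seq_len, 1)
    summed = (token_embeddings * mask).sum(dim=1)    # ignore padding positions
    counts = mask.sum(dim=1).clamp(min=1e-9)         # number of real tokens per sentence
    return summed / counts                           # (batch, 768) sentence embeddings
```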
last-checkpoint/README.md ADDED
@@ -0,0 +1,653 @@
+ ---
+ language:
+ - en
+ library_name: sentence-transformers
+ tags:
+ - sentence-transformers
+ - sentence-similarity
+ - feature-extraction
+ - dataset_size:100K<n<1M
+ - loss:MultipleNegativesRankingLoss
+ base_model: microsoft/deberta-v3-base
+ metrics:
+ - pearson_cosine
+ - spearman_cosine
+ - pearson_manhattan
+ - spearman_manhattan
+ - pearson_euclidean
+ - spearman_euclidean
+ - pearson_dot
+ - spearman_dot
+ - pearson_max
+ - spearman_max
+ - cosine_accuracy
+ - cosine_accuracy_threshold
+ - cosine_f1
+ - cosine_f1_threshold
+ - cosine_precision
+ - cosine_recall
+ - cosine_ap
+ - dot_accuracy
+ - dot_accuracy_threshold
+ - dot_f1
+ - dot_f1_threshold
+ - dot_precision
+ - dot_recall
+ - dot_ap
+ - manhattan_accuracy
+ - manhattan_accuracy_threshold
+ - manhattan_f1
+ - manhattan_f1_threshold
+ - manhattan_precision
+ - manhattan_recall
+ - manhattan_ap
+ - euclidean_accuracy
+ - euclidean_accuracy_threshold
+ - euclidean_f1
+ - euclidean_f1_threshold
+ - euclidean_precision
+ - euclidean_recall
+ - euclidean_ap
+ - max_accuracy
+ - max_accuracy_threshold
+ - max_f1
+ - max_f1_threshold
+ - max_precision
+ - max_recall
+ - max_ap
+ widget:
+ - source_sentence: profit rather
+ sentences:
+ - Making money rather.
+ - A racecar is being watched.
+ - A man is standing in the doorway.
+ - source_sentence: life track
+ sentences:
+ - There is.
+ - The man is wearing an apron.
+ - A man playing billiards at a bar.
+ - source_sentence: Fiesta time!
+ sentences:
+ - It is a special day.
+ - The world is getting better.
+ - A man hammering nails on a shed.
+ - source_sentence: 'The family. '
+ sentences:
+ - A man is at his sisters party.
+ - the man is training some guys
+ - Commuters wait for to cross a street.
+ - source_sentence: I don't know.
+ sentences:
+ - I'm not sure about anything.
+ - A guy is outside in the snow
+ - The dogs run a race at the track.
+ pipeline_tag: sentence-similarity
+ model-index:
+ - name: SentenceTransformer based on microsoft/deberta-v3-base
+ results:
+ - task:
+ type: semantic-similarity
+ name: Semantic Similarity
+ dataset:
+ name: Unknown
+ type: unknown
+ metrics:
+ - type: pearson_cosine
+ value: 0.35614155568929684
+ name: Pearson Cosine
+ - type: spearman_cosine
+ value: 0.4042062369647017
+ name: Spearman Cosine
+ - type: pearson_manhattan
+ value: 0.44470114795339144
+ name: Pearson Manhattan
+ - type: spearman_manhattan
+ value: 0.464389588301289
+ name: Spearman Manhattan
+ - type: pearson_euclidean
+ value: 0.4073816956345048
+ name: Pearson Euclidean
+ - type: spearman_euclidean
+ value: 0.42806381869427496
+ name: Spearman Euclidean
+ - type: pearson_dot
+ value: -0.033633706160895414
+ name: Pearson Dot
+ - type: spearman_dot
+ value: -0.026115764956036586
+ name: Spearman Dot
+ - type: pearson_max
+ value: 0.44470114795339144
+ name: Pearson Max
+ - type: spearman_max
+ value: 0.464389588301289
+ name: Spearman Max
+ - task:
+ type: binary-classification
+ name: Binary Classification
+ dataset:
+ name: Unknown
+ type: unknown
+ metrics:
+ - type: cosine_accuracy
+ value: 0.6648722420198651
+ name: Cosine Accuracy
+ - type: cosine_accuracy_threshold
+ value: 0.7642883062362671
+ name: Cosine Accuracy Threshold
+ - type: cosine_f1
+ value: 0.7061340941512125
+ name: Cosine F1
+ - type: cosine_f1_threshold
+ value: 0.6351689100265503
+ name: Cosine F1 Threshold
+ - type: cosine_precision
+ value: 0.5953693495038589
+ name: Cosine Precision
+ - type: cosine_recall
+ value: 0.8675332262304659
+ name: Cosine Recall
+ - type: cosine_ap
+ value: 0.7283929467215002
+ name: Cosine Ap
+ - type: dot_accuracy
+ value: 0.6397755705512169
+ name: Dot Accuracy
+ - type: dot_accuracy_threshold
+ value: 268.03167724609375
+ name: Dot Accuracy Threshold
+ - type: dot_f1
+ value: 0.7021864211737631
+ name: Dot F1
+ - type: dot_f1_threshold
+ value: 216.1470947265625
+ name: Dot F1 Threshold
+ - type: dot_precision
+ value: 0.5793221304471661
+ name: Dot Precision
+ - type: dot_recall
+ value: 0.8911932233094786
+ name: Dot Recall
+ - type: dot_ap
+ value: 0.6799114732445778
+ name: Dot Ap
+ - type: manhattan_accuracy
+ value: 0.6606262794753204
+ name: Manhattan Accuracy
+ - type: manhattan_accuracy_threshold
+ value: 248.9168701171875
+ name: Manhattan Accuracy Threshold
+ - type: manhattan_f1
+ value: 0.703255925305243
+ name: Manhattan F1
+ - type: manhattan_f1_threshold
+ value: 306.02117919921875
+ name: Manhattan F1 Threshold
+ - type: manhattan_precision
+ value: 0.5957813609167427
+ name: Manhattan Precision
+ - type: manhattan_recall
+ value: 0.8580400175259237
+ name: Manhattan Recall
+ - type: manhattan_ap
+ value: 0.7294072443461903
+ name: Manhattan Ap
+ - type: euclidean_accuracy
+ value: 0.6580483736447039
+ name: Euclidean Accuracy
+ - type: euclidean_accuracy_threshold
+ value: 12.722024917602539
+ name: Euclidean Accuracy Threshold
+ - type: euclidean_f1
+ value: 0.7027586626880216
+ name: Euclidean F1
+ - type: euclidean_f1_threshold
+ value: 16.08021354675293
+ name: Euclidean F1 Threshold
+ - type: euclidean_precision
+ value: 0.6026739085021935
+ name: Euclidean Precision
+ - type: euclidean_recall
+ value: 0.8427048342339711
+ name: Euclidean Recall
+ - type: euclidean_ap
+ value: 0.7268148607241872
+ name: Euclidean Ap
+ - type: max_accuracy
+ value: 0.6648722420198651
+ name: Max Accuracy
+ - type: max_accuracy_threshold
+ value: 268.03167724609375
+ name: Max Accuracy Threshold
+ - type: max_f1
+ value: 0.7061340941512125
+ name: Max F1
+ - type: max_f1_threshold
+ value: 306.02117919921875
+ name: Max F1 Threshold
+ - type: max_precision
+ value: 0.6026739085021935
+ name: Max Precision
+ - type: max_recall
+ value: 0.8911932233094786
+ name: Max Recall
+ - type: max_ap
+ value: 0.7294072443461903
+ name: Max Ap
+ ---
+
+ # SentenceTransformer based on microsoft/deberta-v3-base
+
+ This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [microsoft/deberta-v3-base](https://huggingface.co/microsoft/deberta-v3-base) on the [stanfordnlp/snli](https://huggingface.co/datasets/stanfordnlp/snli) dataset. It maps sentences & paragraphs to a 768-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.
+
+ ## Model Details
+
+ ### Model Description
+ - **Model Type:** Sentence Transformer
+ - **Base model:** [microsoft/deberta-v3-base](https://huggingface.co/microsoft/deberta-v3-base) <!-- at revision 8ccc9b6f36199bec6961081d44eb72fb3f7353f3 -->
+ - **Maximum Sequence Length:** 512 tokens
+ - **Output Dimensionality:** 768 dimensions
+ - **Similarity Function:** Cosine Similarity
+ - **Training Dataset:**
+ - [stanfordnlp/snli](https://huggingface.co/datasets/stanfordnlp/snli)
+ - **Language:** en
+ <!-- - **License:** Unknown -->
+
+ ### Model Sources
+
+ - **Documentation:** [Sentence Transformers Documentation](https://sbert.net)
+ - **Repository:** [Sentence Transformers on GitHub](https://github.com/UKPLab/sentence-transformers)
+ - **Hugging Face:** [Sentence Transformers on Hugging Face](https://huggingface.co/models?library=sentence-transformers)
+
+ ### Full Model Architecture
+
+ ```
+ SentenceTransformer(
+ (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: DebertaV2Model
+ (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
+ )
+ ```
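
The two-module stack above (a DeBERTa-v3 transformer followed by mean pooling) can also be assembled by hand with the `sentence_transformers.models` API. This is only a sketch of the untrained architecture built from the base model; the finetuned checkpoint itself should be loaded by name as shown in the Usage section.

```python
from sentence_transformers import SentenceTransformer, models

# Sketch: rebuild the Transformer + Pooling stack listed above from the base model.
word_embedding_model = models.Transformer("microsoft/deberta-v3-base", max_seq_length=512)
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),  # 768
    pooling_mode="mean",  # corresponds to pooling_mode_mean_tokens=True
)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
```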
+
+ ## Usage
+
+ ### Direct Usage (Sentence Transformers)
+
+ First install the Sentence Transformers library:
+
+ ```bash
+ pip install -U sentence-transformers
+ ```
+
+ Then you can load this model and run inference.
+ ```python
+ from sentence_transformers import SentenceTransformer
+
+ # Download from the 🤗 Hub
+ model = SentenceTransformer("bobox/DeBERTaV3-large-SentenceTransformer-0.01n")
+ # Run inference
+ sentences = [
+ "I don't know.",
+ "I'm not sure about anything.",
+ 'A guy is outside in the snow',
+ ]
+ embeddings = model.encode(sentences)
+ print(embeddings.shape)
+ # [3, 768]
+
+ # Get the similarity scores for the embeddings
+ similarities = model.similarity(embeddings, embeddings)
+ print(similarities.shape)
+ # [3, 3]
+ ```
+
+ <!--
+ ### Direct Usage (Transformers)
+
+ <details><summary>Click to see the direct usage in Transformers</summary>
+
+ </details>
+ -->
+
+ <!--
+ ### Downstream Usage (Sentence Transformers)
+
+ You can finetune this model on your own dataset.
+
+ <details><summary>Click to expand</summary>
+
+ </details>
+ -->
+
+ <!--
+ ### Out-of-Scope Use
+
+ *List how the model may foreseeably be misused and address what users ought not to do with the model.*
+ -->
+
+ ## Evaluation
+
+ ### Metrics
+
+ #### Semantic Similarity
+
+ * Evaluated with [<code>EmbeddingSimilarityEvaluator</code>](https://sbert.net/docs/package_reference/sentence_transformer/evaluation.html#sentence_transformers.evaluation.EmbeddingSimilarityEvaluator)
+
+ | Metric | Value |
+ |:--------------------|:-----------|
+ | pearson_cosine | 0.3561 |
+ | **spearman_cosine** | **0.4042** |
+ | pearson_manhattan | 0.4447 |
+ | spearman_manhattan | 0.4644 |
+ | pearson_euclidean | 0.4074 |
+ | spearman_euclidean | 0.4281 |
+ | pearson_dot | -0.0336 |
+ | spearman_dot | -0.0261 |
+ | pearson_max | 0.4447 |
+ | spearman_max | 0.4644 |
+
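
For reference, `spearman_cosine` (the bolded metric above) is the Spearman rank correlation between the cosine similarities of the embedded sentence pairs and the gold similarity labels. A minimal sketch, assuming `emb1`, `emb2`, and `gold_scores` are NumPy arrays for the evaluation pairs:

```python
import numpy as np
from scipy.stats import pearsonr, spearmanr

def cosine_correlations(emb1: np.ndarray, emb2: np.ndarray, gold_scores: np.ndarray):
    # Cosine similarity of each aligned pair of sentence embeddings.
    cos = (emb1 * emb2).sum(axis=1) / (
        np.linalg.norm(emb1, axis=1) * np.linalg.norm(emb2, axis=1)
    )
    return pearsonr(cos, gold_scores)[0], spearmanr(cos, gold_scores)[0]
```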
+ #### Binary Classification
+
+ * Evaluated with [<code>BinaryClassificationEvaluator</code>](https://sbert.net/docs/package_reference/sentence_transformer/evaluation.html#sentence_transformers.evaluation.BinaryClassificationEvaluator)
+
+ | Metric | Value |
+ |:-----------------------------|:-----------|
+ | cosine_accuracy | 0.6649 |
+ | cosine_accuracy_threshold | 0.7643 |
+ | cosine_f1 | 0.7061 |
+ | cosine_f1_threshold | 0.6352 |
+ | cosine_precision | 0.5954 |
+ | cosine_recall | 0.8675 |
+ | cosine_ap | 0.7284 |
+ | dot_accuracy | 0.6398 |
+ | dot_accuracy_threshold | 268.0317 |
+ | dot_f1 | 0.7022 |
+ | dot_f1_threshold | 216.1471 |
+ | dot_precision | 0.5793 |
+ | dot_recall | 0.8912 |
+ | dot_ap | 0.6799 |
+ | manhattan_accuracy | 0.6606 |
+ | manhattan_accuracy_threshold | 248.9169 |
+ | manhattan_f1 | 0.7033 |
+ | manhattan_f1_threshold | 306.0212 |
+ | manhattan_precision | 0.5958 |
+ | manhattan_recall | 0.858 |
+ | manhattan_ap | 0.7294 |
+ | euclidean_accuracy | 0.658 |
+ | euclidean_accuracy_threshold | 12.722 |
+ | euclidean_f1 | 0.7028 |
+ | euclidean_f1_threshold | 16.0802 |
+ | euclidean_precision | 0.6027 |
+ | euclidean_recall | 0.8427 |
+ | euclidean_ap | 0.7268 |
+ | max_accuracy | 0.6649 |
+ | max_accuracy_threshold | 268.0317 |
+ | max_f1 | 0.7061 |
+ | max_f1_threshold | 306.0212 |
+ | max_precision | 0.6027 |
+ | max_recall | 0.8912 |
+ | **max_ap** | **0.7294** |
+
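
The `*_accuracy_threshold` and `*_f1_threshold` values above are the decision thresholds on the similarity (or distance) scores that maximize accuracy and F1 for the pair labels. A minimal sketch of the idea, assuming `scores` holds cosine similarities and `labels` holds 0/1 targets:

```python
import numpy as np
from sklearn.metrics import average_precision_score, f1_score

def best_f1_threshold(scores: np.ndarray, labels: np.ndarray):
    # Sweep every observed score as a candidate threshold and keep the best F1.
    candidates = np.unique(scores)
    best_t = max(candidates, key=lambda t: f1_score(labels, scores >= t))
    return best_t, f1_score(labels, scores >= best_t), average_precision_score(labels, scores)
```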
+ <!--
+ ## Bias, Risks and Limitations
+
+ *What are the known or foreseeable issues stemming from this model? You could also flag here known failure cases or weaknesses of the model.*
+ -->
+
+ <!--
+ ### Recommendations
+
+ *What are recommendations with respect to the foreseeable issues? For example, filtering explicit content.*
+ -->
+
+ ## Training Details
+
+ ### Training Dataset
+
+ #### stanfordnlp/snli
+
+ * Dataset: [stanfordnlp/snli](https://huggingface.co/datasets/stanfordnlp/snli) at [cdb5c3d](https://huggingface.co/datasets/stanfordnlp/snli/tree/cdb5c3d5eed6ead6e5a341c8e56e669bb666725b)
+ * Size: 314,315 training samples
+ * Columns: <code>sentence1</code>, <code>sentence2</code>, and <code>label</code>
+ * Approximate statistics based on the first 1000 samples:
+ | | sentence1 | sentence2 | label |
+ |:--------|:-----------------------------------------------------------------------------------|:----------------------------------------------------------------------------------|:-----------------------------|
+ | type | string | string | int |
+ | details | <ul><li>min: 5 tokens</li><li>mean: 16.62 tokens</li><li>max: 62 tokens</li></ul> | <ul><li>min: 4 tokens</li><li>mean: 9.46 tokens</li><li>max: 29 tokens</li></ul> | <ul><li>0: 100.00%</li></ul> |
+ * Samples:
+ | sentence1 | sentence2 | label |
+ |:----------------------------------------------------------------------------|:--------------------------------------------------|:---------------|
+ | <code>A person on a horse jumps over a broken down airplane.</code> | <code>A person is outdoors, on a horse.</code> | <code>0</code> |
+ | <code>Children smiling and waving at camera</code> | <code>There are children present</code> | <code>0</code> |
+ | <code>A boy is jumping on skateboard in the middle of a red bridge.</code> | <code>The boy does a skateboarding trick.</code> | <code>0</code> |
+ * Loss: [<code>MultipleNegativesRankingLoss</code>](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#multiplenegativesrankingloss) with these parameters:
+ ```json
+ {
+ "scale": 20.0,
+ "similarity_fct": "cos_sim"
+ }
+ ```
+
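
In essence, MultipleNegativesRankingLoss treats each `sentence2` in a batch as the positive for its own `sentence1` and as an in-batch negative for every other one, then applies cross-entropy over the scaled cosine similarities. A minimal sketch of that computation (not the library's implementation):

```python
import torch
import torch.nn.functional as F

def mnr_loss(anchors: torch.Tensor, positives: torch.Tensor, scale: float = 20.0) -> torch.Tensor:
    # Row i of `positives` is the positive for row i of `anchors`;
    # every other row in the batch acts as an in-batch negative.
    a = F.normalize(anchors, dim=-1)
    p = F.normalize(positives, dim=-1)
    scores = scale * a @ p.T                      # (batch, batch) scaled cosine similarities
    labels = torch.arange(scores.size(0), device=scores.device)
    return F.cross_entropy(scores, labels)
```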
+ ### Evaluation Dataset
+
+ #### sentence-transformers/stsb
+
+ * Dataset: [sentence-transformers/stsb](https://huggingface.co/datasets/sentence-transformers/stsb) at [ab7a5ac](https://huggingface.co/datasets/sentence-transformers/stsb/tree/ab7a5ac0e35aa22088bdcf23e7fd99b220e53308)
+ * Size: 13,189 evaluation samples
+ * Columns: <code>premise</code>, <code>hypothesis</code>, and <code>label</code>
+ * Approximate statistics based on the first 1000 samples:
+ | | premise | hypothesis | label |
+ |:--------|:-----------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------|:-------------------------------------------------|
+ | type | string | string | int |
+ | details | <ul><li>min: 6 tokens</li><li>mean: 17.28 tokens</li><li>max: 59 tokens</li></ul> | <ul><li>min: 4 tokens</li><li>mean: 10.53 tokens</li><li>max: 32 tokens</li></ul> | <ul><li>0: ~48.70%</li><li>1: ~51.30%</li></ul> |
+ * Samples:
+ | premise | hypothesis | label |
+ |:---------------------------------------------------------------------------------------------------------|:----------------------------------------------------|:---------------|
+ | <code>This church choir sings to the masses as they sing joyous songs from the book at a church.</code> | <code>The church has cracks in the ceiling.</code> | <code>0</code> |
+ | <code>This church choir sings to the masses as they sing joyous songs from the book at a church.</code> | <code>The church is filled with song.</code> | <code>1</code> |
+ | <code>A woman with a green headscarf, blue shirt and a very big grin.</code> | <code>The woman is young.</code> | <code>0</code> |
+ * Loss: [<code>MultipleNegativesRankingLoss</code>](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#multiplenegativesrankingloss) with these parameters:
+ ```json
+ {
+ "scale": 20.0,
+ "similarity_fct": "cos_sim"
+ }
+ ```
+
+ ### Training Hyperparameters
+ #### Non-Default Hyperparameters
+
+ - `eval_strategy`: steps
+ - `per_device_train_batch_size`: 32
+ - `per_device_eval_batch_size`: 64
+ - `num_train_epochs`: 5
+ - `lr_scheduler_type`: cosine
+ - `warmup_ratio`: 0.25
+ - `save_safetensors`: False
+ - `fp16`: True
+ - `push_to_hub`: True
+ - `hub_model_id`: bobox/DeBERTaV3-large-SentenceTransformer-0.01n
+ - `hub_strategy`: checkpoint
+ - `batch_sampler`: no_duplicates
+
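
As a rough sketch, the non-default settings listed above correspond to training arguments along these lines in Sentence Transformers 3.0; the `output_dir` is illustrative and every other field keeps its default:

```python
from sentence_transformers import SentenceTransformerTrainingArguments
from sentence_transformers.training_args import BatchSamplers

args = SentenceTransformerTrainingArguments(
    output_dir="output",  # illustrative, not taken from the checkpoint
    eval_strategy="steps",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    num_train_epochs=5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.25,
    save_safetensors=False,
    fp16=True,
    push_to_hub=True,
    hub_model_id="bobox/DeBERTaV3-large-SentenceTransformer-0.01n",
    hub_strategy="checkpoint",
    batch_sampler=BatchSamplers.NO_DUPLICATES,
)
```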
+ #### All Hyperparameters
+ <details><summary>Click to expand</summary>
+
+ - `overwrite_output_dir`: False
+ - `do_predict`: False
+ - `eval_strategy`: steps
+ - `prediction_loss_only`: True
+ - `per_device_train_batch_size`: 32
+ - `per_device_eval_batch_size`: 64
+ - `per_gpu_train_batch_size`: None
+ - `per_gpu_eval_batch_size`: None
+ - `gradient_accumulation_steps`: 1
+ - `eval_accumulation_steps`: None
+ - `learning_rate`: 5e-05
+ - `weight_decay`: 0.0
+ - `adam_beta1`: 0.9
+ - `adam_beta2`: 0.999
+ - `adam_epsilon`: 1e-08
+ - `max_grad_norm`: 1.0
+ - `num_train_epochs`: 5
+ - `max_steps`: -1
+ - `lr_scheduler_type`: cosine
+ - `lr_scheduler_kwargs`: {}
+ - `warmup_ratio`: 0.25
+ - `warmup_steps`: 0
+ - `log_level`: passive
+ - `log_level_replica`: warning
+ - `log_on_each_node`: True
+ - `logging_nan_inf_filter`: True
+ - `save_safetensors`: False
+ - `save_on_each_node`: False
+ - `save_only_model`: False
+ - `restore_callback_states_from_checkpoint`: False
+ - `no_cuda`: False
+ - `use_cpu`: False
+ - `use_mps_device`: False
+ - `seed`: 42
+ - `data_seed`: None
+ - `jit_mode_eval`: False
+ - `use_ipex`: False
+ - `bf16`: False
+ - `fp16`: True
+ - `fp16_opt_level`: O1
+ - `half_precision_backend`: auto
+ - `bf16_full_eval`: False
+ - `fp16_full_eval`: False
+ - `tf32`: None
+ - `local_rank`: 0
+ - `ddp_backend`: None
+ - `tpu_num_cores`: None
+ - `tpu_metrics_debug`: False
+ - `debug`: []
+ - `dataloader_drop_last`: False
+ - `dataloader_num_workers`: 0
+ - `dataloader_prefetch_factor`: None
+ - `past_index`: -1
+ - `disable_tqdm`: False
+ - `remove_unused_columns`: True
+ - `label_names`: None
+ - `load_best_model_at_end`: False
+ - `ignore_data_skip`: False
+ - `fsdp`: []
+ - `fsdp_min_num_params`: 0
+ - `fsdp_config`: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
+ - `fsdp_transformer_layer_cls_to_wrap`: None
+ - `accelerator_config`: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
+ - `deepspeed`: None
+ - `label_smoothing_factor`: 0.0
+ - `optim`: adamw_torch
+ - `optim_args`: None
+ - `adafactor`: False
+ - `group_by_length`: False
+ - `length_column_name`: length
+ - `ddp_find_unused_parameters`: None
+ - `ddp_bucket_cap_mb`: None
+ - `ddp_broadcast_buffers`: False
+ - `dataloader_pin_memory`: True
+ - `dataloader_persistent_workers`: False
+ - `skip_memory_metrics`: True
+ - `use_legacy_prediction_loop`: False
+ - `push_to_hub`: True
+ - `resume_from_checkpoint`: None
+ - `hub_model_id`: bobox/DeBERTaV3-large-SentenceTransformer-0.01n
+ - `hub_strategy`: checkpoint
+ - `hub_private_repo`: False
+ - `hub_always_push`: False
+ - `gradient_checkpointing`: False
+ - `gradient_checkpointing_kwargs`: None
+ - `include_inputs_for_metrics`: False
+ - `eval_do_concat_batches`: True
+ - `fp16_backend`: auto
+ - `push_to_hub_model_id`: None
+ - `push_to_hub_organization`: None
+ - `mp_parameters`: 
+ - `auto_find_batch_size`: False
+ - `full_determinism`: False
+ - `torchdynamo`: None
+ - `ray_scope`: last
+ - `ddp_timeout`: 1800
+ - `torch_compile`: False
+ - `torch_compile_backend`: None
+ - `torch_compile_mode`: None
+ - `dispatch_batches`: None
+ - `split_batches`: None
+ - `include_tokens_per_second`: False
+ - `include_num_input_tokens_seen`: False
+ - `neftune_noise_alpha`: None
+ - `optim_target_modules`: None
+ - `batch_eval_metrics`: False
+ - `batch_sampler`: no_duplicates
+ - `multi_dataset_batch_sampler`: proportional
+
+ </details>
+
+ ### Training Logs
+ | Epoch | Step | Training Loss | loss | max_ap | spearman_cosine |
+ |:------:|:----:|:-------------:|:------:|:------:|:---------------:|
+ | None | 0 | - | 3.2007 | 0.5917 | 0.4042 |
+ | 0.1250 | 1228 | 2.3115 | 1.3295 | 0.6783 | - |
+ | 0.2500 | 2456 | 1.1344 | 1.0007 | 0.7048 | - |
+ | 0.3750 | 3684 | 0.9827 | 0.8551 | 0.7091 | - |
+ | 0.5001 | 4912 | 0.9045 | 0.7483 | 0.7148 | - |
+ | 0.6251 | 6140 | 0.6488 | 0.6057 | 0.7276 | - |
+ | 0.7501 | 7368 | 0.1224 | 0.6683 | 0.7358 | - |
+ | 0.8751 | 8596 | 0.1063 | 0.6895 | 0.7294 | - |
+
+
+ ### Framework Versions
+ - Python: 3.10.12
+ - Sentence Transformers: 3.0.0
+ - Transformers: 4.41.2
+ - PyTorch: 2.3.0+cu121
+ - Accelerate: 0.30.1
+ - Datasets: 2.19.2
+ - Tokenizers: 0.19.1
+
+ ## Citation
+
+ ### BibTeX
+
+ #### Sentence Transformers
+ ```bibtex
+ @inproceedings{reimers-2019-sentence-bert,
+ title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
+ author = "Reimers, Nils and Gurevych, Iryna",
+ booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
+ month = "11",
+ year = "2019",
+ publisher = "Association for Computational Linguistics",
+ url = "https://arxiv.org/abs/1908.10084",
+ }
+ ```
+
+ #### MultipleNegativesRankingLoss
+ ```bibtex
+ @misc{henderson2017efficient,
+ title={Efficient Natural Language Response Suggestion for Smart Reply},
+ author={Matthew Henderson and Rami Al-Rfou and Brian Strope and Yun-hsuan Sung and Laszlo Lukacs and Ruiqi Guo and Sanjiv Kumar and Balint Miklos and Ray Kurzweil},
+ year={2017},
+ eprint={1705.00652},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL}
+ }
+ ```
+
+ <!--
+ ## Glossary
+
+ *Clearly define terms in order to be accessible across audiences.*
+ -->
+
+ <!--
+ ## Model Card Authors
+
+ *Lists the people who create the model card, providing recognition and accountability for the detailed work that goes into its construction.*
+ -->
+
+ <!--
+ ## Model Card Contact
+
+ *Provides a way for people who have updates to the Model Card, suggestions, or questions, to contact the Model Card authors.*
+ -->
last-checkpoint/added_tokens.json ADDED
@@ -0,0 +1,3 @@
+ {
+ "[MASK]": 128000
+ }
last-checkpoint/config.json ADDED
@@ -0,0 +1,35 @@
+ {
+ "_name_or_path": "microsoft/deberta-v3-base",
+ "architectures": [
+ "DebertaV2Model"
+ ],
+ "attention_probs_dropout_prob": 0.1,
+ "hidden_act": "gelu",
+ "hidden_dropout_prob": 0.1,
+ "hidden_size": 768,
+ "initializer_range": 0.02,
+ "intermediate_size": 3072,
+ "layer_norm_eps": 1e-07,
+ "max_position_embeddings": 512,
+ "max_relative_positions": -1,
+ "model_type": "deberta-v2",
+ "norm_rel_ebd": "layer_norm",
+ "num_attention_heads": 12,
+ "num_hidden_layers": 12,
+ "pad_token_id": 0,
+ "pooler_dropout": 0,
+ "pooler_hidden_act": "gelu",
+ "pooler_hidden_size": 768,
+ "pos_att_type": [
+ "p2c",
+ "c2p"
+ ],
+ "position_biased_input": false,
+ "position_buckets": 256,
+ "relative_attention": true,
+ "share_att_key": true,
+ "torch_dtype": "float32",
+ "transformers_version": "4.41.2",
+ "type_vocab_size": 0,
+ "vocab_size": 128100
+ }
last-checkpoint/config_sentence_transformers.json ADDED
@@ -0,0 +1,10 @@
+ {
+ "__version__": {
+ "sentence_transformers": "3.0.0",
+ "transformers": "4.41.2",
+ "pytorch": "2.3.0+cu121"
+ },
+ "prompts": {},
+ "default_prompt_name": null,
+ "similarity_fn_name": null
+ }
last-checkpoint/modules.json ADDED
@@ -0,0 +1,14 @@
+ [
+ {
+ "idx": 0,
+ "name": "0",
+ "path": "",
+ "type": "sentence_transformers.models.Transformer"
+ },
+ {
+ "idx": 1,
+ "name": "1",
+ "path": "1_Pooling",
+ "type": "sentence_transformers.models.Pooling"
+ }
+ ]
last-checkpoint/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:25831fee8a56126ffd94bd472a29dd638517430a70c9d55e5521916f42d41cd0
+ size 1470818042
last-checkpoint/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:964309fb41343f220f0134621218c2e95faf9b4c3b23ec32bf293a2c6dc0c7f2
+ size 735393442
last-checkpoint/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f787d94dbd33f4586c396382ebc4b31f1eb9a7ff215e29da3f8f0beae15a8ce7
+ size 14244
last-checkpoint/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5afa3d56abf40a30335967d6a0973486eb802f6a6a8af4dc50831b2b2486b3e8
+ size 1064
last-checkpoint/sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
+ {
+ "max_seq_length": 512,
+ "do_lower_case": false
+ }
last-checkpoint/special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
+ {
+ "bos_token": "[CLS]",
+ "cls_token": "[CLS]",
+ "eos_token": "[SEP]",
+ "mask_token": "[MASK]",
+ "pad_token": "[PAD]",
+ "sep_token": "[SEP]",
+ "unk_token": {
+ "content": "[UNK]",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
last-checkpoint/spm.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c679fbf93643d19aab7ee10c0b99e460bdbc02fedf34b92b05af343b4af586fd
+ size 2464616
last-checkpoint/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
last-checkpoint/tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
+ {
+ "added_tokens_decoder": {
+ "0": {
+ "content": "[PAD]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "[CLS]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "[SEP]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "3": {
+ "content": "[UNK]",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128000": {
+ "content": "[MASK]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "[CLS]",
+ "clean_up_tokenization_spaces": true,
+ "cls_token": "[CLS]",
+ "do_lower_case": false,
+ "eos_token": "[SEP]",
+ "mask_token": "[MASK]",
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "[PAD]",
+ "sep_token": "[SEP]",
+ "sp_model_kwargs": {},
+ "split_by_punct": false,
+ "tokenizer_class": "DebertaV2Tokenizer",
+ "unk_token": "[UNK]",
+ "vocab_type": "spm"
+ }
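
The tokenizer is a SentencePiece-based `DebertaV2Tokenizer` with `[MASK]` added at id 128000. As a small sketch of loading and inspecting it from the base model (a local copy of this checkpoint directory would work the same way):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")
enc = tokenizer("A man playing billiards at a bar.", return_tensors="pt")
# Tokens are wrapped in [CLS] ... [SEP], matching the special tokens defined above.
print(tokenizer.convert_ids_to_tokens(enc["input_ids"][0]))
```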
last-checkpoint/trainer_state.json ADDED
@@ -0,0 +1,383 @@
+ {
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 1.0,
+ "eval_steps": 1228,
+ "global_step": 9823,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.1250127252366894,
+ "grad_norm": 15.43435287475586,
+ "learning_rate": 4.984119227950159e-06,
+ "loss": 2.3115,
+ "step": 1228
+ },
+ {
+ "epoch": 0.1250127252366894,
+ "eval_cosine_accuracy": 0.6210478428993859,
+ "eval_cosine_accuracy_threshold": 0.848922610282898,
+ "eval_cosine_ap": 0.6734391704854221,
+ "eval_cosine_f1": 0.6875199829478844,
+ "eval_cosine_f1_threshold": 0.6612564921379089,
+ "eval_cosine_precision": 0.5412366809296082,
+ "eval_cosine_recall": 0.9421644515846356,
+ "eval_dot_accuracy": 0.5711577830009856,
+ "eval_dot_accuracy_threshold": 343.60479736328125,
+ "eval_dot_ap": 0.5947527693779144,
+ "eval_dot_f1": 0.6843559977888336,
+ "eval_dot_f1_threshold": 204.2578887939453,
+ "eval_dot_precision": 0.5216825007661661,
+ "eval_dot_recall": 0.99445012414196,
+ "eval_euclidean_accuracy": 0.6265827583592387,
+ "eval_euclidean_accuracy_threshold": 12.094720840454102,
+ "eval_euclidean_ap": 0.6783368773007193,
+ "eval_euclidean_f1": 0.6879386675305043,
+ "eval_euclidean_f1_threshold": 17.03110122680664,
+ "eval_euclidean_precision": 0.545695931477516,
+ "eval_euclidean_recall": 0.9304805024098145,
+ "eval_loss": 1.3295445442199707,
+ "eval_manhattan_accuracy": 0.628023352793995,
+ "eval_manhattan_accuracy_threshold": 217.5894012451172,
+ "eval_manhattan_ap": 0.6782312637044651,
+ "eval_manhattan_f1": 0.6873385012919898,
+ "eval_manhattan_f1_threshold": 288.74737548828125,
+ "eval_manhattan_precision": 0.5511373655439957,
+ "eval_manhattan_recall": 0.9129545786475829,
+ "eval_max_accuracy": 0.628023352793995,
+ "eval_max_accuracy_threshold": 343.60479736328125,
+ "eval_max_ap": 0.6783368773007193,
+ "eval_max_f1": 0.6879386675305043,
+ "eval_max_f1_threshold": 288.74737548828125,
+ "eval_max_precision": 0.5511373655439957,
+ "eval_max_recall": 0.99445012414196,
+ "eval_runtime": 37.2407,
+ "eval_samples_per_second": 354.156,
+ "eval_steps_per_second": 5.558,
+ "step": 1228
+ },
+ {
+ "epoch": 0.2500254504733788,
+ "grad_norm": 15.62058162689209,
+ "learning_rate": 9.984526427233488e-06,
+ "loss": 1.1344,
+ "step": 2456
+ },
+ {
+ "epoch": 0.2500254504733788,
+ "eval_cosine_accuracy": 0.6382591553567367,
+ "eval_cosine_accuracy_threshold": 0.8384923934936523,
+ "eval_cosine_ap": 0.6979960206921748,
+ "eval_cosine_f1": 0.6956719817767655,
+ "eval_cosine_f1_threshold": 0.7339043617248535,
+ "eval_cosine_precision": 0.5701484178101373,
+ "eval_cosine_recall": 0.8920695194975902,
+ "eval_dot_accuracy": 0.5939040109181895,
+ "eval_dot_accuracy_threshold": 351.5899658203125,
+ "eval_dot_ap": 0.608546180214937,
+ "eval_dot_f1": 0.6888470841171728,
+ "eval_dot_f1_threshold": 282.84210205078125,
+ "eval_dot_precision": 0.5449906446674605,
+ "eval_dot_recall": 0.9358843289031693,
+ "eval_euclidean_accuracy": 0.6435666085374175,
+ "eval_euclidean_accuracy_threshold": 12.236141204833984,
+ "eval_euclidean_ap": 0.703382621150597,
+ "eval_euclidean_f1": 0.6971692663200463,
+ "eval_euclidean_f1_threshold": 15.051461219787598,
+ "eval_euclidean_precision": 0.5766988435439167,
+ "eval_euclidean_recall": 0.8812618665108807,
+ "eval_loss": 1.0007121562957764,
+ "eval_manhattan_accuracy": 0.6444764576541057,
+ "eval_manhattan_accuracy_threshold": 214.30072021484375,
+ "eval_manhattan_ap": 0.7047552016202201,
+ "eval_manhattan_f1": 0.6967445793320045,
+ "eval_manhattan_f1_threshold": 250.53140258789062,
+ "eval_manhattan_precision": 0.5940267765190526,
+ "eval_manhattan_recall": 0.8424127355046006,
+ "eval_max_accuracy": 0.6444764576541057,
+ "eval_max_accuracy_threshold": 351.5899658203125,
+ "eval_max_ap": 0.7047552016202201,
+ "eval_max_f1": 0.6971692663200463,
+ "eval_max_f1_threshold": 282.84210205078125,
+ "eval_max_precision": 0.5940267765190526,
+ "eval_max_recall": 0.9358843289031693,
+ "eval_runtime": 36.7273,
+ "eval_samples_per_second": 359.106,
+ "eval_steps_per_second": 5.636,
+ "step": 2456
+ },
+ {
+ "epoch": 0.3750381757100682,
+ "grad_norm": 11.62977123260498,
+ "learning_rate": 1.4984933626516816e-05,
+ "loss": 0.9827,
+ "step": 3684
+ },
+ {
+ "epoch": 0.3750381757100682,
+ "eval_cosine_accuracy": 0.6383349761164607,
+ "eval_cosine_accuracy_threshold": 0.8612147569656372,
+ "eval_cosine_ap": 0.7032393444334419,
+ "eval_cosine_f1": 0.6932498182428275,
+ "eval_cosine_f1_threshold": 0.7054703235626221,
+ "eval_cosine_precision": 0.5617183251767265,
+ "eval_cosine_recall": 0.9052139623192639,
+ "eval_dot_accuracy": 0.5945863977557055,
+ "eval_dot_accuracy_threshold": 339.2906188964844,
+ "eval_dot_ap": 0.6228302492866797,
+ "eval_dot_f1": 0.6876656472986747,
+ "eval_dot_f1_threshold": 224.48248291015625,
+ "eval_dot_precision": 0.528145306505911,
+ "eval_dot_recall": 0.9852490141667883,
+ "eval_euclidean_accuracy": 0.6429600424596255,
+ "eval_euclidean_accuracy_threshold": 12.074974060058594,
+ "eval_euclidean_ap": 0.7080982628872418,
+ "eval_euclidean_f1": 0.6952535685273297,
+ "eval_euclidean_f1_threshold": 15.344061851501465,
+ "eval_euclidean_precision": 0.576778665639742,
+ "eval_euclidean_recall": 0.8749817438294143,
+ "eval_loss": 0.8550812602043152,
+ "eval_manhattan_accuracy": 0.646523618166654,
+ "eval_manhattan_accuracy_threshold": 216.00611877441406,
+ "eval_manhattan_ap": 0.7091050987827672,
+ "eval_manhattan_f1": 0.6948782931573053,
+ "eval_manhattan_f1_threshold": 267.3834228515625,
+ "eval_manhattan_precision": 0.5825098814229249,
+ "eval_manhattan_recall": 0.860961004819629,
+ "eval_max_accuracy": 0.646523618166654,
+ "eval_max_accuracy_threshold": 339.2906188964844,
+ "eval_max_ap": 0.7091050987827672,
+ "eval_max_f1": 0.6952535685273297,
+ "eval_max_f1_threshold": 267.3834228515625,
+ "eval_max_precision": 0.5825098814229249,
+ "eval_max_recall": 0.9852490141667883,
+ "eval_runtime": 37.837,
+ "eval_samples_per_second": 348.574,
+ "eval_steps_per_second": 5.471,
+ "step": 3684
+ },
+ {
+ "epoch": 0.5000509009467576,
+ "grad_norm": 6.6275715827941895,
+ "learning_rate": 1.9985340825800148e-05,
+ "loss": 0.9045,
+ "step": 4912
+ },
+ {
+ "epoch": 0.5000509009467576,
+ "eval_cosine_accuracy": 0.64644779740693,
+ "eval_cosine_accuracy_threshold": 0.8346654772758484,
+ "eval_cosine_ap": 0.7120759823214882,
+ "eval_cosine_f1": 0.7009007975213725,
+ "eval_cosine_f1_threshold": 0.7283114790916443,
+ "eval_cosine_precision": 0.5772065772065772,
+ "eval_cosine_recall": 0.8920695194975902,
+ "eval_dot_accuracy": 0.6088407005838199,
+ "eval_dot_accuracy_threshold": 372.897705078125,
+ "eval_dot_ap": 0.648631541530839,
+ "eval_dot_f1": 0.6931595400837175,
+ "eval_dot_f1_threshold": 278.7884216308594,
+ "eval_dot_precision": 0.5439048727756528,
+ "eval_dot_recall": 0.9553088944063093,
+ "eval_euclidean_accuracy": 0.6481916748805823,
+ "eval_euclidean_accuracy_threshold": 12.667613983154297,
+ "eval_euclidean_ap": 0.7129650302942496,
+ "eval_euclidean_f1": 0.7006925449572251,
+ "eval_euclidean_f1_threshold": 15.448726654052734,
+ "eval_euclidean_precision": 0.5824303405572755,
+ "eval_euclidean_recall": 0.879217175405287,
+ "eval_loss": 0.7482573986053467,
+ "eval_manhattan_accuracy": 0.6497080900750626,
+ "eval_manhattan_accuracy_threshold": 228.42645263671875,
+ "eval_manhattan_ap": 0.7148163923267561,
+ "eval_manhattan_f1": 0.7008231143961029,
+ "eval_manhattan_f1_threshold": 289.23284912109375,
+ "eval_manhattan_precision": 0.5682891391209589,
+ "eval_manhattan_recall": 0.9139769242003797,
+ "eval_max_accuracy": 0.6497080900750626,
+ "eval_max_accuracy_threshold": 372.897705078125,
+ "eval_max_ap": 0.7148163923267561,
+ "eval_max_f1": 0.7009007975213725,
+ "eval_max_f1_threshold": 289.23284912109375,
+ "eval_max_precision": 0.5824303405572755,
+ "eval_max_recall": 0.9553088944063093,
+ "eval_runtime": 36.72,
+ "eval_samples_per_second": 359.178,
+ "eval_steps_per_second": 5.637,
+ "step": 4912
+ },
+ {
+ "epoch": 0.625063626183447,
+ "grad_norm": 8.070146560668945,
+ "learning_rate": 2.4985748025083476e-05,
+ "loss": 0.6488,
+ "step": 6140
+ },
+ {
+ "epoch": 0.625063626183447,
+ "eval_cosine_accuracy": 0.6679050724088256,
+ "eval_cosine_accuracy_threshold": 0.7640305757522583,
+ "eval_cosine_ap": 0.7260649988715485,
+ "eval_cosine_f1": 0.7124000968757568,
+ "eval_cosine_f1_threshold": 0.6364777088165283,
+ "eval_cosine_precision": 0.6084393422277381,
+ "eval_cosine_recall": 0.8592084124434058,
+ "eval_dot_accuracy": 0.6316627492607476,
+ "eval_dot_accuracy_threshold": 238.4151611328125,
+ "eval_dot_ap": 0.6571882757637728,
+ "eval_dot_f1": 0.7046040103058139,
+ "eval_dot_f1_threshold": 183.28378295898438,
+ "eval_dot_precision": 0.5714545289361316,
+ "eval_dot_recall": 0.9186505038703081,
+ "eval_euclidean_accuracy": 0.6656304496171052,
+ "eval_euclidean_accuracy_threshold": 12.852251052856445,
+ "eval_euclidean_ap": 0.7254103624891234,
+ "eval_euclidean_f1": 0.7062990731448138,
+ "eval_euclidean_f1_threshold": 16.455379486083984,
+ "eval_euclidean_precision": 0.5927467300832342,
+ "eval_euclidean_recall": 0.873667299547247,
+ "eval_loss": 0.6056942343711853,
+ "eval_manhattan_accuracy": 0.6666919402532413,
+ "eval_manhattan_accuracy_threshold": 238.08731079101562,
+ "eval_manhattan_ap": 0.7275797275536158,
+ "eval_manhattan_f1": 0.7068183230778792,
+ "eval_manhattan_f1_threshold": 281.99066162109375,
+ "eval_manhattan_precision": 0.6152347976628435,
+ "eval_manhattan_recall": 0.8304366876004089,
+ "eval_max_accuracy": 0.6679050724088256,
+ "eval_max_accuracy_threshold": 238.4151611328125,
+ "eval_max_ap": 0.7275797275536158,
+ "eval_max_f1": 0.7124000968757568,
+ "eval_max_f1_threshold": 281.99066162109375,
+ "eval_max_precision": 0.6152347976628435,
+ "eval_max_recall": 0.9186505038703081,
+ "eval_runtime": 36.5244,
+ "eval_samples_per_second": 361.101,
+ "eval_steps_per_second": 5.667,
+ "step": 6140
+ },
+ {
+ "epoch": 0.7500763514201364,
+ "grad_norm": 3.4847946166992188,
+ "learning_rate": 2.9986155224366808e-05,
+ "loss": 0.1224,
+ "step": 7368
+ },
+ {
+ "epoch": 0.7500763514201364,
+ "eval_cosine_accuracy": 0.6698005914019258,
+ "eval_cosine_accuracy_threshold": 0.766596794128418,
+ "eval_cosine_ap": 0.7285326578175677,
+ "eval_cosine_f1": 0.7130742480629613,
+ "eval_cosine_f1_threshold": 0.6729254722595215,
+ "eval_cosine_precision": 0.6123218776194468,
+ "eval_cosine_recall": 0.8535124872206806,
+ "eval_dot_accuracy": 0.6206687391007658,
+ "eval_dot_accuracy_threshold": 261.77471923828125,
+ "eval_dot_ap": 0.6459102507978051,
+ "eval_dot_f1": 0.7029659625697713,
+ "eval_dot_f1_threshold": 202.47332763671875,
+ "eval_dot_precision": 0.5620897873457601,
+ "eval_dot_recall": 0.9380750693734482,
+ "eval_euclidean_accuracy": 0.6688149215255137,
+ "eval_euclidean_accuracy_threshold": 12.95585823059082,
+ "eval_euclidean_ap": 0.732777449339851,
+ "eval_euclidean_f1": 0.7106249615928225,
+ "eval_euclidean_f1_threshold": 15.783571243286133,
+ "eval_euclidean_precision": 0.6134097178018247,
+ "eval_euclidean_recall": 0.8444574266101943,
+ "eval_loss": 0.6682662963867188,
+ "eval_manhattan_accuracy": 0.6737432709075745,
+ "eval_manhattan_accuracy_threshold": 239.43251037597656,
+ "eval_manhattan_ap": 0.7357876112808931,
+ "eval_manhattan_f1": 0.7108785592601606,
+ "eval_manhattan_f1_threshold": 293.34881591796875,
+ "eval_manhattan_precision": 0.6092397538846595,
+ "eval_manhattan_recall": 0.8532203884913101,
+ "eval_max_accuracy": 0.6737432709075745,
+ "eval_max_accuracy_threshold": 261.77471923828125,
+ "eval_max_ap": 0.7357876112808931,
+ "eval_max_f1": 0.7130742480629613,
+ "eval_max_f1_threshold": 293.34881591796875,
+ "eval_max_precision": 0.6134097178018247,
+ "eval_max_recall": 0.9380750693734482,
+ "eval_runtime": 36.5369,
+ "eval_samples_per_second": 360.977,
+ "eval_steps_per_second": 5.666,
+ "step": 7368
+ },
+ {
+ "epoch": 0.8750890766568258,
+ "grad_norm": 7.982311725616455,
+ "learning_rate": 3.498656242365014e-05,
+ "loss": 0.1063,
+ "step": 8596
+ },
+ {
+ "epoch": 0.8750890766568258,
+ "eval_cosine_accuracy": 0.6648722420198651,
+ "eval_cosine_accuracy_threshold": 0.7642883062362671,
+ "eval_cosine_ap": 0.7283929467215002,
+ "eval_cosine_f1": 0.7061340941512125,
+ "eval_cosine_f1_threshold": 0.6351689100265503,
+ "eval_cosine_precision": 0.5953693495038589,
+ "eval_cosine_recall": 0.8675332262304659,
+ "eval_dot_accuracy": 0.6397755705512169,
+ "eval_dot_accuracy_threshold": 268.03167724609375,
+ "eval_dot_ap": 0.6799114732445778,
+ "eval_dot_f1": 0.7021864211737631,
+ "eval_dot_f1_threshold": 216.1470947265625,
+ "eval_dot_precision": 0.5793221304471661,
+ "eval_dot_recall": 0.8911932233094786,
+ "eval_euclidean_accuracy": 0.6580483736447039,
+ "eval_euclidean_accuracy_threshold": 12.722024917602539,
+ "eval_euclidean_ap": 0.7268148607241872,
+ "eval_euclidean_f1": 0.7027586626880216,
+ "eval_euclidean_f1_threshold": 16.08021354675293,
+ "eval_euclidean_precision": 0.6026739085021935,
+ "eval_euclidean_recall": 0.8427048342339711,
+ "eval_loss": 0.6895074248313904,
+ "eval_manhattan_accuracy": 0.6606262794753204,
+ "eval_manhattan_accuracy_threshold": 248.9168701171875,
+ "eval_manhattan_ap": 0.7294072443461903,
+ "eval_manhattan_f1": 0.703255925305243,
+ "eval_manhattan_f1_threshold": 306.02117919921875,
+ "eval_manhattan_precision": 0.5957813609167427,
+ "eval_manhattan_recall": 0.8580400175259237,
+ "eval_max_accuracy": 0.6648722420198651,
+ "eval_max_accuracy_threshold": 268.03167724609375,
+ "eval_max_ap": 0.7294072443461903,
+ "eval_max_f1": 0.7061340941512125,
+ "eval_max_f1_threshold": 306.02117919921875,
+ "eval_max_precision": 0.6026739085021935,
+ "eval_max_recall": 0.8911932233094786,
+ "eval_runtime": 36.5151,
+ "eval_samples_per_second": 361.193,
+ "eval_steps_per_second": 5.669,
+ "step": 8596
+ }
+ ],
+ "logging_steps": 1228,
+ "max_steps": 49115,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 5,
+ "save_steps": 9823,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 0.0,
+ "train_batch_size": 32,
+ "trial_name": null,
+ "trial_params": null
+ }
last-checkpoint/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7f3a4722bf017fa7a86c21655fdafd309cec23645bee218eafdd9a69e1939ff7
+ size 5560