OrlikB committed on
Commit
b012a37
0 Parent(s):

initial commit

Browse files
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
1_Pooling/config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "word_embedding_dimension": 768,
3
+ "pooling_mode_cls_token": true,
4
+ "pooling_mode_mean_tokens": false,
5
+ "pooling_mode_max_tokens": false,
6
+ "pooling_mode_mean_sqrt_len_tokens": false
7
+ }
2_Normalize/.gitkeep ADDED
File without changes
README.md ADDED
@@ -0,0 +1,1273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - sentence-transformers
4
+ - feature-extraction
5
+ - sentence-similarity
6
+ - transformers
7
+ - mteb
8
+ license: lgpl
9
+ language:
10
+ - pl
11
+ pipeline_tag: sentence-similarity
12
+ model-index:
13
+ - name: st-polish-kartonberta-base-alpha-v1
14
+ results:
15
+ - task:
16
+ type: Clustering
17
+ dataset:
18
+ type: PL-MTEB/8tags-clustering
19
+ name: MTEB 8TagsClustering
20
+ config: default
21
+ split: test
22
+ revision: None
23
+ metrics:
24
+ - type: v_measure
25
+ value: 32.85180358455615
26
+ - task:
27
+ type: Classification
28
+ dataset:
29
+ type: PL-MTEB/allegro-reviews
30
+ name: MTEB AllegroReviews
31
+ config: default
32
+ split: test
33
+ revision: None
34
+ metrics:
35
+ - type: accuracy
36
+ value: 40.188866799204774
37
+ - type: f1
38
+ value: 34.71127012684797
39
+ - task:
40
+ type: Retrieval
41
+ dataset:
42
+ type: arguana-pl
43
+ name: MTEB ArguAna-PL
44
+ config: default
45
+ split: test
46
+ revision: None
47
+ metrics:
48
+ - type: map_at_1
49
+ value: 30.939
50
+ - type: map_at_10
51
+ value: 47.467999999999996
52
+ - type: map_at_100
53
+ value: 48.303000000000004
54
+ - type: map_at_1000
55
+ value: 48.308
56
+ - type: map_at_3
57
+ value: 43.22
58
+ - type: map_at_5
59
+ value: 45.616
60
+ - type: mrr_at_1
61
+ value: 31.863000000000003
62
+ - type: mrr_at_10
63
+ value: 47.829
64
+ - type: mrr_at_100
65
+ value: 48.664
66
+ - type: mrr_at_1000
67
+ value: 48.67
68
+ - type: mrr_at_3
69
+ value: 43.492
70
+ - type: mrr_at_5
71
+ value: 46.006
72
+ - type: ndcg_at_1
73
+ value: 30.939
74
+ - type: ndcg_at_10
75
+ value: 56.058
76
+ - type: ndcg_at_100
77
+ value: 59.562000000000005
78
+ - type: ndcg_at_1000
79
+ value: 59.69799999999999
80
+ - type: ndcg_at_3
81
+ value: 47.260000000000005
82
+ - type: ndcg_at_5
83
+ value: 51.587
84
+ - type: precision_at_1
85
+ value: 30.939
86
+ - type: precision_at_10
87
+ value: 8.329
88
+ - type: precision_at_100
89
+ value: 0.984
90
+ - type: precision_at_1000
91
+ value: 0.1
92
+ - type: precision_at_3
93
+ value: 19.654
94
+ - type: precision_at_5
95
+ value: 13.898
96
+ - type: recall_at_1
97
+ value: 30.939
98
+ - type: recall_at_10
99
+ value: 83.286
100
+ - type: recall_at_100
101
+ value: 98.43499999999999
102
+ - type: recall_at_1000
103
+ value: 99.502
104
+ - type: recall_at_3
105
+ value: 58.962
106
+ - type: recall_at_5
107
+ value: 69.488
108
+ - task:
109
+ type: Classification
110
+ dataset:
111
+ type: PL-MTEB/cbd
112
+ name: MTEB CBD
113
+ config: default
114
+ split: test
115
+ revision: None
116
+ metrics:
117
+ - type: accuracy
118
+ value: 67.69000000000001
119
+ - type: ap
120
+ value: 21.078799692467182
121
+ - type: f1
122
+ value: 56.80107173953953
123
+ - task:
124
+ type: PairClassification
125
+ dataset:
126
+ type: PL-MTEB/cdsce-pairclassification
127
+ name: MTEB CDSC-E
128
+ config: default
129
+ split: test
130
+ revision: None
131
+ metrics:
132
+ - type: cos_sim_accuracy
133
+ value: 89.2
134
+ - type: cos_sim_ap
135
+ value: 79.11674608786898
136
+ - type: cos_sim_f1
137
+ value: 68.83468834688347
138
+ - type: cos_sim_precision
139
+ value: 70.94972067039106
140
+ - type: cos_sim_recall
141
+ value: 66.84210526315789
142
+ - type: dot_accuracy
143
+ value: 89.2
144
+ - type: dot_ap
145
+ value: 79.11674608786898
146
+ - type: dot_f1
147
+ value: 68.83468834688347
148
+ - type: dot_precision
149
+ value: 70.94972067039106
150
+ - type: dot_recall
151
+ value: 66.84210526315789
152
+ - type: euclidean_accuracy
153
+ value: 89.2
154
+ - type: euclidean_ap
155
+ value: 79.11674608786898
156
+ - type: euclidean_f1
157
+ value: 68.83468834688347
158
+ - type: euclidean_precision
159
+ value: 70.94972067039106
160
+ - type: euclidean_recall
161
+ value: 66.84210526315789
162
+ - type: manhattan_accuracy
163
+ value: 89.1
164
+ - type: manhattan_ap
165
+ value: 79.1220443374692
166
+ - type: manhattan_f1
167
+ value: 69.02173913043478
168
+ - type: manhattan_precision
169
+ value: 71.34831460674157
170
+ - type: manhattan_recall
171
+ value: 66.84210526315789
172
+ - type: max_accuracy
173
+ value: 89.2
174
+ - type: max_ap
175
+ value: 79.1220443374692
176
+ - type: max_f1
177
+ value: 69.02173913043478
178
+ - task:
179
+ type: STS
180
+ dataset:
181
+ type: PL-MTEB/cdscr-sts
182
+ name: MTEB CDSC-R
183
+ config: default
184
+ split: test
185
+ revision: None
186
+ metrics:
187
+ - type: cos_sim_pearson
188
+ value: 91.41534744278998
189
+ - type: cos_sim_spearman
190
+ value: 92.12681551821147
191
+ - type: euclidean_pearson
192
+ value: 91.74369794485992
193
+ - type: euclidean_spearman
194
+ value: 92.12685848456046
195
+ - type: manhattan_pearson
196
+ value: 91.66651938751657
197
+ - type: manhattan_spearman
198
+ value: 92.057603126734
199
+ - task:
200
+ type: Retrieval
201
+ dataset:
202
+ type: dbpedia-pl
203
+ name: MTEB DBPedia-PL
204
+ config: default
205
+ split: test
206
+ revision: None
207
+ metrics:
208
+ - type: map_at_1
209
+ value: 5.8709999999999996
210
+ - type: map_at_10
211
+ value: 12.486
212
+ - type: map_at_100
213
+ value: 16.897000000000002
214
+ - type: map_at_1000
215
+ value: 18.056
216
+ - type: map_at_3
217
+ value: 8.958
218
+ - type: map_at_5
219
+ value: 10.57
220
+ - type: mrr_at_1
221
+ value: 44.0
222
+ - type: mrr_at_10
223
+ value: 53.830999999999996
224
+ - type: mrr_at_100
225
+ value: 54.54
226
+ - type: mrr_at_1000
227
+ value: 54.568000000000005
228
+ - type: mrr_at_3
229
+ value: 51.87500000000001
230
+ - type: mrr_at_5
231
+ value: 53.113
232
+ - type: ndcg_at_1
233
+ value: 34.625
234
+ - type: ndcg_at_10
235
+ value: 26.996
236
+ - type: ndcg_at_100
237
+ value: 31.052999999999997
238
+ - type: ndcg_at_1000
239
+ value: 38.208
240
+ - type: ndcg_at_3
241
+ value: 29.471000000000004
242
+ - type: ndcg_at_5
243
+ value: 28.364
244
+ - type: precision_at_1
245
+ value: 44.0
246
+ - type: precision_at_10
247
+ value: 21.45
248
+ - type: precision_at_100
249
+ value: 6.837
250
+ - type: precision_at_1000
251
+ value: 1.6019999999999999
252
+ - type: precision_at_3
253
+ value: 32.333
254
+ - type: precision_at_5
255
+ value: 27.800000000000004
256
+ - type: recall_at_1
257
+ value: 5.8709999999999996
258
+ - type: recall_at_10
259
+ value: 17.318
260
+ - type: recall_at_100
261
+ value: 36.854
262
+ - type: recall_at_1000
263
+ value: 60.468999999999994
264
+ - type: recall_at_3
265
+ value: 10.213999999999999
266
+ - type: recall_at_5
267
+ value: 13.364
268
+ - task:
269
+ type: Retrieval
270
+ dataset:
271
+ type: fiqa-pl
272
+ name: MTEB FiQA-PL
273
+ config: default
274
+ split: test
275
+ revision: None
276
+ metrics:
277
+ - type: map_at_1
278
+ value: 10.289
279
+ - type: map_at_10
280
+ value: 18.285999999999998
281
+ - type: map_at_100
282
+ value: 19.743
283
+ - type: map_at_1000
284
+ value: 19.964000000000002
285
+ - type: map_at_3
286
+ value: 15.193000000000001
287
+ - type: map_at_5
288
+ value: 16.962
289
+ - type: mrr_at_1
290
+ value: 21.914
291
+ - type: mrr_at_10
292
+ value: 30.653999999999996
293
+ - type: mrr_at_100
294
+ value: 31.623
295
+ - type: mrr_at_1000
296
+ value: 31.701
297
+ - type: mrr_at_3
298
+ value: 27.855
299
+ - type: mrr_at_5
300
+ value: 29.514000000000003
301
+ - type: ndcg_at_1
302
+ value: 21.914
303
+ - type: ndcg_at_10
304
+ value: 24.733
305
+ - type: ndcg_at_100
306
+ value: 31.253999999999998
307
+ - type: ndcg_at_1000
308
+ value: 35.617
309
+ - type: ndcg_at_3
310
+ value: 20.962
311
+ - type: ndcg_at_5
312
+ value: 22.553
313
+ - type: precision_at_1
314
+ value: 21.914
315
+ - type: precision_at_10
316
+ value: 7.346
317
+ - type: precision_at_100
318
+ value: 1.389
319
+ - type: precision_at_1000
320
+ value: 0.214
321
+ - type: precision_at_3
322
+ value: 14.352
323
+ - type: precision_at_5
324
+ value: 11.42
325
+ - type: recall_at_1
326
+ value: 10.289
327
+ - type: recall_at_10
328
+ value: 31.459
329
+ - type: recall_at_100
330
+ value: 56.854000000000006
331
+ - type: recall_at_1000
332
+ value: 83.722
333
+ - type: recall_at_3
334
+ value: 19.457
335
+ - type: recall_at_5
336
+ value: 24.767
337
+ - task:
338
+ type: Retrieval
339
+ dataset:
340
+ type: hotpotqa-pl
341
+ name: MTEB HotpotQA-PL
342
+ config: default
343
+ split: test
344
+ revision: None
345
+ metrics:
346
+ - type: map_at_1
347
+ value: 29.669
348
+ - type: map_at_10
349
+ value: 41.615
350
+ - type: map_at_100
351
+ value: 42.571999999999996
352
+ - type: map_at_1000
353
+ value: 42.662
354
+ - type: map_at_3
355
+ value: 38.938
356
+ - type: map_at_5
357
+ value: 40.541
358
+ - type: mrr_at_1
359
+ value: 59.338
360
+ - type: mrr_at_10
361
+ value: 66.93900000000001
362
+ - type: mrr_at_100
363
+ value: 67.361
364
+ - type: mrr_at_1000
365
+ value: 67.38499999999999
366
+ - type: mrr_at_3
367
+ value: 65.384
368
+ - type: mrr_at_5
369
+ value: 66.345
370
+ - type: ndcg_at_1
371
+ value: 59.338
372
+ - type: ndcg_at_10
373
+ value: 50.607
374
+ - type: ndcg_at_100
375
+ value: 54.342999999999996
376
+ - type: ndcg_at_1000
377
+ value: 56.286
378
+ - type: ndcg_at_3
379
+ value: 46.289
380
+ - type: ndcg_at_5
381
+ value: 48.581
382
+ - type: precision_at_1
383
+ value: 59.338
384
+ - type: precision_at_10
385
+ value: 10.585
386
+ - type: precision_at_100
387
+ value: 1.353
388
+ - type: precision_at_1000
389
+ value: 0.161
390
+ - type: precision_at_3
391
+ value: 28.877000000000002
392
+ - type: precision_at_5
393
+ value: 19.133
394
+ - type: recall_at_1
395
+ value: 29.669
396
+ - type: recall_at_10
397
+ value: 52.92400000000001
398
+ - type: recall_at_100
399
+ value: 67.657
400
+ - type: recall_at_1000
401
+ value: 80.628
402
+ - type: recall_at_3
403
+ value: 43.315
404
+ - type: recall_at_5
405
+ value: 47.833
406
+ - task:
407
+ type: Retrieval
408
+ dataset:
409
+ type: msmarco-pl
410
+ name: MTEB MSMARCO-PL
411
+ config: default
412
+ split: test
413
+ revision: None
414
+ metrics:
415
+ - type: map_at_1
416
+ value: 0.997
417
+ - type: map_at_10
418
+ value: 7.481999999999999
419
+ - type: map_at_100
420
+ value: 20.208000000000002
421
+ - type: map_at_1000
422
+ value: 25.601000000000003
423
+ - type: map_at_3
424
+ value: 3.055
425
+ - type: map_at_5
426
+ value: 4.853
427
+ - type: mrr_at_1
428
+ value: 55.814
429
+ - type: mrr_at_10
430
+ value: 64.651
431
+ - type: mrr_at_100
432
+ value: 65.003
433
+ - type: mrr_at_1000
434
+ value: 65.05199999999999
435
+ - type: mrr_at_3
436
+ value: 62.403
437
+ - type: mrr_at_5
438
+ value: 64.031
439
+ - type: ndcg_at_1
440
+ value: 44.186
441
+ - type: ndcg_at_10
442
+ value: 43.25
443
+ - type: ndcg_at_100
444
+ value: 40.515
445
+ - type: ndcg_at_1000
446
+ value: 48.345
447
+ - type: ndcg_at_3
448
+ value: 45.829
449
+ - type: ndcg_at_5
450
+ value: 46.477000000000004
451
+ - type: precision_at_1
452
+ value: 55.814
453
+ - type: precision_at_10
454
+ value: 50.465
455
+ - type: precision_at_100
456
+ value: 25.419000000000004
457
+ - type: precision_at_1000
458
+ value: 5.0840000000000005
459
+ - type: precision_at_3
460
+ value: 58.14
461
+ - type: precision_at_5
462
+ value: 57.67400000000001
463
+ - type: recall_at_1
464
+ value: 0.997
465
+ - type: recall_at_10
466
+ value: 8.985999999999999
467
+ - type: recall_at_100
468
+ value: 33.221000000000004
469
+ - type: recall_at_1000
470
+ value: 58.836999999999996
471
+ - type: recall_at_3
472
+ value: 3.472
473
+ - type: recall_at_5
474
+ value: 5.545
475
+ - task:
476
+ type: Classification
477
+ dataset:
478
+ type: mteb/amazon_massive_intent
479
+ name: MTEB MassiveIntentClassification (pl)
480
+ config: pl
481
+ split: test
482
+ revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7
483
+ metrics:
484
+ - type: accuracy
485
+ value: 68.19771351714861
486
+ - type: f1
487
+ value: 64.75039989217822
488
+ - task:
489
+ type: Classification
490
+ dataset:
491
+ type: mteb/amazon_massive_scenario
492
+ name: MTEB MassiveScenarioClassification (pl)
493
+ config: pl
494
+ split: test
495
+ revision: 7d571f92784cd94a019292a1f45445077d0ef634
496
+ metrics:
497
+ - type: accuracy
498
+ value: 73.9677202420982
499
+ - type: f1
500
+ value: 73.72287107577753
501
+ - task:
502
+ type: Retrieval
503
+ dataset:
504
+ type: nfcorpus-pl
505
+ name: MTEB NFCorpus-PL
506
+ config: default
507
+ split: test
508
+ revision: None
509
+ metrics:
510
+ - type: map_at_1
511
+ value: 5.167
512
+ - type: map_at_10
513
+ value: 10.791
514
+ - type: map_at_100
515
+ value: 14.072999999999999
516
+ - type: map_at_1000
517
+ value: 15.568000000000001
518
+ - type: map_at_3
519
+ value: 7.847999999999999
520
+ - type: map_at_5
521
+ value: 9.112
522
+ - type: mrr_at_1
523
+ value: 42.105
524
+ - type: mrr_at_10
525
+ value: 49.933
526
+ - type: mrr_at_100
527
+ value: 50.659
528
+ - type: mrr_at_1000
529
+ value: 50.705
530
+ - type: mrr_at_3
531
+ value: 47.988
532
+ - type: mrr_at_5
533
+ value: 49.056
534
+ - type: ndcg_at_1
535
+ value: 39.938
536
+ - type: ndcg_at_10
537
+ value: 31.147000000000002
538
+ - type: ndcg_at_100
539
+ value: 29.336000000000002
540
+ - type: ndcg_at_1000
541
+ value: 38.147
542
+ - type: ndcg_at_3
543
+ value: 35.607
544
+ - type: ndcg_at_5
545
+ value: 33.725
546
+ - type: precision_at_1
547
+ value: 41.486000000000004
548
+ - type: precision_at_10
549
+ value: 23.901
550
+ - type: precision_at_100
551
+ value: 7.960000000000001
552
+ - type: precision_at_1000
553
+ value: 2.086
554
+ - type: precision_at_3
555
+ value: 33.437
556
+ - type: precision_at_5
557
+ value: 29.598000000000003
558
+ - type: recall_at_1
559
+ value: 5.167
560
+ - type: recall_at_10
561
+ value: 14.244000000000002
562
+ - type: recall_at_100
563
+ value: 31.192999999999998
564
+ - type: recall_at_1000
565
+ value: 62.41799999999999
566
+ - type: recall_at_3
567
+ value: 8.697000000000001
568
+ - type: recall_at_5
569
+ value: 10.911
570
+ - task:
571
+ type: Retrieval
572
+ dataset:
573
+ type: nq-pl
574
+ name: MTEB NQ-PL
575
+ config: default
576
+ split: test
577
+ revision: None
578
+ metrics:
579
+ - type: map_at_1
580
+ value: 14.417
581
+ - type: map_at_10
582
+ value: 23.330000000000002
583
+ - type: map_at_100
584
+ value: 24.521
585
+ - type: map_at_1000
586
+ value: 24.604
587
+ - type: map_at_3
588
+ value: 20.076
589
+ - type: map_at_5
590
+ value: 21.854000000000003
591
+ - type: mrr_at_1
592
+ value: 16.454
593
+ - type: mrr_at_10
594
+ value: 25.402
595
+ - type: mrr_at_100
596
+ value: 26.411
597
+ - type: mrr_at_1000
598
+ value: 26.479000000000003
599
+ - type: mrr_at_3
600
+ value: 22.369
601
+ - type: mrr_at_5
602
+ value: 24.047
603
+ - type: ndcg_at_1
604
+ value: 16.454
605
+ - type: ndcg_at_10
606
+ value: 28.886
607
+ - type: ndcg_at_100
608
+ value: 34.489999999999995
609
+ - type: ndcg_at_1000
610
+ value: 36.687999999999995
611
+ - type: ndcg_at_3
612
+ value: 22.421
613
+ - type: ndcg_at_5
614
+ value: 25.505
615
+ - type: precision_at_1
616
+ value: 16.454
617
+ - type: precision_at_10
618
+ value: 5.252
619
+ - type: precision_at_100
620
+ value: 0.8410000000000001
621
+ - type: precision_at_1000
622
+ value: 0.105
623
+ - type: precision_at_3
624
+ value: 10.428999999999998
625
+ - type: precision_at_5
626
+ value: 8.019
627
+ - type: recall_at_1
628
+ value: 14.417
629
+ - type: recall_at_10
630
+ value: 44.025
631
+ - type: recall_at_100
632
+ value: 69.404
633
+ - type: recall_at_1000
634
+ value: 86.18900000000001
635
+ - type: recall_at_3
636
+ value: 26.972
637
+ - type: recall_at_5
638
+ value: 34.132
639
+ - task:
640
+ type: Classification
641
+ dataset:
642
+ type: laugustyniak/abusive-clauses-pl
643
+ name: MTEB PAC
644
+ config: default
645
+ split: test
646
+ revision: None
647
+ metrics:
648
+ - type: accuracy
649
+ value: 66.55082536924412
650
+ - type: ap
651
+ value: 76.44962281293184
652
+ - type: f1
653
+ value: 63.899803692180434
654
+ - task:
655
+ type: PairClassification
656
+ dataset:
657
+ type: PL-MTEB/ppc-pairclassification
658
+ name: MTEB PPC
659
+ config: default
660
+ split: test
661
+ revision: None
662
+ metrics:
663
+ - type: cos_sim_accuracy
664
+ value: 86.5
665
+ - type: cos_sim_ap
666
+ value: 92.65086645409387
667
+ - type: cos_sim_f1
668
+ value: 89.39157566302653
669
+ - type: cos_sim_precision
670
+ value: 84.51327433628319
671
+ - type: cos_sim_recall
672
+ value: 94.86754966887418
673
+ - type: dot_accuracy
674
+ value: 86.5
675
+ - type: dot_ap
676
+ value: 92.65086645409387
677
+ - type: dot_f1
678
+ value: 89.39157566302653
679
+ - type: dot_precision
680
+ value: 84.51327433628319
681
+ - type: dot_recall
682
+ value: 94.86754966887418
683
+ - type: euclidean_accuracy
684
+ value: 86.5
685
+ - type: euclidean_ap
686
+ value: 92.65086645409387
687
+ - type: euclidean_f1
688
+ value: 89.39157566302653
689
+ - type: euclidean_precision
690
+ value: 84.51327433628319
691
+ - type: euclidean_recall
692
+ value: 94.86754966887418
693
+ - type: manhattan_accuracy
694
+ value: 86.5
695
+ - type: manhattan_ap
696
+ value: 92.64975544736456
697
+ - type: manhattan_f1
698
+ value: 89.33852140077822
699
+ - type: manhattan_precision
700
+ value: 84.28781204111601
701
+ - type: manhattan_recall
702
+ value: 95.03311258278146
703
+ - type: max_accuracy
704
+ value: 86.5
705
+ - type: max_ap
706
+ value: 92.65086645409387
707
+ - type: max_f1
708
+ value: 89.39157566302653
709
+ - task:
710
+ type: PairClassification
711
+ dataset:
712
+ type: PL-MTEB/psc-pairclassification
713
+ name: MTEB PSC
714
+ config: default
715
+ split: test
716
+ revision: None
717
+ metrics:
718
+ - type: cos_sim_accuracy
719
+ value: 95.64007421150278
720
+ - type: cos_sim_ap
721
+ value: 98.42114841894346
722
+ - type: cos_sim_f1
723
+ value: 92.8895612708018
724
+ - type: cos_sim_precision
725
+ value: 92.1921921921922
726
+ - type: cos_sim_recall
727
+ value: 93.59756097560977
728
+ - type: dot_accuracy
729
+ value: 95.64007421150278
730
+ - type: dot_ap
731
+ value: 98.42114841894346
732
+ - type: dot_f1
733
+ value: 92.8895612708018
734
+ - type: dot_precision
735
+ value: 92.1921921921922
736
+ - type: dot_recall
737
+ value: 93.59756097560977
738
+ - type: euclidean_accuracy
739
+ value: 95.64007421150278
740
+ - type: euclidean_ap
741
+ value: 98.42114841894346
742
+ - type: euclidean_f1
743
+ value: 92.8895612708018
744
+ - type: euclidean_precision
745
+ value: 92.1921921921922
746
+ - type: euclidean_recall
747
+ value: 93.59756097560977
748
+ - type: manhattan_accuracy
749
+ value: 95.82560296846012
750
+ - type: manhattan_ap
751
+ value: 98.38712415914046
752
+ - type: manhattan_f1
753
+ value: 93.19213313161876
754
+ - type: manhattan_precision
755
+ value: 92.49249249249249
756
+ - type: manhattan_recall
757
+ value: 93.90243902439023
758
+ - type: max_accuracy
759
+ value: 95.82560296846012
760
+ - type: max_ap
761
+ value: 98.42114841894346
762
+ - type: max_f1
763
+ value: 93.19213313161876
764
+ - task:
765
+ type: Classification
766
+ dataset:
767
+ type: PL-MTEB/polemo2_in
768
+ name: MTEB PolEmo2.0-IN
769
+ config: default
770
+ split: test
771
+ revision: None
772
+ metrics:
773
+ - type: accuracy
774
+ value: 68.40720221606648
775
+ - type: f1
776
+ value: 67.09084289613526
777
+ - task:
778
+ type: Classification
779
+ dataset:
780
+ type: PL-MTEB/polemo2_out
781
+ name: MTEB PolEmo2.0-OUT
782
+ config: default
783
+ split: test
784
+ revision: None
785
+ metrics:
786
+ - type: accuracy
787
+ value: 38.056680161943326
788
+ - type: f1
789
+ value: 32.87731504372395
790
+ - task:
791
+ type: Retrieval
792
+ dataset:
793
+ type: quora-pl
794
+ name: MTEB Quora-PL
795
+ config: default
796
+ split: test
797
+ revision: None
798
+ metrics:
799
+ - type: map_at_1
800
+ value: 65.422
801
+ - type: map_at_10
802
+ value: 79.259
803
+ - type: map_at_100
804
+ value: 80.0
805
+ - type: map_at_1000
806
+ value: 80.021
807
+ - type: map_at_3
808
+ value: 76.16199999999999
809
+ - type: map_at_5
810
+ value: 78.03999999999999
811
+ - type: mrr_at_1
812
+ value: 75.26
813
+ - type: mrr_at_10
814
+ value: 82.39699999999999
815
+ - type: mrr_at_100
816
+ value: 82.589
817
+ - type: mrr_at_1000
818
+ value: 82.593
819
+ - type: mrr_at_3
820
+ value: 81.08999999999999
821
+ - type: mrr_at_5
822
+ value: 81.952
823
+ - type: ndcg_at_1
824
+ value: 75.3
825
+ - type: ndcg_at_10
826
+ value: 83.588
827
+ - type: ndcg_at_100
828
+ value: 85.312
829
+ - type: ndcg_at_1000
830
+ value: 85.536
831
+ - type: ndcg_at_3
832
+ value: 80.128
833
+ - type: ndcg_at_5
834
+ value: 81.962
835
+ - type: precision_at_1
836
+ value: 75.3
837
+ - type: precision_at_10
838
+ value: 12.856000000000002
839
+ - type: precision_at_100
840
+ value: 1.508
841
+ - type: precision_at_1000
842
+ value: 0.156
843
+ - type: precision_at_3
844
+ value: 35.207
845
+ - type: precision_at_5
846
+ value: 23.316
847
+ - type: recall_at_1
848
+ value: 65.422
849
+ - type: recall_at_10
850
+ value: 92.381
851
+ - type: recall_at_100
852
+ value: 98.575
853
+ - type: recall_at_1000
854
+ value: 99.85300000000001
855
+ - type: recall_at_3
856
+ value: 82.59100000000001
857
+ - type: recall_at_5
858
+ value: 87.629
859
+ - task:
860
+ type: Retrieval
861
+ dataset:
862
+ type: scidocs-pl
863
+ name: MTEB SCIDOCS-PL
864
+ config: default
865
+ split: test
866
+ revision: None
867
+ metrics:
868
+ - type: map_at_1
869
+ value: 2.52
870
+ - type: map_at_10
871
+ value: 6.814000000000001
872
+ - type: map_at_100
873
+ value: 8.267
874
+ - type: map_at_1000
875
+ value: 8.565000000000001
876
+ - type: map_at_3
877
+ value: 4.736
878
+ - type: map_at_5
879
+ value: 5.653
880
+ - type: mrr_at_1
881
+ value: 12.5
882
+ - type: mrr_at_10
883
+ value: 20.794999999999998
884
+ - type: mrr_at_100
885
+ value: 22.014
886
+ - type: mrr_at_1000
887
+ value: 22.109
888
+ - type: mrr_at_3
889
+ value: 17.8
890
+ - type: mrr_at_5
891
+ value: 19.42
892
+ - type: ndcg_at_1
893
+ value: 12.5
894
+ - type: ndcg_at_10
895
+ value: 12.209
896
+ - type: ndcg_at_100
897
+ value: 18.812
898
+ - type: ndcg_at_1000
899
+ value: 24.766
900
+ - type: ndcg_at_3
901
+ value: 10.847
902
+ - type: ndcg_at_5
903
+ value: 9.632
904
+ - type: precision_at_1
905
+ value: 12.5
906
+ - type: precision_at_10
907
+ value: 6.660000000000001
908
+ - type: precision_at_100
909
+ value: 1.6340000000000001
910
+ - type: precision_at_1000
911
+ value: 0.307
912
+ - type: precision_at_3
913
+ value: 10.299999999999999
914
+ - type: precision_at_5
915
+ value: 8.66
916
+ - type: recall_at_1
917
+ value: 2.52
918
+ - type: recall_at_10
919
+ value: 13.495
920
+ - type: recall_at_100
921
+ value: 33.188
922
+ - type: recall_at_1000
923
+ value: 62.34499999999999
924
+ - type: recall_at_3
925
+ value: 6.245
926
+ - type: recall_at_5
927
+ value: 8.76
928
+ - task:
929
+ type: PairClassification
930
+ dataset:
931
+ type: PL-MTEB/sicke-pl-pairclassification
932
+ name: MTEB SICK-E-PL
933
+ config: default
934
+ split: test
935
+ revision: None
936
+ metrics:
937
+ - type: cos_sim_accuracy
938
+ value: 86.13942111699959
939
+ - type: cos_sim_ap
940
+ value: 81.47480017120256
941
+ - type: cos_sim_f1
942
+ value: 74.79794268919912
943
+ - type: cos_sim_precision
944
+ value: 77.2382397572079
945
+ - type: cos_sim_recall
946
+ value: 72.50712250712252
947
+ - type: dot_accuracy
948
+ value: 86.13942111699959
949
+ - type: dot_ap
950
+ value: 81.47478531367476
951
+ - type: dot_f1
952
+ value: 74.79794268919912
953
+ - type: dot_precision
954
+ value: 77.2382397572079
955
+ - type: dot_recall
956
+ value: 72.50712250712252
957
+ - type: euclidean_accuracy
958
+ value: 86.13942111699959
959
+ - type: euclidean_ap
960
+ value: 81.47478531367476
961
+ - type: euclidean_f1
962
+ value: 74.79794268919912
963
+ - type: euclidean_precision
964
+ value: 77.2382397572079
965
+ - type: euclidean_recall
966
+ value: 72.50712250712252
967
+ - type: manhattan_accuracy
968
+ value: 86.15980432123929
969
+ - type: manhattan_ap
970
+ value: 81.40798042612397
971
+ - type: manhattan_f1
972
+ value: 74.86116253239543
973
+ - type: manhattan_precision
974
+ value: 77.9491133384734
975
+ - type: manhattan_recall
976
+ value: 72.00854700854701
977
+ - type: max_accuracy
978
+ value: 86.15980432123929
979
+ - type: max_ap
980
+ value: 81.47480017120256
981
+ - type: max_f1
982
+ value: 74.86116253239543
983
+ - task:
984
+ type: STS
985
+ dataset:
986
+ type: PL-MTEB/sickr-pl-sts
987
+ name: MTEB SICK-R-PL
988
+ config: default
989
+ split: test
990
+ revision: None
991
+ metrics:
992
+ - type: cos_sim_pearson
993
+ value: 84.27525342551935
994
+ - type: cos_sim_spearman
995
+ value: 79.50631730805885
996
+ - type: euclidean_pearson
997
+ value: 82.07169123942028
998
+ - type: euclidean_spearman
999
+ value: 79.50631887406465
1000
+ - type: manhattan_pearson
1001
+ value: 81.98288826317463
1002
+ - type: manhattan_spearman
1003
+ value: 79.4244081650332
1004
+ - task:
1005
+ type: STS
1006
+ dataset:
1007
+ type: mteb/sts22-crosslingual-sts
1008
+ name: MTEB STS22 (pl)
1009
+ config: pl
1010
+ split: test
1011
+ revision: 6d1ba47164174a496b7fa5d3569dae26a6813b80
1012
+ metrics:
1013
+ - type: cos_sim_pearson
1014
+ value: 35.59400236598834
1015
+ - type: cos_sim_spearman
1016
+ value: 36.782560207852846
1017
+ - type: euclidean_pearson
1018
+ value: 28.546177668542942
1019
+ - type: euclidean_spearman
1020
+ value: 36.68394223635756
1021
+ - type: manhattan_pearson
1022
+ value: 28.45606963909248
1023
+ - type: manhattan_spearman
1024
+ value: 36.475975118547524
1025
+ - task:
1026
+ type: Retrieval
1027
+ dataset:
1028
+ type: scifact-pl
1029
+ name: MTEB SciFact-PL
1030
+ config: default
1031
+ split: test
1032
+ revision: None
1033
+ metrics:
1034
+ - type: map_at_1
1035
+ value: 41.028
1036
+ - type: map_at_10
1037
+ value: 52.23799999999999
1038
+ - type: map_at_100
1039
+ value: 52.905
1040
+ - type: map_at_1000
1041
+ value: 52.945
1042
+ - type: map_at_3
1043
+ value: 49.102000000000004
1044
+ - type: map_at_5
1045
+ value: 50.992000000000004
1046
+ - type: mrr_at_1
1047
+ value: 43.333
1048
+ - type: mrr_at_10
1049
+ value: 53.551
1050
+ - type: mrr_at_100
1051
+ value: 54.138
1052
+ - type: mrr_at_1000
1053
+ value: 54.175
1054
+ - type: mrr_at_3
1055
+ value: 51.056000000000004
1056
+ - type: mrr_at_5
1057
+ value: 52.705999999999996
1058
+ - type: ndcg_at_1
1059
+ value: 43.333
1060
+ - type: ndcg_at_10
1061
+ value: 57.731
1062
+ - type: ndcg_at_100
1063
+ value: 61.18599999999999
1064
+ - type: ndcg_at_1000
1065
+ value: 62.261
1066
+ - type: ndcg_at_3
1067
+ value: 52.276999999999994
1068
+ - type: ndcg_at_5
1069
+ value: 55.245999999999995
1070
+ - type: precision_at_1
1071
+ value: 43.333
1072
+ - type: precision_at_10
1073
+ value: 8.267
1074
+ - type: precision_at_100
1075
+ value: 1.02
1076
+ - type: precision_at_1000
1077
+ value: 0.11100000000000002
1078
+ - type: precision_at_3
1079
+ value: 21.444
1080
+ - type: precision_at_5
1081
+ value: 14.533
1082
+ - type: recall_at_1
1083
+ value: 41.028
1084
+ - type: recall_at_10
1085
+ value: 73.111
1086
+ - type: recall_at_100
1087
+ value: 89.533
1088
+ - type: recall_at_1000
1089
+ value: 98.0
1090
+ - type: recall_at_3
1091
+ value: 58.744
1092
+ - type: recall_at_5
1093
+ value: 66.106
1094
+ - task:
1095
+ type: Retrieval
1096
+ dataset:
1097
+ type: trec-covid-pl
1098
+ name: MTEB TRECCOVID-PL
1099
+ config: default
1100
+ split: test
1101
+ revision: None
1102
+ metrics:
1103
+ - type: map_at_1
1104
+ value: 0.146
1105
+ - type: map_at_10
1106
+ value: 1.09
1107
+ - type: map_at_100
1108
+ value: 6.002
1109
+ - type: map_at_1000
1110
+ value: 15.479999999999999
1111
+ - type: map_at_3
1112
+ value: 0.41000000000000003
1113
+ - type: map_at_5
1114
+ value: 0.596
1115
+ - type: mrr_at_1
1116
+ value: 54.0
1117
+ - type: mrr_at_10
1118
+ value: 72.367
1119
+ - type: mrr_at_100
1120
+ value: 72.367
1121
+ - type: mrr_at_1000
1122
+ value: 72.367
1123
+ - type: mrr_at_3
1124
+ value: 70.333
1125
+ - type: mrr_at_5
1126
+ value: 72.033
1127
+ - type: ndcg_at_1
1128
+ value: 48.0
1129
+ - type: ndcg_at_10
1130
+ value: 48.827
1131
+ - type: ndcg_at_100
1132
+ value: 38.513999999999996
1133
+ - type: ndcg_at_1000
1134
+ value: 37.958
1135
+ - type: ndcg_at_3
1136
+ value: 52.614000000000004
1137
+ - type: ndcg_at_5
1138
+ value: 51.013
1139
+ - type: precision_at_1
1140
+ value: 54.0
1141
+ - type: precision_at_10
1142
+ value: 53.6
1143
+ - type: precision_at_100
1144
+ value: 40.300000000000004
1145
+ - type: precision_at_1000
1146
+ value: 17.276
1147
+ - type: precision_at_3
1148
+ value: 57.333
1149
+ - type: precision_at_5
1150
+ value: 55.60000000000001
1151
+ - type: recall_at_1
1152
+ value: 0.146
1153
+ - type: recall_at_10
1154
+ value: 1.438
1155
+ - type: recall_at_100
1156
+ value: 9.673
1157
+ - type: recall_at_1000
1158
+ value: 36.870999999999995
1159
+ - type: recall_at_3
1160
+ value: 0.47400000000000003
1161
+ - type: recall_at_5
1162
+ value: 0.721
1163
+ ---
1164
+ # Model Card for st-polish-kartonberta-base-alpha-v1
1165
+
1166
+ This sentence transformer model maps text into a 768-dimensional dense vector space, producing embeddings that represent its meaning effectively. It is intended for tasks involving sentence / document similarity.
1167
+
1168
+ The model has been released in its alpha version. Numerous potential enhancements could boost its performance, such as adjusting training hyperparameters or extending the training duration (currently limited to only one epoch). These have not been explored yet, mainly because of limited GPU resources.
1169
+
1170
+
1171
+ ## Model Description
1172
+
1173
+
1174
+ - **Developed by:** Bartłomiej Orlik ([email protected])
1175
+ - **Model type:** RoBERTa Sentence Transformer
1176
+ - **Language:** Polish
1177
+ - **License:** LGPL-3.0
1178
+ - **Trained from model:** sdadas/polish-roberta-base-v2: https://huggingface.co/sdadas/polish-roberta-base-v2
1179
+
1180
+
1181
+
1182
+
1183
+
1184
+
1185
+ ## How to Get Started with the Model
1186
+
1187
+ Use the code below to get started with the model.
1188
+
1189
+ ### Using Sentence-Transformers
1190
+
1191
+ You can use the model with [sentence-transformers](https://www.SBERT.net):
1192
+
1193
+ ```
1194
+ pip install -U sentence-transformers
1195
+ ```
1196
+ ```python
1197
+ from sentence_transformers import SentenceTransformer
1198
+
1199
+ model = SentenceTransformer('FajnyKarton/st-polish-kartonberta-base-alpha-v1')
1200
+
1201
+ text_1 = 'Jestem wielkim fanem opakowań tekturowych'
1202
+ text_2 = 'Bardzo podobają mi się kartony'
1203
+
1204
+ embeddings_1 = model.encode(text_1, normalize_embeddings=True)
1205
+ embeddings_2 = model.encode(text_2, normalize_embeddings=True)
1206
+
1207
+ similarity = embeddings_1 @ embeddings_2.T
1208
+ print(similarity)
1209
+ ```
1210
+
1211
+ ### Using HuggingFace Transformers
1212
+
1213
+
1214
+ ```python
1215
+ from transformers import AutoTokenizer, AutoModel
1216
+ import torch
1217
+ import numpy as np
1218
+
1219
+ def encode_text(text):
1220
+ encoded_input = tokenizer(text, padding=True, truncation=True, return_tensors='pt', max_length=512)
1221
+ with torch.no_grad():
1222
+ model_output = model(**encoded_input)
1223
+ sentence_embeddings = model_output[0][:, 0]
1224
+ sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
1225
+ return sentence_embeddings.squeeze().numpy()
1226
+
1227
+ cosine_similarity = lambda a, b: np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
1228
+
1229
+
1230
+ tokenizer = AutoTokenizer.from_pretrained('FajnyKarton/st-polish-kartonberta-base-alpha-v1')
1231
+ model = AutoModel.from_pretrained('FajnyKarton/st-polish-kartonberta-base-alpha-v1')
1232
+ model.eval()
1233
+
1234
+ text_1 = 'Jestem wielkim fanem opakowań tekturowych'
1235
+ text_2 = 'Bardzo podobają mi się kartony'
1236
+
1237
+ embeddings_1 = encode_text(text_1)
1238
+ embeddings_2 = encode_text(text_2)
1239
+
1240
+ print(cosine_similarity(embeddings_1, embeddings_2))
1241
+ ```
1242
+ *Note: The encode_text function above is intended for demonstration purposes. For the best performance, it's recommended to process text in batches.*
1243
+
1244
+
1245
+
1246
+
1247
+ ## Evaluation
1248
+ #### [MTEB for Polish Language](https://huggingface.co/spaces/mteb/leaderboard)
1249
+
1250
+ | Rank | Model | Model Size (GB) | Embedding Dimensions | Sequence Length | Average (26 datasets) | Classification Average (7 datasets) | Clustering Average (1 dataset) | Pair Classification Average (4 datasets) | Retrieval Average (11 datasets) | STS Average (3 datasets) |
1251
+ |-------:|:----------------------------------------|------------------:|-----------------------:|------------------:|------------------------:|--------------------------------------:|--------------------------------:|-----------------------------------------:|----------------------------------:|-------------------------:|
1252
+ | 1 | multilingual-e5-large | 2.24 | 1024 | 514 | 58.25 | 60.51 | 24.06 | 84.58 | 47.82 | 67.52 |
1253
+ | 2 | **st-polish-kartonberta-base-alpha-v1** | 0.5 | 768 | 514 | 56.92 | 60.44 | **32.85** | **87.92** | 42.19 | **69.47** |
1254
+ | 3 | multilingual-e5-base | 1.11 | 768 | 514 | 54.18 | 57.01 | 18.62 | 82.08 | 42.5 | 65.07 |
1255
+ | 4 | multilingual-e5-small | 0.47 | 384 | 512 | 53.15 | 54.35 | 19.64 | 81.67 | 41.52 | 66.08 |
1256
+ | 5 | st-polish-paraphrase-from-mpnet | 0.5 | 768 | 514 | 53.06 | 57.49 | 25.09 | 87.04 | 36.53 | 67.39 |
1257
+ | 6 | st-polish-paraphrase-from-distilroberta | 0.5 | 768 | 514 | 52.65 | 58.55 | 31.11 | 87 | 33.96 | 68.78 |
1258
+
1259
+
1260
+
1261
+
1262
+
1263
+
1264
+
1265
+ ## More Information
1266
+
1267
+ I developed this model as a personal scientific initiative.
1268
+
1269
+ I plan to start development of a new ST model. However, due to limited computational resources, I have suspended further work on a larger or enhanced version of the current model.
1270
+
1271
+
1272
+
1273
+
config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "st-polish-kartonberta-base-alpha-v1",
3
+ "architectures": [
4
+ "RobertaModel"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "eos_token_id": 2,
10
+ "gradient_checkpointing": false,
11
+ "hidden_act": "gelu",
12
+ "hidden_dropout_prob": 0.1,
13
+ "hidden_size": 768,
14
+ "initializer_range": 0.02,
15
+ "intermediate_size": 3072,
16
+ "layer_norm_eps": 1e-05,
17
+ "max_position_embeddings": 514,
18
+ "model_type": "roberta",
19
+ "num_attention_heads": 12,
20
+ "num_hidden_layers": 12,
21
+ "pad_token_id": 1,
22
+ "position_embedding_type": "absolute",
23
+ "torch_dtype": "float32",
24
+ "transformers_version": "4.30.2",
25
+ "type_vocab_size": 1,
26
+ "use_cache": true,
27
+ "vocab_size": 50001
28
+ }
modules.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "idx": 0,
4
+ "name": "0",
5
+ "path": "",
6
+ "type": "sentence_transformers.models.Transformer"
7
+ },
8
+ {
9
+ "idx": 1,
10
+ "name": "1",
11
+ "path": "1_Pooling",
12
+ "type": "sentence_transformers.models.Pooling"
13
+ },
14
+ {
15
+ "idx": 2,
16
+ "name": "2",
17
+ "path": "2_Normalize",
18
+ "type": "sentence_transformers.models.Normalize"
19
+ }
20
+ ]
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a20d0ce46850300c03679076fd597e0204fbbe94dcf1fee096353e62ac63eca4
3
+ size 497842733
sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "max_seq_length": 512,
3
+ "do_lower_case": false
4
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "mask_token": {
6
+ "content": "<mask>",
7
+ "lstrip": true,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "pad_token": "<pad>",
13
+ "sep_token": "</s>",
14
+ "unk_token": "<unk>"
15
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "bos_token": "<s>",
4
+ "clean_up_tokenization_spaces": true,
5
+ "cls_token": "<s>",
6
+ "eos_token": "</s>",
7
+ "errors": "replace",
8
+ "mask_token": "<mask>",
9
+ "model_max_length": 1000000000000000019884624838656,
10
+ "pad_token": "<pad>",
11
+ "sep_token": "</s>",
12
+ "tokenizer_class": "RobertaTokenizer",
13
+ "trim_offsets": true,
14
+ "unk_token": "<unk>"
15
+ }