zhonghhh's picture
Update README.md
2050d34 verified
---
tags:
- mteb
model-index:
- name: Dmeta-embedding-zh-small
results:
- task:
type: STS
dataset:
type: C-MTEB/AFQMC
name: MTEB AFQMC
config: default
split: validation
revision: None
metrics:
- type: cos_sim_pearson
value: 55.38441014851534
- type: cos_sim_spearman
value: 59.54284362578262
- type: euclidean_pearson
value: 58.18592108890414
- type: euclidean_spearman
value: 59.54284362133902
- type: manhattan_pearson
value: 58.142197046175916
- type: manhattan_spearman
value: 59.47943468645265
- task:
type: STS
dataset:
type: C-MTEB/ATEC
name: MTEB ATEC
config: default
split: test
revision: None
metrics:
- type: cos_sim_pearson
value: 55.96911621560259
- type: cos_sim_spearman
value: 58.6334496101353
- type: euclidean_pearson
value: 62.78426382809823
- type: euclidean_spearman
value: 58.63344961011331
- type: manhattan_pearson
value: 62.80625401678188
- type: manhattan_spearman
value: 58.618722128260394
- task:
type: Classification
dataset:
type: mteb/amazon_reviews_multi
name: MTEB AmazonReviewsClassification (zh)
config: zh
split: test
revision: 1399c76144fd37290681b995c656ef9b2e06e26d
metrics:
- type: accuracy
value: 44.88
- type: f1
value: 42.739249460584375
- task:
type: STS
dataset:
type: C-MTEB/BQ
name: MTEB BQ
config: default
split: test
revision: None
metrics:
- type: cos_sim_pearson
value: 68.56815521242152
- type: cos_sim_spearman
value: 70.30776353631751
- type: euclidean_pearson
value: 69.10087719019191
- type: euclidean_spearman
value: 70.30775660748148
- type: manhattan_pearson
value: 69.0672710967445
- type: manhattan_spearman
value: 70.31940638148254
- task:
type: Clustering
dataset:
type: C-MTEB/CLSClusteringP2P
name: MTEB CLSClusteringP2P
config: default
split: test
revision: None
metrics:
- type: v_measure
value: 40.7861976704356
- task:
type: Clustering
dataset:
type: C-MTEB/CLSClusteringS2S
name: MTEB CLSClusteringS2S
config: default
split: test
revision: None
metrics:
- type: v_measure
value: 38.43028280281822
- task:
type: Reranking
dataset:
type: C-MTEB/CMedQAv1-reranking
name: MTEB CMedQAv1
config: default
split: test
revision: None
metrics:
- type: map
value: 86.78386695617407
- type: mrr
value: 88.79857142857142
- task:
type: Reranking
dataset:
type: C-MTEB/CMedQAv2-reranking
name: MTEB CMedQAv2
config: default
split: test
revision: None
metrics:
- type: map
value: 87.38582377194436
- type: mrr
value: 89.17158730158731
- task:
type: Retrieval
dataset:
type: C-MTEB/CmedqaRetrieval
name: MTEB CmedqaRetrieval
config: default
split: dev
revision: None
metrics:
- type: map_at_1
value: 23.746000000000002
- type: map_at_10
value: 35.952
- type: map_at_100
value: 37.946999999999996
- type: map_at_1000
value: 38.059
- type: map_at_3
value: 31.680999999999997
- type: map_at_5
value: 34.046
- type: mrr_at_1
value: 36.409000000000006
- type: mrr_at_10
value: 44.801
- type: mrr_at_100
value: 45.842
- type: mrr_at_1000
value: 45.885999999999996
- type: mrr_at_3
value: 42.081
- type: mrr_at_5
value: 43.613
- type: ndcg_at_1
value: 36.409000000000006
- type: ndcg_at_10
value: 42.687000000000005
- type: ndcg_at_100
value: 50.352
- type: ndcg_at_1000
value: 52.275000000000006
- type: ndcg_at_3
value: 37.113
- type: ndcg_at_5
value: 39.434000000000005
- type: precision_at_1
value: 36.409000000000006
- type: precision_at_10
value: 9.712
- type: precision_at_100
value: 1.584
- type: precision_at_1000
value: 0.182
- type: precision_at_3
value: 21.096999999999998
- type: precision_at_5
value: 15.498999999999999
- type: recall_at_1
value: 23.746000000000002
- type: recall_at_10
value: 53.596
- type: recall_at_100
value: 85.232
- type: recall_at_1000
value: 98.092
- type: recall_at_3
value: 37.226
- type: recall_at_5
value: 44.187
- task:
type: PairClassification
dataset:
type: C-MTEB/CMNLI
name: MTEB Cmnli
config: default
split: validation
revision: None
metrics:
- type: cos_sim_accuracy
value: 82.66987372218881
- type: cos_sim_ap
value: 90.28715189799232
- type: cos_sim_f1
value: 84.108318049412
- type: cos_sim_precision
value: 78.0849358974359
- type: cos_sim_recall
value: 91.13864858545709
- type: dot_accuracy
value: 82.66987372218881
- type: dot_ap
value: 90.29346021403634
- type: dot_f1
value: 84.108318049412
- type: dot_precision
value: 78.0849358974359
- type: dot_recall
value: 91.13864858545709
- type: euclidean_accuracy
value: 82.66987372218881
- type: euclidean_ap
value: 90.28656734732074
- type: euclidean_f1
value: 84.108318049412
- type: euclidean_precision
value: 78.0849358974359
- type: euclidean_recall
value: 91.13864858545709
- type: manhattan_accuracy
value: 82.70595309681299
- type: manhattan_ap
value: 90.25413574022456
- type: manhattan_f1
value: 83.9924670433145
- type: manhattan_precision
value: 79.81052631578947
- type: manhattan_recall
value: 88.63689501987373
- type: max_accuracy
value: 82.70595309681299
- type: max_ap
value: 90.29346021403634
- type: max_f1
value: 84.108318049412
- task:
type: Retrieval
dataset:
type: C-MTEB/CovidRetrieval
name: MTEB CovidRetrieval
config: default
split: dev
revision: None
metrics:
- type: map_at_1
value: 61.38
- type: map_at_10
value: 71.23
- type: map_at_100
value: 71.61800000000001
- type: map_at_1000
value: 71.63000000000001
- type: map_at_3
value: 69.31
- type: map_at_5
value: 70.403
- type: mrr_at_1
value: 61.538000000000004
- type: mrr_at_10
value: 71.28999999999999
- type: mrr_at_100
value: 71.666
- type: mrr_at_1000
value: 71.678
- type: mrr_at_3
value: 69.44200000000001
- type: mrr_at_5
value: 70.506
- type: ndcg_at_1
value: 61.538000000000004
- type: ndcg_at_10
value: 75.626
- type: ndcg_at_100
value: 77.449
- type: ndcg_at_1000
value: 77.73400000000001
- type: ndcg_at_3
value: 71.75200000000001
- type: ndcg_at_5
value: 73.695
- type: precision_at_1
value: 61.538000000000004
- type: precision_at_10
value: 9.009
- type: precision_at_100
value: 0.9860000000000001
- type: precision_at_1000
value: 0.101
- type: precision_at_3
value: 26.379
- type: precision_at_5
value: 16.797
- type: recall_at_1
value: 61.38
- type: recall_at_10
value: 89.199
- type: recall_at_100
value: 97.576
- type: recall_at_1000
value: 99.789
- type: recall_at_3
value: 78.635
- type: recall_at_5
value: 83.325
- task:
type: Retrieval
dataset:
type: C-MTEB/DuRetrieval
name: MTEB DuRetrieval
config: default
split: dev
revision: None
metrics:
- type: map_at_1
value: 23.067
- type: map_at_10
value: 70.658
- type: map_at_100
value: 73.85300000000001
- type: map_at_1000
value: 73.925
- type: map_at_3
value: 48.391
- type: map_at_5
value: 61.172000000000004
- type: mrr_at_1
value: 83.1
- type: mrr_at_10
value: 88.214
- type: mrr_at_100
value: 88.298
- type: mrr_at_1000
value: 88.304
- type: mrr_at_3
value: 87.717
- type: mrr_at_5
value: 88.03699999999999
- type: ndcg_at_1
value: 83.1
- type: ndcg_at_10
value: 79.89
- type: ndcg_at_100
value: 83.829
- type: ndcg_at_1000
value: 84.577
- type: ndcg_at_3
value: 78.337
- type: ndcg_at_5
value: 77.224
- type: precision_at_1
value: 83.1
- type: precision_at_10
value: 38.934999999999995
- type: precision_at_100
value: 4.6690000000000005
- type: precision_at_1000
value: 0.484
- type: precision_at_3
value: 70.48299999999999
- type: precision_at_5
value: 59.68
- type: recall_at_1
value: 23.067
- type: recall_at_10
value: 81.702
- type: recall_at_100
value: 94.214
- type: recall_at_1000
value: 98.241
- type: recall_at_3
value: 51.538
- type: recall_at_5
value: 67.39
- task:
type: Retrieval
dataset:
type: C-MTEB/EcomRetrieval
name: MTEB EcomRetrieval
config: default
split: dev
revision: None
metrics:
- type: map_at_1
value: 49.8
- type: map_at_10
value: 59.46399999999999
- type: map_at_100
value: 60.063
- type: map_at_1000
value: 60.08
- type: map_at_3
value: 56.833
- type: map_at_5
value: 58.438
- type: mrr_at_1
value: 49.8
- type: mrr_at_10
value: 59.46399999999999
- type: mrr_at_100
value: 60.063
- type: mrr_at_1000
value: 60.08
- type: mrr_at_3
value: 56.833
- type: mrr_at_5
value: 58.438
- type: ndcg_at_1
value: 49.8
- type: ndcg_at_10
value: 64.48
- type: ndcg_at_100
value: 67.314
- type: ndcg_at_1000
value: 67.745
- type: ndcg_at_3
value: 59.06400000000001
- type: ndcg_at_5
value: 61.973
- type: precision_at_1
value: 49.8
- type: precision_at_10
value: 8.04
- type: precision_at_100
value: 0.935
- type: precision_at_1000
value: 0.097
- type: precision_at_3
value: 21.833
- type: precision_at_5
value: 14.52
- type: recall_at_1
value: 49.8
- type: recall_at_10
value: 80.4
- type: recall_at_100
value: 93.5
- type: recall_at_1000
value: 96.8
- type: recall_at_3
value: 65.5
- type: recall_at_5
value: 72.6
- task:
type: Classification
dataset:
type: C-MTEB/IFlyTek-classification
name: MTEB IFlyTek
config: default
split: validation
revision: None
metrics:
- type: accuracy
value: 49.111196614082345
- type: f1
value: 37.07930546974089
- task:
type: Classification
dataset:
type: C-MTEB/JDReview-classification
name: MTEB JDReview
config: default
split: test
revision: None
metrics:
- type: accuracy
value: 85.57223264540339
- type: ap
value: 53.30690968994808
- type: f1
value: 80.20587062271773
- task:
type: STS
dataset:
type: C-MTEB/LCQMC
name: MTEB LCQMC
config: default
split: test
revision: None
metrics:
- type: cos_sim_pearson
value: 73.03085269274996
- type: cos_sim_spearman
value: 78.72837937949888
- type: euclidean_pearson
value: 78.34911745798928
- type: euclidean_spearman
value: 78.72838602779268
- type: manhattan_pearson
value: 78.31833697617105
- type: manhattan_spearman
value: 78.69603741566397
- task:
type: Reranking
dataset:
type: C-MTEB/Mmarco-reranking
name: MTEB MMarcoReranking
config: default
split: dev
revision: None
metrics:
- type: map
value: 27.391692468538416
- type: mrr
value: 26.44682539682539
- task:
type: Retrieval
dataset:
type: C-MTEB/MMarcoRetrieval
name: MTEB MMarcoRetrieval
config: default
split: dev
revision: None
metrics:
- type: map_at_1
value: 57.206999999999994
- type: map_at_10
value: 66.622
- type: map_at_100
value: 67.12700000000001
- type: map_at_1000
value: 67.145
- type: map_at_3
value: 64.587
- type: map_at_5
value: 65.827
- type: mrr_at_1
value: 59.312
- type: mrr_at_10
value: 67.387
- type: mrr_at_100
value: 67.836
- type: mrr_at_1000
value: 67.851
- type: mrr_at_3
value: 65.556
- type: mrr_at_5
value: 66.66
- type: ndcg_at_1
value: 59.312
- type: ndcg_at_10
value: 70.748
- type: ndcg_at_100
value: 73.076
- type: ndcg_at_1000
value: 73.559
- type: ndcg_at_3
value: 66.81200000000001
- type: ndcg_at_5
value: 68.92399999999999
- type: precision_at_1
value: 59.312
- type: precision_at_10
value: 8.798
- type: precision_at_100
value: 0.996
- type: precision_at_1000
value: 0.104
- type: precision_at_3
value: 25.487
- type: precision_at_5
value: 16.401
- type: recall_at_1
value: 57.206999999999994
- type: recall_at_10
value: 82.767
- type: recall_at_100
value: 93.449
- type: recall_at_1000
value: 97.262
- type: recall_at_3
value: 72.271
- type: recall_at_5
value: 77.291
- task:
type: Classification
dataset:
type: mteb/amazon_massive_intent
name: MTEB MassiveIntentClassification (zh-CN)
config: zh-CN
split: test
revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7
metrics:
- type: accuracy
value: 70.78345662407531
- type: f1
value: 68.35683436974351
- task:
type: Classification
dataset:
type: mteb/amazon_massive_scenario
name: MTEB MassiveScenarioClassification (zh-CN)
config: zh-CN
split: test
revision: 7d571f92784cd94a019292a1f45445077d0ef634
metrics:
- type: accuracy
value: 73.16408876933423
- type: f1
value: 73.31484873459382
- task:
type: Retrieval
dataset:
type: C-MTEB/MedicalRetrieval
name: MTEB MedicalRetrieval
config: default
split: dev
revision: None
metrics:
- type: map_at_1
value: 51.4
- type: map_at_10
value: 57.091
- type: map_at_100
value: 57.652
- type: map_at_1000
value: 57.703
- type: map_at_3
value: 55.733
- type: map_at_5
value: 56.363
- type: mrr_at_1
value: 51.7
- type: mrr_at_10
value: 57.243
- type: mrr_at_100
value: 57.80499999999999
- type: mrr_at_1000
value: 57.855999999999995
- type: mrr_at_3
value: 55.883
- type: mrr_at_5
value: 56.513000000000005
- type: ndcg_at_1
value: 51.4
- type: ndcg_at_10
value: 59.948
- type: ndcg_at_100
value: 63.064
- type: ndcg_at_1000
value: 64.523
- type: ndcg_at_3
value: 57.089999999999996
- type: ndcg_at_5
value: 58.214
- type: precision_at_1
value: 51.4
- type: precision_at_10
value: 6.9
- type: precision_at_100
value: 0.845
- type: precision_at_1000
value: 0.096
- type: precision_at_3
value: 20.333000000000002
- type: precision_at_5
value: 12.740000000000002
- type: recall_at_1
value: 51.4
- type: recall_at_10
value: 69.0
- type: recall_at_100
value: 84.5
- type: recall_at_1000
value: 96.2
- type: recall_at_3
value: 61.0
- type: recall_at_5
value: 63.7
- task:
type: Classification
dataset:
type: C-MTEB/MultilingualSentiment-classification
name: MTEB MultilingualSentiment
config: default
split: validation
revision: None
metrics:
- type: accuracy
value: 74.38999999999999
- type: f1
value: 74.07161306140839
- task:
type: PairClassification
dataset:
type: C-MTEB/OCNLI
name: MTEB Ocnli
config: default
split: validation
revision: None
metrics:
- type: cos_sim_accuracy
value: 81.15863562533838
- type: cos_sim_ap
value: 84.84571607908443
- type: cos_sim_f1
value: 82.55872063968016
- type: cos_sim_precision
value: 78.36812144212524
- type: cos_sim_recall
value: 87.22280887011615
- type: dot_accuracy
value: 81.15863562533838
- type: dot_ap
value: 84.84571607908443
- type: dot_f1
value: 82.55872063968016
- type: dot_precision
value: 78.36812144212524
- type: dot_recall
value: 87.22280887011615
- type: euclidean_accuracy
value: 81.15863562533838
- type: euclidean_ap
value: 84.84571607908443
- type: euclidean_f1
value: 82.55872063968016
- type: euclidean_precision
value: 78.36812144212524
- type: euclidean_recall
value: 87.22280887011615
- type: manhattan_accuracy
value: 80.7796426637791
- type: manhattan_ap
value: 84.81524098914134
- type: manhattan_f1
value: 82.36462990561351
- type: manhattan_precision
value: 77.76735459662288
- type: manhattan_recall
value: 87.53959873284055
- type: max_accuracy
value: 81.15863562533838
- type: max_ap
value: 84.84571607908443
- type: max_f1
value: 82.55872063968016
- task:
type: Classification
dataset:
type: C-MTEB/OnlineShopping-classification
name: MTEB OnlineShopping
config: default
split: test
revision: None
metrics:
- type: accuracy
value: 93.12000000000002
- type: ap
value: 91.0749103088623
- type: f1
value: 93.10837266607813
- task:
type: STS
dataset:
type: C-MTEB/PAWSX
name: MTEB PAWSX
config: default
split: test
revision: None
metrics:
- type: cos_sim_pearson
value: 38.5692290188029
- type: cos_sim_spearman
value: 42.965264868554335
- type: euclidean_pearson
value: 43.002526263615735
- type: euclidean_spearman
value: 42.97561576045246
- type: manhattan_pearson
value: 43.050089639788936
- type: manhattan_spearman
value: 43.038497558804934
- task:
type: STS
dataset:
type: C-MTEB/QBQTC
name: MTEB QBQTC
config: default
split: test
revision: None
metrics:
- type: cos_sim_pearson
value: 38.99284895602663
- type: cos_sim_spearman
value: 41.02655813481606
- type: euclidean_pearson
value: 38.934953519378354
- type: euclidean_spearman
value: 41.02680077136343
- type: manhattan_pearson
value: 39.224809609807785
- type: manhattan_spearman
value: 41.13950779185706
- task:
type: STS
dataset:
type: mteb/sts22-crosslingual-sts
name: MTEB STS22 (zh)
config: zh
split: test
revision: 6d1ba47164174a496b7fa5d3569dae26a6813b80
metrics:
- type: cos_sim_pearson
value: 66.47464607633356
- type: cos_sim_spearman
value: 66.76311382148693
- type: euclidean_pearson
value: 67.25180409604143
- type: euclidean_spearman
value: 66.76311382148693
- type: manhattan_pearson
value: 67.6928257682864
- type: manhattan_spearman
value: 67.08172581019826
- task:
type: STS
dataset:
type: C-MTEB/STSB
name: MTEB STSB
config: default
split: test
revision: None
metrics:
- type: cos_sim_pearson
value: 77.48943840585562
- type: cos_sim_spearman
value: 79.0869194735025
- type: euclidean_pearson
value: 79.48559575794792
- type: euclidean_spearman
value: 79.08765044225807
- type: manhattan_pearson
value: 79.36157224751007
- type: manhattan_spearman
value: 78.94400905463999
- task:
type: Reranking
dataset:
type: C-MTEB/T2Reranking
name: MTEB T2Reranking
config: default
split: dev
revision: None
metrics:
- type: map
value: 66.1093201711458
- type: mrr
value: 75.70959742506797
- task:
type: Retrieval
dataset:
type: C-MTEB/T2Retrieval
name: MTEB T2Retrieval
config: default
split: dev
revision: None
metrics:
- type: map_at_1
value: 25.533
- type: map_at_10
value: 71.322
- type: map_at_100
value: 75.244
- type: map_at_1000
value: 75.333
- type: map_at_3
value: 50.15500000000001
- type: map_at_5
value: 61.514
- type: mrr_at_1
value: 86.126
- type: mrr_at_10
value: 89.462
- type: mrr_at_100
value: 89.58500000000001
- type: mrr_at_1000
value: 89.59
- type: mrr_at_3
value: 88.88000000000001
- type: mrr_at_5
value: 89.241
- type: ndcg_at_1
value: 86.126
- type: ndcg_at_10
value: 79.89500000000001
- type: ndcg_at_100
value: 84.405
- type: ndcg_at_1000
value: 85.286
- type: ndcg_at_3
value: 81.547
- type: ndcg_at_5
value: 79.834
- type: precision_at_1
value: 86.126
- type: precision_at_10
value: 39.972
- type: precision_at_100
value: 4.932
- type: precision_at_1000
value: 0.514
- type: precision_at_3
value: 71.49
- type: precision_at_5
value: 59.687
- type: recall_at_1
value: 25.533
- type: recall_at_10
value: 78.962
- type: recall_at_100
value: 93.413
- type: recall_at_1000
value: 97.89099999999999
- type: recall_at_3
value: 52.129000000000005
- type: recall_at_5
value: 65.444
- task:
type: Classification
dataset:
type: C-MTEB/TNews-classification
name: MTEB TNews
config: default
split: validation
revision: None
metrics:
- type: accuracy
value: 51.800000000000004
- type: f1
value: 50.07807183704828
- task:
type: Clustering
dataset:
type: C-MTEB/ThuNewsClusteringP2P
name: MTEB ThuNewsClusteringP2P
config: default
split: test
revision: None
metrics:
- type: v_measure
value: 65.15253218390774
- task:
type: Clustering
dataset:
type: C-MTEB/ThuNewsClusteringS2S
name: MTEB ThuNewsClusteringS2S
config: default
split: test
revision: None
metrics:
- type: v_measure
value: 58.81779372506517
- task:
type: Retrieval
dataset:
type: C-MTEB/VideoRetrieval
name: MTEB VideoRetrieval
config: default
split: dev
revision: None
metrics:
- type: map_at_1
value: 53.0
- type: map_at_10
value: 63.422999999999995
- type: map_at_100
value: 63.995000000000005
- type: map_at_1000
value: 64.004
- type: map_at_3
value: 61.382999999999996
- type: map_at_5
value: 62.488
- type: mrr_at_1
value: 53.0
- type: mrr_at_10
value: 63.422999999999995
- type: mrr_at_100
value: 63.995000000000005
- type: mrr_at_1000
value: 64.004
- type: mrr_at_3
value: 61.382999999999996
- type: mrr_at_5
value: 62.488
- type: ndcg_at_1
value: 53.0
- type: ndcg_at_10
value: 68.301
- type: ndcg_at_100
value: 70.988
- type: ndcg_at_1000
value: 71.294
- type: ndcg_at_3
value: 64.11
- type: ndcg_at_5
value: 66.094
- type: precision_at_1
value: 53.0
- type: precision_at_10
value: 8.35
- type: precision_at_100
value: 0.958
- type: precision_at_1000
value: 0.098
- type: precision_at_3
value: 24.0
- type: precision_at_5
value: 15.36
- type: recall_at_1
value: 53.0
- type: recall_at_10
value: 83.5
- type: recall_at_100
value: 95.8
- type: recall_at_1000
value: 98.3
- type: recall_at_3
value: 72.0
- type: recall_at_5
value: 76.8
- task:
type: Classification
dataset:
type: C-MTEB/waimai-classification
name: MTEB Waimai
config: default
split: test
revision: None
metrics:
- type: accuracy
value: 86.18
- type: ap
value: 69.04229346593745
- type: f1
value: 84.52986739717021
license: apache-2.0
---
<div align="center">
<img src="logo.png" alt="icon" width="100px"/>
</div>
<h1 align="center">Dmeta-embedding-zh-small</h1>
- Dmeta-embedding系列模型是跨领域、跨任务、开箱即用的中文 Embedding 模型,适用于搜索、问答、智能客服、LLM+RAG 等各种业务场景,支持使用 Transformers/Sentence-Transformers/Langchain 等工具加载推理。
- **Dmeta-embedding-zh-small**是开源模型[Dmeta-embedding-zh](https://huggingface.co/DMetaSoul/Dmeta-embedding-zh)的蒸馏版本(8层BERT),模型大小不到300M。相较于原始版本,Dmeta-embedding-zh-small模型大小减小三分之一,推理速度提升约30%,总体精度下降约1.4%。
---
## Evaluation
这里主要跟蒸馏前对应的 teacher 模型作了对比:
*性能:*(基于1万条数据测试,GPU设备是V100)
| | Teacher | Student | Gap |
| ---------- | ------------------------- | ------------------------------ | ----- |
| Model | Dmeta-Embedding-zh (411M) | Dmeta-Embedding-zh-small (297M)| 0.67x |
| Cost | 127s | 89s | -30% |
| Latency | 13ms | 9ms | -31% |
| Throughput | 78 sentence/s | 111 sentence/s | 1.4x |
*精度:*(参考自MTEB榜单)
| | **Classification** | **Clustering** | **Pair Classification** | **Reranking** | **Retrieval** | **STS** | **Avg** |
| ----------------------------- | ----------------- | -------------- | ----------------------- | ------------- | ------------- | ------- | ------- |
| **Dmeta-Embedding-zh** | 70 | 50.96 | 88.92 | 67.17 | 70.41 | 64.89 | 67.51 |
| **Dmeta-Embedding-zh-small** | 69.89 | 50.8 | 87.57 | 66.92 | 67.7 | 62.13 | 66.1 |
| **Gap** | -0.11 | -0.16 | -1.35 | -0.25 | -2.71 | -2.76 | -1.41 |
## Usage
目前模型支持通过 [Sentence-Transformers](#sentence-transformers), [Langchain](#langchain), [HuggingFace Transformers](#huggingface-transformers) 等主流框架进行推理,具体用法参考各个框架的示例。
### Sentence-Transformers
Dmeta-embedding 模型支持通过 [sentence-transformers](https://www.SBERT.net) 来加载推理:
```
pip install -U sentence-transformers
```
```python
from sentence_transformers import SentenceTransformer

# Two query sentences and four candidate sentences to match them against.
texts1 = ["胡子长得太快怎么办?", "在香港哪里买手表好"]
texts2 = ["胡子长得快怎么办?", "怎样使胡子不浓密!", "香港买手表哪里好", "在杭州手机到哪里买"]

model = SentenceTransformer('DMetaSoul/Dmeta-embedding-zh-small')
# With normalized embeddings the dot product below equals the cosine similarity.
embs1 = model.encode(texts1, normalize_embeddings=True)
embs2 = model.encode(texts2, normalize_embeddings=True)

# Pairwise similarity: one row per texts1 entry, one column per texts2 entry.
similarity = embs1 @ embs2.T
print(similarity)

# For every texts1[i], list all texts2[j] from most to least similar.
for i, query in enumerate(texts1):
    scores = sorted(zip(texts2, similarity[i]), key=lambda pair: pair[1], reverse=True)

    print(f"查询文本:{query}")
    for text2, score in scores:
        print(f"相似文本:{text2},打分:{score}")
    print()
```
示例输出如下:
```
查询文本:胡子长得太快怎么办?
相似文本:胡子长得快怎么办?,打分:0.965681254863739
相似文本:怎样使胡子不浓密!,打分:0.7353651523590088
相似文本:香港买手表哪里好,打分:0.24928246438503265
相似文本:在杭州手机到哪里买,打分:0.2038613110780716
查询文本:在香港哪里买手表好
相似文本:香港买手表哪里好,打分:0.9916468262672424
相似文本:在杭州手机到哪里买,打分:0.498248815536499
相似文本:胡子长得快怎么办?,打分:0.2424771636724472
相似文本:怎样使胡子不浓密!,打分:0.21715955436229706
```
### Langchain
Dmeta-embedding 模型支持通过 LLM 工具框架 [langchain](https://www.langchain.com/) 来加载推理:
```
pip install -U langchain
```
```python
import torch
import numpy as np

try:
    # Current home of the class (requires: pip install langchain-huggingface).
    from langchain_huggingface import HuggingFaceEmbeddings
except ImportError:
    # Legacy import path used by older LangChain releases.
    from langchain.embeddings import HuggingFaceEmbeddings

model_name = "DMetaSoul/Dmeta-embedding-zh-small"
model_kwargs = {'device': 'cuda' if torch.cuda.is_available() else 'cpu'}
encode_kwargs = {'normalize_embeddings': True}  # set True to compute cosine similarity

model = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

# Two query sentences and four candidate sentences to match them against.
texts1 = ["胡子长得太快怎么办?", "在香港哪里买手表好"]
texts2 = ["胡子长得快怎么办?", "怎样使胡子不浓密!", "香港买手表哪里好", "在杭州手机到哪里买"]

# The embedding results are converted to NumPy arrays so that the matrix
# product below can be computed.
embs1 = model.embed_documents(texts1)
embs2 = model.embed_documents(texts2)
embs1, embs2 = np.array(embs1), np.array(embs2)

# Pairwise similarity: one row per texts1 entry, one column per texts2 entry.
similarity = embs1 @ embs2.T
print(similarity)

# For every texts1[i], list all texts2[j] from most to least similar.
for i, query in enumerate(texts1):
    scores = sorted(zip(texts2, similarity[i]), key=lambda pair: pair[1], reverse=True)

    print(f"查询文本:{query}")
    for text2, score in scores:
        print(f"相似文本:{text2},打分:{score}")
    print()
```
### HuggingFace Transformers
Dmeta-embedding 模型支持通过 [HuggingFace Transformers](https://huggingface.co/docs/transformers/index) 框架来加载推理:
```
pip install -U transformers
```
```python
import torch
from transformers import AutoTokenizer, AutoModel
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, skipping masked-out tokens.

    Args:
        model_output: Model output whose first element holds the per-token
            embeddings.
        attention_mask: Mask with one entry per token. It is expanded over the
            embedding dimension and used as the weight of each token.

    Returns:
        The mask-weighted sum over dimension 1 divided by the mask total.
        The divisor is clamped to at least 1e-9 so that an all-zero mask
        does not cause a division by zero.
    """
    token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
def cls_pooling(model_output):
    """Return the embedding at token position 0 of every sequence.

    Args:
        model_output: Model output whose first element holds the per-token
            embeddings.

    Returns:
        The slice ``model_output[0][:, 0]``, i.e. one vector per sequence.
    """
    return model_output[0][:, 0]
# Two query sentences and four candidate sentences to match them against.
texts1 = ["胡子长得太快怎么办?", "在香港哪里买手表好"]
texts2 = ["胡子长得快怎么办?", "怎样使胡子不浓密!", "香港买手表哪里好", "在杭州手机到哪里买"]

tokenizer = AutoTokenizer.from_pretrained('DMetaSoul/Dmeta-embedding-zh-small')
model = AutoModel.from_pretrained('DMetaSoul/Dmeta-embedding-zh-small')
model.eval()

# Inference only: no gradients are needed, which also allows .numpy() below.
with torch.no_grad():
    inputs1 = tokenizer(texts1, padding=True, truncation=True, return_tensors='pt')
    inputs2 = tokenizer(texts2, padding=True, truncation=True, return_tensors='pt')

    model_output1 = model(**inputs1)
    model_output2 = model(**inputs2)
    # This example pools with the position-0 embedding of each sequence.
    embs1, embs2 = cls_pooling(model_output1), cls_pooling(model_output2)
    # L2-normalize so that the dot product below equals the cosine similarity.
    embs1 = torch.nn.functional.normalize(embs1, p=2, dim=1).numpy()
    embs2 = torch.nn.functional.normalize(embs2, p=2, dim=1).numpy()

# Pairwise similarity: one row per texts1 entry, one column per texts2 entry.
similarity = embs1 @ embs2.T
print(similarity)

# For every texts1[i], list all texts2[j] from most to least similar.
for i, query in enumerate(texts1):
    scores = sorted(zip(texts2, similarity[i]), key=lambda pair: pair[1], reverse=True)

    print(f"查询文本:{query}")
    for text2, score in scores:
        print(f"相似文本:{text2},打分:{score}")
    print()
```
## Contact
您如果在使用过程中,遇到任何问题,欢迎前往[讨论区](https://huggingface.co/DMetaSoul/Dmeta-embedding-zh-small/discussions)建言献策。
您也可以联系我们:赵中昊 <[email protected]>, 肖文斌 <[email protected]>, 孙凯 <[email protected]>
同时我们也开通了微信群,可扫码加入我们(人数超200了,先加管理员再拉进群),一起共建 AIGC 技术生态!
<img src="https://huggingface.co/DMetaSoul/Dmeta-embedding-zh-small/resolve/main/weixin.jpeg" alt="WeChat QR code" style="display: block; margin-left: auto; margin-right: auto; width: 256px; height: 358px;"/>
## License
Dmeta-embedding 系列模型采用 Apache-2.0 License,开源模型可以进行免费商用私有部署。