infly
/

inf-wse-v1-base-zh

Refactored the convert_embeddings_to_weights example to be self-contained and executable by including the imports, sample inputs, model loading, and embedding computation from the first code snippet.

Files changed (1) hide show

README.md +23 -1

README.md CHANGED Viewed

@@ -58,10 +58,31 @@ print(scores.tolist())
 #### Convert embeddings to lexical weights
 ```python
 from collections import OrderedDict
 def convert_embeddings_to_weights(embeddings, tokenizer):
     values, indices = torch.sort(embeddings, dim=-1, descending=True)
     token2weight = []
     for i in range(embeddings.size(0)):
         token2weight.append(OrderedDict())
@@ -77,6 +98,7 @@ def convert_embeddings_to_weights(embeddings, tokenizer):
 token2weight = convert_embeddings_to_weights(embeddings, tokenizer)
 print(token2weight[0])
 # OrderedDict([('一体机', 3.3438382148742676), ('由', 2.493837356567383), ('电脑', 2.0291812419891357), ('构成', 1.986171841621399), ('什么', 1.0218793153762817)])
 ```

 #### Convert embeddings to lexical weights
 ```python
+import torch
+from transformers import AutoTokenizer, AutoModel
 from collections import OrderedDict
+queries = ['电脑一体机由什么构成？', '什么是掌上电脑？']
+documents = [
+    '电脑一体机，是由一台显示器、一个电脑键盘和一个鼠标组成的电脑。',
+    '掌上电脑是一种运行在嵌入式操作系统和内嵌式应用软件之上的、小巧、轻便、易带、实用、价廉的手持式计算设备。',
+]
+input_texts = queries + documents
+tokenizer = AutoTokenizer.from_pretrained("infly/inf-wse-v1-base-zh", trust_remote_code=True, use_fast=False)
+model = AutoModel.from_pretrained("infly/inf-wse-v1-base-zh", trust_remote_code=True)
+model.eval()
+max_length = 512
+input_batch = tokenizer(input_texts, padding=True, max_length=max_length, truncation=True, return_tensors="pt")
+with torch.no_grad():
+    embeddings = model(input_batch['input_ids'], input_batch['attention_mask'], return_sparse=False)
 def convert_embeddings_to_weights(embeddings, tokenizer):
     values, indices = torch.sort(embeddings, dim=-1, descending=True)
     token2weight = []
     for i in range(embeddings.size(0)):
         token2weight.append(OrderedDict())
 token2weight = convert_embeddings_to_weights(embeddings, tokenizer)
 print(token2weight[0])
 # OrderedDict([('一体机', 3.3438382148742676), ('由', 2.493837356567383), ('电脑', 2.0291812419891357), ('构成', 1.986171841621399), ('什么', 1.0218793153762817)])
 ```