import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel


# A class for embedding sentences using Hugging Face Transformers
class EmbeddingModel:
    def __init__(self, model_name='sentence-transformers/all-MiniLM-L6-v2'):
        # Load the tokenizer and model for the given model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        # Infer the embedding dimension by encoding a dummy input
        self.embedding_dim = self.encode('Hi').shape[1]

    def _mean_pooling(self, model_output, attention_mask):
        # Average the token embeddings, using the attention mask to ignore padding tokens
        token_embeddings = model_output[0]  # first element: last hidden state
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        embedding = torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return embedding

    def encode(self, text):
        # Encode a string (or list of strings) into sentence embeddings
        inputs = self.tokenizer(text, padding=True, truncation=True, return_tensors='pt')
        with torch.no_grad():
            outputs = self.model(**inputs)
        sentence_embeddings = self._mean_pooling(outputs, inputs['attention_mask'])
        # L2-normalize so that cosine similarity reduces to a dot product
        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1).numpy().astype('float32')
        return sentence_embeddings
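

# Because encode() L2-normalizes its output, cosine similarity between two
# embedding matrices is just a matrix product. A minimal sketch; the helper
# name `cosine_similarity` is our own illustration, not part of the model's API.
def cosine_similarity(a, b):
    # a: (n, dim) and b: (m, dim) float32 arrays with unit-norm rows
    return a @ b.T  # (n, m) matrix of pairwise cosine similarities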
if __name__ == '__main__':
    model = EmbeddingModel()
    # Print the embedding dimension of the model
    print(model.embedding_dim)
    # Sentences we want sentence embeddings for
    sentences = ['This is an example sentence', 'Each sentence is converted']
    embeddings = model.encode(sentences)
    print(embeddings.shape)  # (2, embedding_dim)
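    # Usage sketch: score the example sentences against each other with the
    # cosine_similarity helper defined above (an illustrative helper, not part
    # of the original script).
    scores = cosine_similarity(embeddings, embeddings)
    print(scores)  # 2x2 matrix; the diagonal is ~1.0 (self-similarity)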