# embedding / app.py
# Repository page metadata (not code): account "codeblacks"; last commit
# "Update app.py" (c6cd033, verified); file size 1.19 kB.
from transformers import AutoTokenizer, AutoModel
import torch
import gradio as gr
# Load the paraphrase-mpnet-base-v2 tokenizer and encoder once at import time.
# Both come from the same checkpoint, so the identifier is spelled out once.
_CHECKPOINT = 'sentence-transformers/paraphrase-mpnet-base-v2'
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)
model = AutoModel.from_pretrained(_CHECKPOINT)
def get_mpnet_embeddings(sentences):
    """Embed one or more sentences with paraphrase-mpnet-base-v2.

    Args:
        sentences: Either a single string, which is split into one sentence
            per non-blank line (this is what the Gradio textbox delivers),
            or an iterable of sentence strings.

    Returns:
        A list holding one embedding (a list of floats) per sentence, in
        input order. An empty list is returned when the input contains no
        non-blank sentence.
    """
    if sentences is None:
        return []
    if isinstance(sentences, str):
        # The UI asks for "one per line", so every line is its own sentence
        # rather than the whole box being embedded as a single text.
        sentences = sentences.splitlines()

    # Drop blank entries so they do not produce meaningless embeddings.
    cleaned = [s.strip() for s in sentences if s and s.strip()]
    if not cleaned:
        return []

    # Tokenize the batch; shorter sentences are padded to the longest one.
    encoded = tokenizer(
        cleaned,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=512,
    )

    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = model(**encoded)

    token_embeddings = outputs.last_hidden_state
    # Mean pooling must ignore padding positions; a plain mean over the
    # sequence axis would dilute the embeddings of the shorter sentences
    # in a batch with the vectors of their padding tokens.
    mask = encoded['attention_mask'].unsqueeze(-1).to(token_embeddings.dtype)
    summed = (token_embeddings * mask).sum(dim=1)
    # Clamp guards against division by zero for a fully masked row.
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    embeddings = summed / token_counts

    return embeddings.cpu().numpy().tolist()
# Build the web UI: a free-text box as input, the embeddings shown as JSON.
_sentence_box = gr.Textbox(lines=2, placeholder="Enter sentences here, one per line")
_embedding_view = gr.JSON()

interface = gr.Interface(
    fn=get_mpnet_embeddings,
    inputs=_sentence_box,
    outputs=_embedding_view,
    title="Sentence Embeddings with MPNet",
    description="Enter sentences to get their embeddings with paraphrase-mpnet-base-v2 (up to 512 tokens).",
)

# Start serving the app.
interface.launch()