embedding / app.py
codeblacks's picture
Create app.py
52773ee verified
raw
history blame
1.17 kB
from transformers import LongformerTokenizer, LongformerModel
import torch
import gradio as gr
# Load the pre-trained Longformer model and tokenizer
# (downloads weights from the Hugging Face Hub on first run; the model
# supports inputs up to 4096 tokens, though this app truncates at 2048).
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
model = LongformerModel.from_pretrained('allenai/longformer-base-4096')
def get_longformer_embeddings(sentences):
    """Return a mean-pooled Longformer embedding for each input sentence.

    Parameters
    ----------
    sentences : str
        Newline-separated sentences, as delivered by the Gradio textbox
        ("one per line" per the UI description).

    Returns
    -------
    list[list[float]]
        One embedding vector (hidden-size floats) per non-empty input
        line; an empty list when the input contains no text.
    """
    # The UI promises "one sentence per line", but Gradio hands us a single
    # string — split it so each line gets its own embedding (the original
    # code embedded the whole textbox as one sequence).
    lines = [line.strip() for line in sentences.splitlines() if line.strip()]
    if not lines:
        return []
    # Tokenize the batch; pad to the longest line, truncate at 2048 tokens.
    inputs = tokenizer(lines, return_tensors='pt', padding=True,
                       truncation=True, max_length=2048)
    # Inference only — no gradients needed.
    with torch.no_grad():
        outputs = model(**inputs)
    # Mask-aware mean pooling: average only over real tokens so that
    # padding (added to shorter lines in the batch) does not skew the
    # embedding, unlike a plain mean over the sequence dimension.
    mask = inputs['attention_mask'].unsqueeze(-1).to(outputs.last_hidden_state.dtype)
    summed = (outputs.last_hidden_state * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1)  # guard against divide-by-zero
    embeddings = summed / counts
    return embeddings.numpy().tolist()
# Build the Gradio UI: a multiline textbox in, JSON-serialized embeddings out.
input_box = gr.Textbox(lines=2, placeholder="Enter sentences here, one per line")
output_view = gr.JSON()

interface = gr.Interface(
    fn=get_longformer_embeddings,
    inputs=input_box,
    outputs=output_view,
    title="Sentence Embeddings with Longformer",
    description="Enter sentences to get their embeddings with Longformer (up to 2048 tokens).",
)

# Start the web server.
interface.launch()