import torch
import gradio as gr
from transformers import LongformerTokenizer, LongformerModel

# Load the pre-trained Longformer model and tokenizer once at startup.
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
model = LongformerModel.from_pretrained('allenai/longformer-base-4096')
model.eval()  # inference only: disable dropout so embeddings are deterministic


def get_longformer_embeddings(sentences):
    """Return a mean-pooled Longformer embedding for each input sentence.

    Args:
        sentences: Newline-separated sentences as a single string (what the
            Gradio textbox delivers), or an already-split list of strings.

    Returns:
        A list of embedding vectors (each a list of floats), one per
        non-empty input line. Empty input yields an empty list.
    """
    # BUG FIX: the UI asks for "one sentence per line", but the raw textbox
    # string was previously tokenized as a single sequence, fusing every
    # line into one embedding. Split it so each line is embedded separately.
    if isinstance(sentences, str):
        sentences = [line.strip() for line in sentences.splitlines() if line.strip()]
    if not sentences:
        # Guard: the tokenizer raises on an empty batch.
        return []

    # Tokenize the batch with padding to the longest sentence; truncate
    # anything beyond the advertised 2048-token limit.
    inputs = tokenizer(
        sentences,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=2048,
    )

    with torch.no_grad():  # no gradients needed for inference
        outputs = model(**inputs)

    # Mean pooling over the token dimension -> one vector per sentence.
    # NOTE(review): this averages padding tokens into the mean as well; a
    # masked mean using inputs['attention_mask'] would be more faithful —
    # left unchanged here to preserve existing output values. Confirm intent.
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings.numpy().tolist()


# Gradio UI: multi-line textbox in, JSON list of embedding vectors out.
interface = gr.Interface(
    fn=get_longformer_embeddings,                                                    # Function to call
    inputs=gr.Textbox(lines=2, placeholder="Enter sentences here, one per line"),    # Input component
    outputs=gr.JSON(),                                                               # Output component
    title="Sentence Embeddings with Longformer",                                     # Interface title
    description="Enter sentences to get their embeddings with Longformer (up to 2048 tokens).",  # Description
)

# Launch the interface
interface.launch()