import gradio as gr
from transformers import pipeline, AutoTokenizer


# Specify the name of the model
model_name = 'ehri-ner/xlm-roberta-large-ehri-ner-all'

# Load the tokenizer explicitly so the pipeline reuses it
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Load the NER pipeline from Hugging Face
ner_model = pipeline('ner', model=model_name, tokenizer=tokenizer)

def predict(text):
    # Use the model to predict the named entities for each token
    # (the pipeline tokenizes the text internally)
    entities = ner_model(text)

    # Initialize an empty dictionary to store the results
    result = {}

    # Iterate over the token-level entities and merge subword pieces
    for entity in entities:
        # Get the token and its predicted entity type
        word = entity['word']
        entity_type = entity['entity']

        # XLM-RoBERTa uses a SentencePiece tokenizer: a piece that starts a
        # new word carries a leading '▁' marker, and continuation pieces have
        # no marker (there is no BERT-style '##' prefix to check for).
        if word.startswith('▁') or not result:
            # Start a new word, stripping the word-boundary marker
            result[word.lstrip('▁')] = entity_type
        else:
            # Continuation piece: append it to the last word in the result
            last_word = list(result.keys())[-1]
            result[last_word + word] = result.pop(last_word)

    return result

# Define the Gradio interface
iface = gr.Interface(fn=predict, 
                     inputs=gr.Textbox(lines=2, placeholder='Enter text here...'), 
                     outputs='json')

# Launch the interface
iface.launch()
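
# Note: a simpler alternative (a sketch, not part of the original app) is to
# let the pipeline merge subword pieces itself via its aggregation_strategy
# parameter; each grouped result then exposes an 'entity_group' for the merged
# word. The names ner_grouped and predict_grouped below are illustrative only:
#
#   ner_grouped = pipeline('ner', model=model_name, tokenizer=tokenizer,
#                          aggregation_strategy='simple')
#
#   def predict_grouped(text):
#       # Map each merged entity span to its predicted entity group
#       return {e['word']: e['entity_group'] for e in ner_grouped(text)}
#
# Passing predict_grouped as fn to gr.Interface would replace the manual
# subword merging done in predict above.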