"""Gradio demo that highlights named entities found by the Impresso NER model."""

import gradio as gr
from transformers import pipeline, AutoTokenizer

# Define the model name
MODEL_NAME = "impresso-project/ner-stacked-bert-multilingual"

# Load the tokenizer and model using the pipeline
ner_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

ner_pipeline = pipeline(
    "generic-ner",
    model=MODEL_NAME,
    tokenizer=ner_tokenizer,
    trust_remote_code=True,
    device="cpu",
)


def prepare_entities_for_highlight(text: str, results: dict) -> dict:
    """Convert NER results into the dict format used by ``gr.HighlightedText``.

    Args:
        text: The original input text the entity offsets refer to.
        results: Mapping whose values are lists of entity dicts. Each entity
            dict is read for the keys ``"start"``, ``"end"`` and ``"entity"``.

    Returns:
        ``{"text": text, "entities": [...]}`` where each entity carries
        ``"start"``, ``"end"``, ``"entity"`` and ``"label"``, sorted by
        ``"start"``.
    """
    entities = []
    # Only *identical* (start, end) spans are filtered here; spans that merely
    # overlap or nest are all kept.
    # NOTE(review): confirm how HighlightedText renders overlapping spans.
    seen_spans = set()

    # Print debug info about tokenization
    print(f"Original text: {text}")

    for entity_list in results.values():
        for entity in entity_list:
            entity_span = (entity["start"], entity["end"])

            # Skip spans that were already emitted (possibly by another category)
            if entity_span in seen_spans:
                continue
            seen_spans.add(entity_span)

            # Slice of the input covered by this entity, used for debugging only
            entity_text = text[entity["start"] : entity["end"]].strip()
            print(
                f"Entity text: {entity_text}, Start: {entity['start']}, End: {entity['end']}, Type: {entity['entity']}"
            )

            label = f"{entity['entity']}"
            entities.append(
                {
                    "start": entity["start"],
                    "end": entity["end"],
                    # HighlightedText reads the category from "entity"
                    # (or "entity_group"); without it spans get no label.
                    "entity": label,
                    # Kept so existing consumers of "label" keep working.
                    "label": label,
                }
            )

    # Sort entities by their start position
    entities = sorted(entities, key=lambda x: x["start"])

    return {"text": text, "entities": entities}


def extract_entities(sentence: str) -> dict:
    """Run the NER pipeline on ``sentence`` and format the output for display.

    Args:
        sentence: Raw text entered by the user.

    Returns:
        The ``{"text": ..., "entities": ...}`` dict built by
        ``prepare_entities_for_highlight``.
    """
    results = ner_pipeline(sentence)

    # Debugging the result format
    print(f"NER results: {results}")

    # Format the results for HighlightedText
    return prepare_entities_for_highlight(sentence, results)


def ner_app_interface():
    """Build the Gradio interface for the NER demo and launch it.

    Launches with ``share=True``, i.e. requests a public share link.
    """
    input_sentence = gr.Textbox(
        lines=5, label="Input Sentence", placeholder="Enter a sentence for NER..."
    )
    output_entities = gr.HighlightedText(label="Extracted Entities")

    # Interface definition
    interface = gr.Interface(
        fn=extract_entities,
        inputs=input_sentence,
        outputs=output_entities,
        title="Named Entity Recognition",
        description="Enter a sentence to extract named entities using the NER model from the Impresso project.",
        examples=[
            [
                "In the year 1789, King Louis XVI, ruler of France, convened the Estates-General at the Palace of Versailles."
            ]
        ],
        live=False,
    )

    interface.launch(share=True)


# Run the app
if __name__ == "__main__":
    ner_app_interface()