# NER helpers and constants
import html
import zlib
from typing import Any, Dict, List, Optional

# Standard NER entity types with descriptions.
# Insertion order matters: DEFAULT_SELECTED_ENTITIES is the first five keys.
NER_ENTITY_TYPES: Dict[str, str] = {
    "PERSON": "People, including fictional",
    "ORG": "Companies, agencies, institutions, etc.",
    "GPE": "Countries, cities, states",
    "LOC": "Non-GPE locations, mountain ranges, bodies of water",
    "PRODUCT": "Objects, vehicles, foods, etc. (not services)",
    "EVENT": "Named hurricanes, battles, wars, sports events, etc.",
    "WORK_OF_ART": "Titles of books, songs, etc.",
    "LAW": "Named documents made into laws",
    "LANGUAGE": "Any named language",
    "DATE": "Absolute or relative dates or periods",
    "TIME": "Times smaller than a day",
    "PERCENT": "Percentage (including '%')",
    "MONEY": "Monetary values, including unit",
    "QUANTITY": "Measurements, as of weight or distance",
    "ORDINAL": "'first', 'second', etc.",
    "CARDINAL": "Numerals that do not fall under another type",
    "NORP": "Nationalities or religious or political groups",
    "FAC": "Buildings, airports, highways, bridges, etc.",
}

# Default selected entity types (first 5 by default)
DEFAULT_SELECTED_ENTITIES: List[str] = list(NER_ENTITY_TYPES)[:5]

# Substrings that mark a model id as LLM-based (matched case-insensitively).
LLM_MODELS: List[str] = ["gemini", "gpt", "claude"]

# Background colors used to highlight entities; one is picked per label.
_ENTITY_COLORS: List[str] = [
    '#e3f2fd', '#e8f5e9', '#fff8e1', '#f3e5f5', '#e8eaf6',
    '#e0f7fa', '#f1f8e9', '#fce4ec', '#f5f5f5', '#fafafa',
    '#e1f5fe', '#f3e5f5', '#f1f8e9',
]


def is_llm_model(model_id: str) -> bool:
    """Check if the model is an LLM-based model.

    Returns True when any entry of ``LLM_MODELS`` occurs as a
    case-insensitive substring of *model_id*; an empty or ``None`` id is
    never an LLM model.
    """
    if not model_id:
        return False
    lowered = model_id.lower()
    return any(llm_model in lowered for llm_model in LLM_MODELS)


def _color_for_label(label: str) -> str:
    """Return the palette color for *label*, identical on every run."""
    # The built-in hash() of a str is salted per process, so it would give
    # the same label a different color after each restart; CRC32 is stable.
    checksum = zlib.crc32(str(label).encode("utf-8"))
    return _ENTITY_COLORS[checksum % len(_ENTITY_COLORS)]


def _select_spans(text, entities, selected_entity_types) -> List[Dict[str, Any]]:
    """Return the selected, non-overlapping entity spans ordered by start."""
    # Entities without offsets cannot be placed in the text, so drop them
    # up front instead of failing on a missing key later.
    located = [
        e for e in entities
        if e.get('start') is not None and e.get('end') is not None
    ]
    located.sort(key=lambda e: e['start'])

    spans: List[Dict[str, Any]] = []
    last_end = 0
    for e in located:
        label = e.get('type', e.get('entity', ''))
        # Filter on the label before the overlap check so an entity that is
        # not going to be shown can never hide one that is.
        if label not in selected_entity_types:
            continue
        start, end = e['start'], e['end']
        if start < last_end or end <= start:
            # Overlaps the previously kept span, or is empty/inverted.
            continue
        spans.append({
            'start': start,
            'end': end,
            'label': label,
            # Fall back to the source slice when the tagger gave no surface form.
            'text': e.get('word') or e.get('text') or text[start:end],
            'color': _color_for_label(label),
        })
        last_end = end
    return spans


# Render NER HTML for tagged view
def render_ner_html(
    text: Optional[str],
    entities: List[Dict[str, Any]],
    selected_entity_types: Optional[List[str]] = None,
) -> str:
    """Render *text* as an HTML fragment with the selected entities highlighted.

    Args:
        text: The source text the entity offsets refer to.
        entities: Dicts with ``start``/``end`` character offsets, a label
            under ``type`` (or ``entity``), and optionally the surface form
            under ``word`` (or ``text``).
        selected_entity_types: Labels to highlight; defaults to every key of
            ``NER_ENTITY_TYPES``.

    Returns:
        An HTML string. Text outside entities, entity text and labels are
        HTML-escaped. If *text* is blank or *entities* is empty, or no entity
        matches the selection, a short message is returned instead.

    NOTE(review): the tags were missing from the reviewed source (they appear
    to have been stripped), so the ``div``/``p``/``span``/``small`` markup
    below is a minimal reconstruction -- confirm against the styling the UI
    expects.
    """
    if not text or not text.strip() or not entities:
        return "<div>No named entities found in the text.</div>"

    if selected_entity_types is None:
        selected_entity_types = list(NER_ENTITY_TYPES)

    spans = _select_spans(text, entities, selected_entity_types)

    parts = ["<div>"]
    if not spans:
        parts.append("<p>")
        parts.append("No entities of the selected types found in the text.")
        parts.append("</p>")
    else:
        last_pos = 0
        for span in spans:
            start, end = span['start'], span['end']
            if start > last_pos:
                # Plain text between the previous entity and this one.
                parts.append(html.escape(text[last_pos:start]))
            parts.append(f'<span style="background-color: {span["color"]};">')
            parts.append(f"{html.escape(str(span['text']))} ")
            parts.append(f"<small>{html.escape(str(span['label']))}</small>")
            parts.append("</span>")
            last_pos = end
        if last_pos < len(text):
            parts.append(html.escape(text[last_pos:]))
    parts.append("</div>")
    return "".join(parts)