"""NER helpers and constants."""

import html as html_lib
import zlib
from typing import List, Optional

# Standard NER entity types with descriptions.
# Each key appears exactly once; repeated keys in a dict literal are silently
# overwritten, so duplicates only hide future editing mistakes.
NER_ENTITY_TYPES = {
    "PERSON": "People, including fictional",
    "ORG": "Companies, agencies, institutions, etc.",
    "GPE": "Countries, cities, states",
    "LOC": "Non-GPE locations, mountain ranges, bodies of water",
    "PRODUCT": "Objects, vehicles, foods, etc. (not services)",
    "EVENT": "Named hurricanes, battles, wars, sports events, etc.",
    "WORK_OF_ART": "Titles of books, songs, etc.",
    "LAW": "Named documents made into laws",
    "LANGUAGE": "Any named language",
    "DATE": "Absolute or relative dates or periods",
    "TIME": "Times smaller than a day",
    "PERCENT": "Percentage (including '%')",
    "MONEY": "Monetary values, including unit",
    "QUANTITY": "Measurements, as of weight or distance",
    "ORDINAL": "'first', 'second', etc.",
    "CARDINAL": "Numerals that do not fall under another type",
    "NORP": "Nationalities or religious or political groups",
    "FAC": "Buildings, airports, highways, bridges, etc.",
}

# Default selected entity types (the first 5 keys, in declaration order).
DEFAULT_SELECTED_ENTITIES = list(NER_ENTITY_TYPES)[:5]

# Substrings that identify an LLM-backed model id (matched case-insensitively).
LLM_MODELS = ["gemini", "gpt", "claude"]

# Pastel background colours used to highlight entities in the tagged view.
_ENTITY_COLORS = [
    '#e3f2fd', '#e8f5e9', '#fff8e1', '#f3e5f5', '#e8eaf6',
    '#e0f7fa', '#f1f8e9', '#fce4ec', '#f5f5f5', '#fafafa',
    '#e1f5fe', '#f3e5f5', '#f1f8e9',
]


def is_llm_model(model_id: str) -> bool:
    """Return True if *model_id* contains any of the LLM_MODELS substrings.

    The comparison is case-insensitive.
    """
    lowered = model_id.lower()
    return any(llm_model in lowered for llm_model in LLM_MODELS)


def _color_for_label(label: str) -> str:
    """Map an entity label to a highlight colour that is stable across runs.

    The builtin ``hash()`` of a ``str`` is salted per process, so it would
    give the same label a different colour after every restart; CRC32 is
    deterministic.
    """
    digest = zlib.crc32(str(label).encode("utf-8"))
    return _ENTITY_COLORS[digest % len(_ENTITY_COLORS)]


def _select_entities(text: str, entities, selected_entity_types) -> List[dict]:
    """Return the non-overlapping entities of the selected types, in order.

    Entities are dicts; the label is read from ``'type'`` (falling back to
    ``'entity'``) and the display text from ``'word'`` (falling back to
    ``'text'``, then to the ``text[start:end]`` slice). Entities without both
    ``'start'`` and ``'end'`` offsets are skipped because they cannot be
    placed in the text. When two selected entities overlap, the one that
    starts first wins.
    """
    selected = set(selected_entity_types)
    positioned = [
        ent for ent in entities
        if ent.get('start') is not None and ent.get('end') is not None
    ]
    positioned.sort(key=lambda ent: ent['start'])

    chosen: List[dict] = []
    for ent in positioned:
        label = ent.get('type', ent.get('entity', ''))
        # Filter on the label that will actually be rendered, so an entity
        # that is not going to be shown can never shadow a later one.
        if label not in selected:
            continue
        if chosen and ent['start'] < chosen[-1]['end']:
            continue
        start, end = ent['start'], ent['end']
        chosen.append({
            'start': start,
            'end': end,
            'label': label,
            'text': ent.get('word') or ent.get('text') or text[start:end],
            'color': _color_for_label(label),
        })
    return chosen


def render_ner_html(text, entities, selected_entity_types=None):
    """Render *text* as HTML with the selected named entities highlighted.

    Args:
        text: The source text the entity offsets refer to.
        entities: Iterable of entity dicts with ``'start'``/``'end'`` offsets,
            a label under ``'type'`` or ``'entity'``, and optionally the
            surface form under ``'word'`` or ``'text'``.
        selected_entity_types: Labels to highlight. ``None`` selects every
            key of ``NER_ENTITY_TYPES``.

    Returns:
        An HTML string. All text taken from *text* and from the entities is
        HTML-escaped. If the text is empty/blank or there are no entities, a
        short "no entities" message is returned instead. If none of the
        entities match the selected types, the wrapper contains only the
        corresponding message.

    NOTE(review): the exact tags/styles below were reconstructed -- the
    reviewed copy of this file had the markup stripped from its string
    literals. Confirm against the UI's stylesheet before relying on the
    class names.
    """
    if not text or not text.strip() or not entities:
        return "<p>No named entities found in the text.</p>"

    if selected_entity_types is None:
        selected_entity_types = list(NER_ENTITY_TYPES)

    spans = _select_entities(text, entities, selected_entity_types)

    parts = ["<div class='ner-tagged-text'>"]
    if not spans:
        parts.append("<p>")
        parts.append("No entities of the selected types found in the text.")
        parts.append("</p>")
    else:
        cursor = 0
        for span in spans:
            start = span['start']
            if start > cursor:
                # Plain text between the previous entity and this one.
                parts.append(html_lib.escape(text[cursor:start]))
            parts.append(
                f"<span class='ner-entity' "
                f"style='background-color: {span['color']};'>"
            )
            parts.append(f"{html_lib.escape(str(span['text']))} ")
            parts.append(
                f"<span class='ner-label'>"
                f"{html_lib.escape(str(span['label']))}</span>"
            )
            parts.append("</span>")
            cursor = span['end']
        # Trailing text after the last entity. This lives inside the branch
        # that defines ``cursor``; reading it unconditionally raised
        # UnboundLocalError whenever no entity matched the selection.
        if cursor < len(text):
            parts.append(html_lib.escape(text[cursor:]))
    parts.append("</div>")
    return "".join(parts)