Spaces:

namfam
/

Ling

Running

File size: 4,330 Bytes

ea99abb

# NER helpers and constants
from typing import List

# Standard NER entity types with descriptions
# Maps each supported entity label to a short human-readable description.
# Each key is listed exactly once: a repeated key in a dict literal is
# silently collapsed (the last value wins, the first position is kept), which
# hides the real number of entries from the reader.
NER_ENTITY_TYPES = {
    "PERSON": "People, including fictional",
    "ORG": "Companies, agencies, institutions, etc.",
    "GPE": "Countries, cities, states",
    "LOC": "Non-GPE locations, mountain ranges, bodies of water",
    "PRODUCT": "Objects, vehicles, foods, etc. (not services)",
    "EVENT": "Named hurricanes, battles, wars, sports events, etc.",
    "WORK_OF_ART": "Titles of books, songs, etc.",
    "LAW": "Named documents made into laws",
    "LANGUAGE": "Any named language",
    "DATE": "Absolute or relative dates or periods",
    "TIME": "Times smaller than a day",
    "PERCENT": "Percentage (including '%')",
    "MONEY": "Monetary values, including unit",
    "QUANTITY": "Measurements, as of weight or distance",
    "ORDINAL": "'first', 'second', etc.",
    "CARDINAL": "Numerals that do not fall under another type",
    "NORP": "Nationalities or religious or political groups",
    "FAC": "Buildings, airports, highways, bridges, etc.",
}

# Default selected entity types: the first 5 keys in declaration order
# (PERSON, ORG, GPE, LOC, PRODUCT).
DEFAULT_SELECTED_ENTITIES = list(NER_ENTITY_TYPES)[:5]

# Substrings that mark a model id as LLM-backed (matched case-insensitively
# by is_llm_model below).
LLM_MODELS = ["gemini", "gpt", "claude"]


def is_llm_model(model_id: str) -> bool:
    """Return True when *model_id* contains any entry of ``LLM_MODELS``.

    The comparison is a case-insensitive substring test, so ids such as
    ``"GPT-4"`` or ``"google/gemini-pro"`` match.
    """
    lowered_id = model_id.lower()
    for family in LLM_MODELS:
        if family in lowered_id:
            return True
    return False

# Render NER HTML for tagged view
def render_ner_html(text, entities, selected_entity_types=None):
    """Render *text* as an HTML fragment with selected entities highlighted.

    Args:
        text: Source string that the entity offsets index into.
        entities: Sequence of dicts. Each displayed entity must provide
            ``start`` and ``end`` character offsets; its label is read from
            ``type`` (falling back to ``entity``) and its displayed text from
            ``word`` (falling back to ``text``).
        selected_entity_types: Labels to highlight. ``None`` selects every
            key of ``NER_ENTITY_TYPES``.

    Returns:
        An HTML string. If *text* is empty/blank or *entities* is empty, a
        standalone "no named entities" message is returned. If entities exist
        but none carries a selected label, the highlight container holding a
        "no entities of the selected types" message is returned. Otherwise
        the escaped text is returned with each kept entity wrapped in a
        coloured ``<span>`` followed by its label.

    Raises:
        KeyError: If an entity with a selected label lacks ``start``/``end``.
    """
    import html as html_lib
    import zlib

    if not text or not text.strip() or not entities:
        return "<div style='text-align: center; color: #666; padding: 20px;'>No named entities found in the text.</div>"
    if selected_entity_types is None:
        selected_entity_types = list(NER_ENTITY_TYPES.keys())
    COLORS = [
        '#e3f2fd', '#e8f5e9', '#fff8e1', '#f3e5f5', '#e8eaf6', '#e0f7fa',
        '#f1f8e9', '#fce4ec', '#f5f5f5', '#fafafa', '#e1f5fe', '#f3e5f5', '#f1f8e9'
    ]

    # Walk entities in start order, keeping only those whose displayed label
    # is selected. Filtering on the same label that is rendered ensures a
    # hidden entity can never suppress a visible one that overlaps it.
    kept = []
    for e in sorted(entities, key=lambda e: e.get('start', 0)):
        label = e.get('type', e.get('entity', ''))
        if label not in selected_entity_types:
            continue
        # Drop an entity that begins before the previously kept one ends.
        if kept and e['start'] < kept[-1]['end']:
            continue
        # crc32 is stable across interpreter runs, unlike the built-in
        # hash() of a str, so a label always maps to the same colour.
        color = COLORS[zlib.crc32(label.encode('utf-8')) % len(COLORS)]
        kept.append({
            'start': e['start'],
            'end': e['end'],
            'label': label,
            'text': e.get('word', e.get('text', '')),
            'color': color
        })

    parts = ["<div class='ner-highlight' style='line-height:1.6;padding:15px;border:1px solid #e0e0e0;border-radius:4px;background:#f9f9f9;white-space:pre-wrap;'>"]
    if not kept:
        parts.append("<div style='text-align: center; color: #666; padding: 20px;'>")
        parts.append("No entities of the selected types found in the text.")
        parts.append("</div>")
    else:
        last_pos = 0
        for entity in kept:
            start = entity['start']
            end = entity['end']
            # Emit the plain text between the previous entity and this one.
            if start > last_pos:
                parts.append(html_lib.escape(text[last_pos:start]))
            parts.append(f"<span style='background:{entity['color']};border-radius:3px;padding:2px 4px;margin:0 1px;border:1px solid rgba(0,0,0,0.1);'>")
            parts.append(f"{html_lib.escape(entity['text'])} ")
            parts.append(f"<span style='font-size:0.8em;font-weight:bold;color:#555;border-radius:2px;padding:0 2px;background:rgba(255,255,255,0.7);'>{html_lib.escape(entity['label'])}</span>")
            parts.append("</span>")
            last_pos = end
        # Trailing text after the last entity. This lives inside the branch
        # that defines last_pos, so the "nothing selected" path cannot hit an
        # unbound variable.
        if last_pos < len(text):
            parts.append(html_lib.escape(text[last_pos:]))
    parts.append("</div>")
    return "".join(parts)