|
|
|
from typing import List |
|
|
|
|
|
# Named-entity labels known to this module, mapped to a short human-readable
# description of what each label covers.
# NOTE(review): the label set looks like the OntoNotes-style scheme used by
# spaCy — confirm against the models actually in use.
#
# Insertion order matters: DEFAULT_SELECTED_ENTITIES takes the first five keys.
# Each label is listed exactly once; a repeated key in a dict literal is
# silently overwritten, which hides copy/paste mistakes.
NER_ENTITY_TYPES = {
    "PERSON": "People, including fictional",
    "ORG": "Companies, agencies, institutions, etc.",
    "GPE": "Countries, cities, states",
    "LOC": "Non-GPE locations, mountain ranges, bodies of water",
    "PRODUCT": "Objects, vehicles, foods, etc. (not services)",
    "EVENT": "Named hurricanes, battles, wars, sports events, etc.",
    "WORK_OF_ART": "Titles of books, songs, etc.",
    "LAW": "Named documents made into laws",
    "LANGUAGE": "Any named language",
    "DATE": "Absolute or relative dates or periods",
    "TIME": "Times smaller than a day",
    "PERCENT": "Percentage (including '%')",
    "MONEY": "Monetary values, including unit",
    "QUANTITY": "Measurements, as of weight or distance",
    "ORDINAL": "'first', 'second', etc.",
    "CARDINAL": "Numerals that do not fall under another type",
    "NORP": "Nationalities or religious or political groups",
    "FAC": "Buildings, airports, highways, bridges, etc.",
}

# Labels selected by default: the first five keys of NER_ENTITY_TYPES
# (PERSON, ORG, GPE, LOC, PRODUCT).
DEFAULT_SELECTED_ENTITIES = list(NER_ENTITY_TYPES)[:5]
|
|
|
# Substrings that mark a model identifier as LLM-based; matched
# case-insensitively by is_llm_model().
LLM_MODELS = ["gemini", "gpt", "claude"]


def is_llm_model(model_id: str) -> bool:
    """Return True if *model_id* names an LLM-based model.

    A model counts as LLM-based when its identifier contains any entry of
    ``LLM_MODELS`` as a substring, ignoring case.

    Args:
        model_id: Model identifier. ``None`` or an empty string is treated as
            "not an LLM" instead of raising.

    Returns:
        True when one of the ``LLM_MODELS`` markers occurs in *model_id*,
        otherwise False.
    """
    if not model_id:
        return False
    # Lower-case once instead of once per marker.
    lowered = model_id.lower()
    return any(marker in lowered for marker in LLM_MODELS)
|
|
|
|
|
def _coerce_offset(value):
    """Return *value* as a non-negative int offset, or None if it is unusable."""
    import operator

    try:
        offset = operator.index(value)
    except TypeError:
        # None, floats, strings, ... cannot be used to slice the text.
        return None
    return offset if offset >= 0 else None


def render_ner_html(text, entities, selected_entity_types=None):
    """Render *text* as an HTML fragment with the selected entities highlighted.

    Args:
        text: The analysed source text. ``None``, empty or whitespace-only
            text produces the "no named entities" placeholder.
        entities: Iterable of dicts describing entities. Keys read here:
            ``start``/``end`` (character offsets into *text*), ``type`` or
            ``entity`` (the label; ``type`` wins when both are set) and
            ``word`` or ``text`` (the surface form to display).
        selected_entity_types: Labels to highlight. ``None`` selects every
            key of ``NER_ENTITY_TYPES``; an empty collection selects nothing.

    Returns:
        An HTML string. All text taken from *text* and from the entity dicts
        is HTML-escaped. Entities are rendered in order of ``start``; an
        entity that overlaps an already accepted one is dropped. Entities
        with missing or invalid offsets are skipped.
    """
    import html as html_lib
    import zlib

    if not text or not text.strip() or not entities:
        return "<div style='text-align: center; color: #666; padding: 20px;'>No named entities found in the text.</div>"

    if selected_entity_types is None:
        selected_entity_types = list(NER_ENTITY_TYPES.keys())
    # Set membership is O(1); the original scanned a list for every entity.
    wanted = set(selected_entity_types)

    palette = (
        '#e3f2fd', '#e8f5e9', '#fff8e1', '#f3e5f5', '#e8eaf6', '#e0f7fa',
        '#f1f8e9', '#fce4ec', '#f5f5f5', '#fafafa', '#e1f5fe', '#f3e5f5', '#f1f8e9',
    )

    # Resolve the label once and filter on it once, so an entity that will not
    # be displayed can never block a displayable one in the overlap check.
    usable = []
    for entity in entities:
        label = str(entity.get('type') or entity.get('entity') or '')
        if label not in wanted:
            continue
        start = _coerce_offset(entity.get('start'))
        end = _coerce_offset(entity.get('end'))
        if start is None or end is None or end < start:
            # Without valid offsets the entity cannot be placed in the text.
            continue
        usable.append((start, end, label, entity))
    # list.sort is stable: entities sharing a start keep their input order.
    usable.sort(key=lambda item: item[0])

    spans = []
    for start, end, label, entity in usable:
        if spans and start < spans[-1][1]:
            continue  # overlaps the previously accepted span
        # Fall back to the source slice so the covered text is never lost.
        shown = entity.get('word') or entity.get('text') or text[start:end]
        spans.append((start, end, label, str(shown)))

    parts = ["<div class='ner-highlight' style='line-height:1.6;padding:15px;border:1px solid #e0e0e0;border-radius:4px;background:#f9f9f9;white-space:pre-wrap;'>"]
    if not spans:
        parts.append("<div style='text-align: center; color: #666; padding: 20px;'>")
        parts.append("No entities of the selected types found in the text.")
        parts.append("</div>")
    else:
        cursor = 0
        for start, end, label, shown in spans:
            if start > cursor:
                parts.append(html_lib.escape(text[cursor:start]))
            # crc32 is stable across processes; the built-in hash() of a str
            # is salted per interpreter run, so colours changed on restart.
            color = palette[zlib.crc32(label.encode('utf-8')) % len(palette)]
            parts.append(f"<span style='background:{color};border-radius:3px;padding:2px 4px;margin:0 1px;border:1px solid rgba(0,0,0,0.1);'>")
            parts.append(f"{html_lib.escape(shown)} ")
            parts.append(f"<span style='font-size:0.8em;font-weight:bold;color:#555;border-radius:2px;padding:0 2px;background:rgba(255,255,255,0.7);'>{html_lib.escape(label)}</span>")
            parts.append("</span>")
            cursor = end
        if cursor < len(text):
            parts.append(html_lib.escape(text[cursor:]))
    parts.append("</div>")
    return "".join(parts)
|
|