Ling / utils /ner_helpers.py
Nam Fam
update files
ea99abb
# NER helpers and constants
from typing import List
# Standard NER entity types mapped to a short human-readable description.
# Insertion order matters: DEFAULT_SELECTED_ENTITIES below is derived from the
# first five keys, so new types should be appended rather than inserted.
NER_ENTITY_TYPES = {
    "PERSON": "People, including fictional",
    "ORG": "Companies, agencies, institutions, etc.",
    "GPE": "Countries, cities, states",
    "LOC": "Non-GPE locations, mountain ranges, bodies of water",
    "PRODUCT": "Objects, vehicles, foods, etc. (not services)",
    "EVENT": "Named hurricanes, battles, wars, sports events, etc.",
    "WORK_OF_ART": "Titles of books, songs, etc.",
    "LAW": "Named documents made into laws",
    "LANGUAGE": "Any named language",
    "DATE": "Absolute or relative dates or periods",
    "TIME": "Times smaller than a day",
    "PERCENT": "Percentage (including '%')",
    "MONEY": "Monetary values, including unit",
    "QUANTITY": "Measurements, as of weight or distance",
    "ORDINAL": "'first', 'second', etc.",
    "CARDINAL": "Numerals that do not fall under another type",
    "NORP": "Nationalities or religious or political groups",
    "FAC": "Buildings, airports, highways, bridges, etc.",
}

# Entity types selected by default: the first five keys of NER_ENTITY_TYPES.
DEFAULT_SELECTED_ENTITIES = list(NER_ENTITY_TYPES)[:5]
# Substrings that mark a model identifier as belonging to an LLM family.
LLM_MODELS = ["gemini", "gpt", "claude"]


def is_llm_model(model_id: str) -> bool:
    """Return True when *model_id* contains any LLM_MODELS name, ignoring case."""
    lowered_id = model_id.lower()
    for family in LLM_MODELS:
        if family in lowered_id:
            return True
    return False
# Render NER HTML for tagged view
def render_ner_html(text, entities, selected_entity_types=None):
    """Render *text* as HTML with the selected named entities highlighted.

    Args:
        text: The source text that the entity offsets refer to.
        entities: Iterable of dicts. Each dict is read for ``start`` and
            ``end`` (character offsets into *text*), a label under ``type``
            (falling back to ``entity``), and optionally the display string
            under ``word`` (falling back to ``text``).
        selected_entity_types: Labels to highlight. ``None`` selects every
            key of ``NER_ENTITY_TYPES``.

    Returns:
        An HTML string. When *text* is empty/blank or *entities* is empty, a
        standalone "no entities" message is returned; otherwise a wrapper
        ``<div>`` containing either the highlighted text or a message saying
        that no entity of the selected types was found.
    """
    import html as html_lib
    import zlib

    if not text or not text.strip() or not entities:
        return "<div style='text-align: center; color: #666; padding: 20px;'>No named entities found in the text.</div>"
    if selected_entity_types is None:
        selected_entity_types = list(NER_ENTITY_TYPES.keys())

    COLORS = [
        '#e3f2fd', '#e8f5e9', '#fff8e1', '#f3e5f5', '#e8eaf6', '#e0f7fa',
        '#f1f8e9', '#fce4ec', '#f5f5f5', '#fafafa', '#e1f5fe', '#f3e5f5', '#f1f8e9'
    ]

    # Entities without usable offsets cannot be placed in the text; skip them
    # instead of failing on the sort or on the slice below.
    located = [
        e for e in entities
        if e.get('start') is not None and e.get('end') is not None
    ]
    located.sort(key=lambda e: e['start'])

    filtered_entities = []
    for e in located:
        label = e.get('type', e.get('entity', ''))
        # Check the label before the overlap test so that an entity which is
        # not going to be rendered can never block a later one.
        if label not in selected_entity_types:
            continue
        # Keep the earliest entity when two spans overlap.
        if filtered_entities and e['start'] < filtered_entities[-1]['end']:
            continue
        # Prefer the string supplied with the entity; otherwise show the
        # slice of the source text that the offsets cover.
        surface = e.get('word', e.get('text')) or text[e['start']:e['end']]
        # crc32 is stable across interpreter runs, unlike the built-in hash()
        # of a str, so a given label always maps to the same colour.
        color = COLORS[zlib.crc32(label.encode('utf-8')) % len(COLORS)]
        filtered_entities.append({
            'start': e['start'],
            'end': e['end'],
            'label': label,
            'text': surface,
            'color': color
        })

    html = ["<div class='ner-highlight' style='line-height:1.6;padding:15px;border:1px solid #e0e0e0;border-radius:4px;background:#f9f9f9;white-space:pre-wrap;'>"]
    if not filtered_entities:
        html.append("<div style='text-align: center; color: #666; padding: 20px;'>")
        html.append("No entities of the selected types found in the text.")
        html.append("</div>")
    else:
        last_pos = 0
        for entity in filtered_entities:
            start = entity['start']
            end = entity['end']
            # Plain text between the previous entity and this one.
            if start > last_pos:
                html.append(html_lib.escape(text[last_pos:start]))
            html.append(f"<span style='background:{entity['color']};border-radius:3px;padding:2px 4px;margin:0 1px;border:1px solid rgba(0,0,0,0.1);'>")
            html.append(f"{html_lib.escape(entity['text'])} ")
            html.append(f"<span style='font-size:0.8em;font-weight:bold;color:#555;border-radius:2px;padding:0 2px;background:rgba(255,255,255,0.7);'>{html_lib.escape(entity['label'])}</span>")
            html.append("</span>")
            last_pos = end
        # Trailing text after the last entity.
        if last_pos < len(text):
            html.append(html_lib.escape(text[last_pos:]))
    html.append("</div>")
    return "".join(html)