Spaces:

namfam
/

Ling

Running

Ling / utils /ner_helpers.py

Nam Fam

update files

ea99abb 8 days ago

4.33 kB

	# NER helpers and constants
	from typing import List

	# Standard NER entity types mapped to a short human-readable description.
	# Insertion order matters: DEFAULT_SELECTED_ENTITIES below takes the first
	# five keys, so new entries should be appended rather than inserted.
	NER_ENTITY_TYPES = {
	    "PERSON": "People, including fictional",
	    "ORG": "Companies, agencies, institutions, etc.",
	    "GPE": "Countries, cities, states",
	    "LOC": "Non-GPE locations, mountain ranges, bodies of water",
	    "PRODUCT": "Objects, vehicles, foods, etc. (not services)",
	    "EVENT": "Named hurricanes, battles, wars, sports events, etc.",
	    "WORK_OF_ART": "Titles of books, songs, etc.",
	    "LAW": "Named documents made into laws",
	    "LANGUAGE": "Any named language",
	    "DATE": "Absolute or relative dates or periods",
	    "TIME": "Times smaller than a day",
	    "PERCENT": "Percentage (including '%')",
	    "MONEY": "Monetary values, including unit",
	    "QUANTITY": "Measurements, as of weight or distance",
	    "ORDINAL": "'first', 'second', etc.",
	    "CARDINAL": "Numerals that do not fall under another type",
	    "NORP": "Nationalities or religious or political groups",
	    "FAC": "Buildings, airports, highways, bridges, etc.",
	}

	# Entity types selected by default: the first five keys of NER_ENTITY_TYPES
	# (PERSON, ORG, GPE, LOC, PRODUCT).
	DEFAULT_SELECTED_ENTITIES = list(NER_ENTITY_TYPES)[:5]

	# Substrings that mark a model id as LLM-backed; is_llm_model() matches
	# them case-insensitively anywhere inside the id.
	LLM_MODELS = ["gemini", "gpt", "claude"]


	def is_llm_model(model_id: str) -> bool:
	    """Return True if ``model_id`` contains any LLM_MODELS marker.

	    The comparison is a case-insensitive substring test, so
	    ``"openai/GPT-4o"`` matches the ``"gpt"`` marker.

	    Args:
	        model_id: Identifier of the model to classify.

	    Returns:
	        True when any marker occurs in ``model_id``; False otherwise,
	        including when ``model_id`` is empty, None, or not a string.
	    """
	    # Guard against a missing selection (None) instead of raising
	    # AttributeError on .lower().
	    if not isinstance(model_id, str):
	        return False
	    lowered = model_id.lower()  # lowercase once, not once per marker
	    return any(marker in lowered for marker in LLM_MODELS)

	# Render NER HTML for tagged view
	def render_ner_html(text, entities, selected_entity_types=None):
	    """Render ``text`` as an HTML fragment with named entities highlighted.

	    Args:
	        text: The source string the entity offsets refer to.
	        entities: Iterable of dicts. Each dict is read for ``start`` and
	            ``end`` (character offsets into ``text``), a label under
	            ``type`` or, failing that, ``entity``, and optionally the
	            display string under ``word`` or ``text``.
	        selected_entity_types: Labels to highlight. ``None`` selects every
	            key of ``NER_ENTITY_TYPES``.

	    Returns:
	        An HTML string. If ``text`` is empty/blank or ``entities`` is empty,
	        a centered "No named entities found" message is returned. Otherwise
	        a ``<div class='ner-highlight'>`` wrapper is returned containing
	        either the escaped text with highlighted spans, or a "No entities of
	        the selected types" message when nothing matches the selection.

	    Entities are processed in order of ``start``; an entity that begins
	    before the previously kept entity ends is dropped. Entities without
	    usable offsets (missing ``start``/``end`` or ``end < start``) are
	    skipped rather than raising.
	    """
	    import html as html_lib
	    import zlib

	    if not text or not text.strip() or not entities:
	        return "<div style='text-align: center; color: #666; padding: 20px;'>No named entities found in the text.</div>"
	    if selected_entity_types is None:
	        selected_entity_types = list(NER_ENTITY_TYPES.keys())
	    selected = set(selected_entity_types)

	    palette = [
	        '#e3f2fd', '#e8f5e9', '#fff8e1', '#f3e5f5', '#e8eaf6', '#e0f7fa',
	        '#f1f8e9', '#fce4ec', '#f5f5f5', '#fafafa', '#e1f5fe', '#f3e5f5', '#f1f8e9'
	    ]

	    # Resolve each entity's label once and keep only selected entities that
	    # carry usable offsets. Filtering before the overlap pass ensures an
	    # unselected entity can never suppress a selected one it overlaps.
	    candidates = []
	    for raw in entities:
	        label = raw.get('type') or raw.get('entity') or ''
	        if label not in selected:
	            continue
	        start, end = raw.get('start'), raw.get('end')
	        if start is None or end is None or end < start:
	            continue
	        candidates.append((start, end, label, raw))
	    candidates.sort(key=lambda item: item[0])

	    # Greedy left-to-right pass: keep an entity only if it starts at or
	    # after the end of the last kept one.
	    spans = []
	    for start, end, label, raw in candidates:
	        if spans and start < spans[-1][1]:
	            continue
	        # Fall back to the source slice when the entity has no display text.
	        shown = raw.get('word') or raw.get('text') or text[start:end]
	        # crc32 is stable across processes, unlike built-in hash() on str
	        # (salted per interpreter run), so a label keeps its color.
	        color = palette[zlib.crc32(label.encode('utf-8')) % len(palette)]
	        spans.append((start, end, label, shown, color))

	    parts = ["<div class='ner-highlight' style='line-height:1.6;padding:15px;border:1px solid #e0e0e0;border-radius:4px;background:#f9f9f9;white-space:pre-wrap;'>"]
	    if not spans:
	        parts.append("<div style='text-align: center; color: #666; padding: 20px;'>")
	        parts.append("No entities of the selected types found in the text.")
	        parts.append("</div>")
	    else:
	        cursor = 0  # index in ``text`` up to which output has been emitted
	        for start, end, label, shown, color in spans:
	            if start > cursor:
	                parts.append(html_lib.escape(text[cursor:start]))
	            parts.append(f"<span style='background:{color};border-radius:3px;padding:2px 4px;margin:0 1px;border:1px solid rgba(0,0,0,0.1);'>")
	            parts.append(f"{html_lib.escape(shown)} ")
	            parts.append(f"<span style='font-size:0.8em;font-weight:bold;color:#555;border-radius:2px;padding:0 2px;background:rgba(255,255,255,0.7);'>{html_lib.escape(label)}</span>")
	            parts.append("</span>")
	            cursor = end
	        if cursor < len(text):
	            parts.append(html_lib.escape(text[cursor:]))
	    parts.append("</div>")
	    return "".join(parts)