File size: 3,098 Bytes
6c7907e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import streamlit as st
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Model checkpoint on the Hugging Face Hub.
path_to_checkpoint = 'PranavaKailash/CyNER-2.0-DeBERTa-v3-base'

@st.cache_resource
def _load_ner_pipeline(checkpoint):
    """
    Load and cache the tokenizer, model and NER pipeline.

    Streamlit re-runs the whole script on every user interaction; without
    caching, the (large) DeBERTa checkpoint would be re-downloaded/re-loaded
    on each rerun. `st.cache_resource` keeps a single shared instance.
    """
    # NOTE(review): `max_length` is not a standard `from_pretrained` kwarg
    # (the tokenizer attribute is `model_max_length`) — kept as-is from the
    # original; confirm the intended truncation length.
    tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=True, max_length=768)
    model = AutoModelForTokenClassification.from_pretrained(checkpoint)
    return tokenizer, model, pipeline("ner", model=model, tokenizer=tokenizer)

# Preserve the original module-level globals used elsewhere in the file.
tokenizer, model, ner_pipeline = _load_ner_pipeline(path_to_checkpoint)

def tag_sentence(sentence, entities_dict):
    """
    Add HTML tags around recognized entities for visualization.

    Parameters
    ----------
    sentence : str
        The original input text.
    entities_dict : dict[str, list[dict]]
        Pipeline entities grouped by label; each entry must carry
        'start', 'end', 'entity' and 'word' keys.

    Returns
    -------
    str
        The sentence with <B-...>...</B-...> markers (wrapped in styled
        <span> elements) around each merged entity span.
    """
    # Flatten the grouped entities and order by character offset so that
    # adjacent sub-word pieces can be merged in one left-to-right pass.
    all_entities = sorted(
        [(e['start'], e['end'], e['entity'], e['word']) for ents in entities_dict.values() for e in ents],
        key=lambda x: x[0]
    )

    merged_entities = []
    current_entity = None

    for start, end, entity_type, word in all_entities:
        if current_entity is None:
            current_entity = [start, end, entity_type, word]
        elif (start == current_entity[1]
              and entity_type.startswith('I-')
              and entity_type[2:] == current_entity[2][2:]):
            # Directly-adjacent "I-" continuation of the same base label:
            # extend the current span. (Bug fix: the original compared the
            # full tag strings, so "B-X" followed by "I-X" was never merged
            # and rendered as two separate entities.)
            current_entity[1] = end
            current_entity[3] += word.replace('▁', ' ')  # strip SentencePiece marker
        else:
            merged_entities.append(tuple(current_entity))
            current_entity = [start, end, entity_type, word]

    if current_entity:
        merged_entities.append(tuple(current_entity))

    tagged_sentence = ""
    last_idx = 0

    for start, end, entity_type, _ in merged_entities:
        tagged_sentence += sentence[last_idx:start]
        # Normalise continuation tags so a merged span displays as "B-...".
        entity_tag = entity_type.replace('I-', 'B-')
        # Bug fix: the closing marker used to be "/TAG>" (missing "<"),
        # which produced malformed markup; emit a proper "</TAG>".
        tagged_sentence += (
            f"<span style='color:blue'><{entity_tag}></span>"
            f"{sentence[start:end]}"
            f"<span style='color:blue'></{entity_tag}></span>"
        )
        last_idx = end

    tagged_sentence += sentence[last_idx:]
    return tagged_sentence

def perform_ner(text):
    """
    Run the NER pipeline on *text* and prepare results for display.

    Parameters
    ----------
    text : str
        The raw user input to analyze.

    Returns
    -------
    tuple[dict, str]
        A dict of entity records grouped by their label, and the
        HTML-tagged sentence produced by `tag_sentence`.
    """
    entities = ner_pipeline(text)
    entities_dict = {}
    for entity in entities:
        # setdefault replaces the manual "if key not in dict" dance.
        # float() because pipeline scores are numpy float32, which is not
        # JSON-serializable and would break the st.json display downstream.
        entities_dict.setdefault(entity['entity'], []).append({
            "entity": entity['entity'],
            "score": float(entity['score']),
            "index": entity['index'],
            "word": entity['word'],
            "start": entity['start'],
            "end": entity['end']
        })

    tagged_sentence = tag_sentence(text, entities_dict)
    return entities_dict, tagged_sentence

# Streamlit UI
st.title("CyNER 2.0 - Named Entity Recognition")
st.write("Enter text to get named entity recognition results.")

# Bug fix: the hint text used to be passed as the widget's *value*, so
# clicking Analyze without typing ran NER on the literal string
# "Type your text here..." (it passed the .strip() emptiness check).
# Using the placeholder keyword shows the hint without making it input.
input_text = st.text_area("Input Text", placeholder="Type your text here...")

if st.button("Analyze"):
    if input_text.strip():
        entities_dict, tagged_sentence = perform_ner(input_text)

        # Display results
        st.subheader("Tagged Entities")
        # unsafe_allow_html is required to render the <span> styling
        # produced by tag_sentence.
        st.markdown(tagged_sentence, unsafe_allow_html=True)

        st.subheader("Entities and Details")
        st.json(entities_dict)
    else:
        st.warning("Please enter some text for analysis.")