Spaces:
Sleeping
Sleeping
Update NamedEntity.py
Browse files- NamedEntity.py +104 -0
NamedEntity.py
CHANGED
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
| **Abbreviation** | **Description** |
|
3 |
+
|------------------|-----------------|
|
4 |
+
| O | Outside of a named entity
|
5 |
+
| B-MISC | Beginning of a miscellaneous entity right after another miscellaneous entity
|
6 |
+
| I-MISC | Miscellaneous entity
|
7 |
+
| B-PER | Beginning of a person’s name right after another person’s name
|
8 |
+
| I-PER | Person’s name
|
9 |
+
| B-ORG | Beginning of an organization right after another organization
|
10 |
+
| I-ORG | Organization
|
11 |
+
| B-LOC | Beginning of a location right after another location
|
12 |
+
| I-LOC | Location
|
13 |
+
"""
|
14 |
+
|
15 |
+
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
|
16 |
+
|
17 |
+
from enum import Enum
|
18 |
+
|
19 |
+
|
20 |
+
class DictKey(Enum):
    """Keys of the per-entity result dictionaries returned by the NER pipeline."""

    ENTITY = 'entity'
    # Grouped pipeline output stores the label under 'entity_group' instead of 'entity'.
    ENTITY_GROUP = 'entity_group'
    SCORE = 'score'
    INDEX = 'index'
    WORD = 'word'
    START = 'start'
    END = 'end'


class NER:
    """
    Named Entity Recognition over a single piece of text.

    On construction the text is run through a token-classification pipeline and the
    results are stored on the instance:

    * ``results`` - the raw list of result dictionaries returned by the pipeline.
    * ``all_entities`` - every entity string, in order of appearance (may repeat).
    * ``unique_entities`` - the entity strings with duplicates removed, order preserved.
    * ``markdown`` / ``markdown_text`` - ``None`` until :meth:`entity_markdown` is called.
    """

    MODEL_NAME = "dslim/bert-base-NER"

    def __init__(self, text_to_analyse):
        """
        The Constructor for the Named Entity Recognition class.

        :param text_to_analyse: The text in which to find named entities.
        :raises ValueError: If the text is None or blank, or if the tokenizer, model or
            pipeline could not be created.
        """
        if text_to_analyse is None or not text_to_analyse.strip():
            raise ValueError("text_to_analyse must not be empty and must be set to a valid string value")

        self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL_NAME)
        if self.tokenizer is None:
            raise ValueError("Unable to load tokenizer from DSLIM BERT model")

        self.model = AutoModelForTokenClassification.from_pretrained(self.MODEL_NAME)
        if self.model is None:
            raise ValueError("Unable to load model from DSLIM BERT model")

        self.nlp = pipeline("ner", model=self.model, tokenizer=self.tokenizer, grouped_entities=True)
        if self.nlp is None:
            raise ValueError("Unable to load pipeline from DSLIM BERT model")

        self.text_to_analyse = text_to_analyse
        self.results = self.nlp(text_to_analyse)
        self.all_entities = self.get_list_of_entities()
        # NOTE: this rebinds ``unique_entities`` on the instance to the resulting list,
        # shadowing the method of the same name. Kept as-is because callers read the
        # attribute as a list.
        self.unique_entities = self.unique_entities()
        self.markdown = None
        self.markdown_text = None

    def get_entity_value(self, key: DictKey, item_index):
        """
        Extracts the value for a specific key (as an Enum) from a specific dictionary item in the list.

        When ``DictKey.ENTITY`` is requested and the item has no ``'entity'`` value, the
        ``'entity_group'`` value is returned instead, since grouped results store the
        label under that key.

        :param key: DictKey Enum representing the key for which the value is required.
        :param item_index: Index of the item in the list to process. Negative indexes count
            from the end of the list.
        :return: Value for the given key in the specified dictionary item, or None if key is not found.
        :raises ValueError: If item_index is outside the bounds of the results list.
        """
        count = len(self.results)
        # Check both bounds so a too-negative index raises ValueError, not IndexError.
        if not -count <= item_index < count:
            raise ValueError("The supplied list index is out of bounds")

        item = self.results[item_index]
        value = item.get(key.value)
        if value is None and key is DictKey.ENTITY:
            value = item.get(DictKey.ENTITY_GROUP.value)
        return value

    def get_list_of_entities(self):
        """
        Returns a list of all entities in the original text, in the order they appear. There may be repeated
        entities in this list.

        :return: A list of all entities in the original text.
        """
        word_key = DictKey.WORD.value
        return [item.get(word_key) for item in self.results]

    def entity_markdown(self):
        """
        Build a markdown version of the analysed text with every entity coloured red.

        The result is stored on the instance rather than returned: ``markdown`` holds the
        text with each entity wrapped in a red ``<span>``, and ``markdown_text`` holds the
        same text with two spaces inserted before every newline.
        """
        import re

        # De-duplicate (wrapping the same entity twice would nest the spans) and try the
        # longest entities first so "New York" is not split by a separate "York" entity.
        entities = sorted(
            dict.fromkeys(entity for entity in self.get_list_of_entities() if entity),
            key=len,
            reverse=True,
        )

        if entities:
            # A single pass over the original text never re-scans the inserted markup.
            pattern = re.compile("|".join(re.escape(entity) for entity in entities))
            self.markdown = pattern.sub(
                lambda match: f'<span style = "color:red;">{match.group(0)}</span>',
                self.text_to_analyse,
            )
        else:
            self.markdown = self.text_to_analyse

        # Two spaces at the end of a line force a markdown line break.
        self.markdown_text = self.markdown.replace('\n', '  \n')

    def unique_entities(self):
        """
        Return a list of all unique entities in the original text.

        :return: A list of unique entities, in order of first appearance.
        """
        # dict keys are unique and keep insertion order.
        return list(dict.fromkeys(self.all_entities))
|