wibberlet committed on
Commit
94be5fc
·
1 Parent(s): 9257d34

Update NamedEntity.py

Browse files
Files changed (1) hide show
  1. NamedEntity.py +104 -0
NamedEntity.py CHANGED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ | **Abbreviation** | **Description** |
3
+ |------------------|-----------------|
4
+ | O | Outside of a named entity |
5
+ | B-MISC | Beginning of a miscellaneous entity right after another miscellaneous entity |
6
+ | I-MISC | Miscellaneous entity |
7
+ | B-PER | Beginning of a person’s name right after another person’s name |
8
+ | I-PER | Person’s name |
9
+ | B-ORG | Beginning of an organization right after another organization |
10
+ | I-ORG | Organization |
11
+ | B-LOC | Beginning of a location right after another location |
12
+ | I-LOC | Location |
13
+ """
14
+
15
+ from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
16
+
17
+ from enum import Enum
18
+
19
+
20
class DictKey(Enum):
    """
    Keys of the result dictionaries produced by the token-classification pipeline.

    Each member's value is the literal dictionary key. ENTITY_GROUP is the key under which
    the pipeline reports the entity type when it aggregates tokens into whole entities;
    ENTITY and INDEX are the per-token keys used when no aggregation is requested.
    """
    ENTITY = 'entity'
    SCORE = 'score'
    INDEX = 'index'
    WORD = 'word'
    START = 'start'
    END = 'end'
    # Appended last so the iteration order of the pre-existing members is unchanged.
    ENTITY_GROUP = 'entity_group'
27
+
28
+
29
class NER:
    """
    Named entity recognition over a single piece of text.

    Constructing an instance loads the tokenizer and model, runs the pipeline once and stores:

    * ``results`` - the list of result dictionaries returned by the pipeline,
    * ``all_entities`` - every entity word in order of appearance (may contain repeats),
    * ``unique_entities`` - the same words with repeats removed, first occurrence kept,
    * ``markdown`` / ``markdown_text`` - None until :meth:`entity_markdown` is called.
    """

    def __init__(self, text_to_analyse, model_name="dslim/bert-base-NER"):
        """
        The Constructor for the Named Entity Recognition class.
        :param text_to_analyse: The text in which to find named entities.
        :param model_name: Hugging Face model identifier to load the tokenizer and model from.
        :raises ValueError: If text_to_analyse is not a string or contains only whitespace.
        """
        if not isinstance(text_to_analyse, str) or not text_to_analyse.strip():
            raise ValueError("text_to_analyse must not be empty and must be set to a valid string value")

        # The Hugging Face loaders report failure by raising, never by returning None,
        # so their exceptions are allowed to propagate unchanged.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForTokenClassification.from_pretrained(model_name)

        # aggregation_strategy="simple" is the documented replacement for the deprecated
        # grouped_entities=True: tokens belonging to one entity are merged into one result.
        self.nlp = pipeline("ner", model=self.model, tokenizer=self.tokenizer, aggregation_strategy="simple")

        self.text_to_analyse = text_to_analyse
        self.results = self.nlp(text_to_analyse)
        self.all_entities = self.get_list_of_entities()
        # Stored as a plain list attribute; the method that builds it has a different name
        # so that the attribute no longer overwrites (and disables) the method.
        self.unique_entities = self.get_unique_entities()
        self.markdown = None
        self.markdown_text = None

    def get_entity_value(self, key: "DictKey", item_index):
        """
        Extracts the value for a specific key (as an Enum) from a specific dictionary item in the list.
        :param key: DictKey Enum representing the key for which the value is required.
        :param item_index: Index of the item in the list to process. Negative indices count from the end.
        :return: Value for the given key in the specified dictionary item, or None if key is not found.
        :raises ValueError: If item_index falls outside the list of results.
        """
        count = len(self.results)
        # Check both ends so that an over-negative index raises the same ValueError
        # as an over-large one instead of leaking an IndexError.
        if not -count <= item_index < count:
            raise ValueError("The supplied list index is out of bounds")

        item = self.results[item_index]
        value = item.get(key.value)
        if value is None and key.value == 'entity':
            # Aggregated results label the entity type as 'entity_group' rather than 'entity'.
            value = item.get('entity_group')
        return value

    def get_list_of_entities(self):
        """
        Returns a list of all entities in the original text, in the order they appear. There may be repeated
        entities in this list.
        :return: A list of all entities in the original text.
        """
        # One entry per result dictionary: the value stored under the 'word' key.
        return [item.get(DictKey.WORD.value) for item in self.results]

    def entity_markdown(self):
        """
        Build a markdown version of the analysed text with every entity word coloured red.

        The result is stored in ``self.markdown``; ``self.markdown_text`` holds the same text
        with markdown hard line breaks in place of plain newlines. Nothing is returned.
        """
        import re

        # De-duplicate and drop empty words, then try longer words first so that
        # "New York" is wrapped as a whole rather than only its "York" part.
        words = sorted({word for word in self.get_list_of_entities() if word},
                       key=lambda word: (-len(word), word))

        if words:
            # A single pass over the original text guarantees that markup inserted for one
            # entity is never scanned again, so spans can not end up nested.
            pattern = re.compile("|".join(re.escape(word) for word in words))
            self.markdown = pattern.sub(
                lambda match: f'<span style = "color:red;">{match.group(0)}</span>',
                self.text_to_analyse,
            )
        else:
            self.markdown = self.text_to_analyse

        # Two spaces at the end of line for markdown new line
        self.markdown_text = self.markdown.replace('\n', '  \n')

    def get_unique_entities(self):
        """
        Return a list of all unique entities in the original text.
        :return: A list of unique entities, in order of first appearance.
        """
        # dict keys are unique and keep insertion order, which gives an ordered de-duplication.
        return list(dict.fromkeys(self.all_entities))