Spaces:

wibberlet
/

NamedEntities

Sleeping

App Files Files Community

wibberlet committed on Jan 8, 2024

Commit

94be5fc

1 Parent(s): 9257d34

Update NamedEntity.py

Browse files

Files changed (1) hide show

NamedEntity.py +104 -0

NamedEntity.py CHANGED Viewed

	@@ -0,0 +1,104 @@

+"""
+| **Abbreviation** | **Description** |
+|------------------|-----------------|
+| O                | Outside of a named entity |
+| B-MIS            | Beginning of a miscellaneous entity right after another miscellaneous entity |
+| I-MIS            | Miscellaneous entity |
+| B-PER            | Beginning of a person’s name right after another person’s name |
+| I-PER            | Person’s name |
+| B-ORG            | Beginning of an organization right after another organization |
+| I-ORG            | Organization |
+| B-LOC            | Beginning of a location right after another location |
+| I-LOC            | Location |
+"""
+import re
+from enum import Enum
+from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
+class DictKey(Enum):
+    """
+    Names of the keys looked up in the result dictionaries held by NER.
+    Each member's value is the literal dictionary key string.
+    """
+    # NOTE(review): presumably these mirror the keys emitted by the transformers "ner" pipeline;
+    # confirm against the pipeline output. Only WORD is referenced elsewhere in this file.
+    ENTITY = "entity"
+    SCORE = "score"
+    INDEX = "index"
+    WORD = "word"  # read by NER.get_list_of_entities to collect the entity text
+    START = "start"
+    END = "end"
+class NER:
+    """
+    Named Entity Recognition over one piece of text using the "dslim/bert-base-NER" model.
+    The tokenizer, model and pipeline are loaded and the pipeline is run once in the constructor;
+    the outcome is exposed through the results, all_entities and unique_entities attributes.
+    """
+    # Markup wrapped around every highlighted entity by entity_markdown().
+    _HIGHLIGHT_TEMPLATE = '<span style = "color:red;">{}</span>'
+    def __init__(self, text_to_analyse):
+        """
+        The Constructor for the Named Entity Recognition class.
+        :param text_to_analyse: The text in which to find named entities.
+        :raises ValueError: If text_to_analyse is None, not a string or blank, or if the tokenizer,
+            model or pipeline is None after loading.
+        """
+        # Non-string input is rejected with the same ValueError instead of an AttributeError from .strip().
+        if not isinstance(text_to_analyse, str) or not text_to_analyse.strip():
+            raise ValueError("text_to_analyse must not be empty and must be set to a valid string value")
+        self.tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
+        if self.tokenizer is None:
+            raise ValueError("Unable to load tokenizer from DSLIM BERT model")
+        self.model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
+        if self.model is None:
+            raise ValueError("Unable to load model from DSLIM BERT model")
+        # NOTE(review): grouped_entities appears to be deprecated in newer transformers releases in
+        # favour of aggregation_strategy - confirm against the installed version before changing it.
+        self.nlp = pipeline("ner", model=self.model, tokenizer=self.tokenizer, grouped_entities=True)
+        if self.nlp is None:
+            raise ValueError("Unable to load pipeline from DSLIM BERT model")
+        self.text_to_analyse = text_to_analyse
+        self.results = self.nlp(text_to_analyse)
+        self.all_entities = self.get_list_of_entities()
+        # Kept as a plain list attribute. On the instance this attribute shadows the unique_entities()
+        # method, so the list is built through get_unique_entities(), which stays callable afterwards.
+        self.unique_entities = self.get_unique_entities()
+        self.markdown = None
+        self.markdown_text = None
+    def get_entity_value(self, key: DictKey, item_index):
+        """
+        Extracts the value for a specific key (as an Enum) from a specific dictionary item in the list.
+        :param key: DictKey Enum representing the key for which the value is required.
+        :param item_index: Index of the item in the list to process. Negative values count from the end,
+            as with normal list indexing.
+        :return: Value for the given key in the specified dictionary item, or None if key is not found.
+        :raises ValueError: If item_index does not refer to an item of the results list.
+        """
+        count = len(self.results)
+        # Check both ends so a far-negative index raises the documented ValueError, not an IndexError.
+        if not -count <= item_index < count:
+            raise ValueError("The supplied list index is out of bounds")
+        return self.results[item_index].get(key.value)
+    def get_list_of_entities(self):
+        """
+        Returns a list of all entities in the original text, in the order they appear. There may be repeated
+        entities in this list.
+        :return: A list of all entities in the original text.
+        """
+        # One entry per result dictionary: the value stored under the 'word' key.
+        return [item.get(DictKey.WORD.value) for item in self.results]
+    def entity_markdown(self):
+        """
+        Build a markdown version of the analysed text in which every occurrence of an entity is red.
+        The highlighted text is stored in self.markdown, and the same text with markdown line breaks
+        is stored in self.markdown_text. Nothing is returned.
+        """
+        # Drop empty/None words: an empty alternative would match at every position of the text.
+        entities = [entity for entity in self.get_unique_entities() if entity]
+        self.markdown = self.text_to_analyse
+        if entities:
+            # A single regex pass, longest entity first, wraps each occurrence exactly once. Repeated
+            # entities and entities contained in others ("York" in "New York") no longer produce nested
+            # spans, and the inserted markup itself is never rescanned.
+            entities.sort(key=len, reverse=True)
+            pattern = re.compile("|".join(re.escape(entity) for entity in entities))
+            self.markdown = pattern.sub(
+                lambda match: self._HIGHLIGHT_TEMPLATE.format(match.group(0)),
+                self.text_to_analyse,
+            )
+        self.markdown_text = self.markdown.replace('\n', '  \n')  # Two spaces at the end of line for markdown new line
+    def get_unique_entities(self):
+        """
+        Return the unique entities of the original text, in order of first appearance.
+        :return: A list of unique entities.
+        """
+        # Dictionary keys are unique and keep insertion order, so this de-duplicates without reordering.
+        return list(dict.fromkeys(self.all_entities))
+    def unique_entities(self):
+        """
+        Return a list of all unique entities in the original text.
+        Kept for backward compatibility at class level. On a constructed instance the name
+        unique_entities refers to the list attribute set by the constructor, so use
+        get_unique_entities() to recompute the list from an instance.
+        :return: A list of unique entities.
+        """
+        return self.get_unique_entities()