NamedEntityRecognitionTool

Running

App Files Community

Chris4K committed on Jan 9, 2024

Commit

28094fc

1 Parent(s): cf00609

Update ner_tool.py

Browse files

Files changed (1) hide show

ner_tool.py +24 -11

ner_tool.py CHANGED Viewed

@@ -16,8 +16,12 @@ class NamedEntityRecognitionTool(Tool):
         # Perform named entity recognition on the input text
         entities = ner_analyzer(text)
-        # Prepare a list to store token-level entities
-        token_entities = []
         for entity in entities:
             label = entity.get("entity", "UNKNOWN")
@@ -30,15 +34,24 @@ class NamedEntityRecognitionTool(Tool):
             # Check for multi-token entities
             if "##" in word:
-                # For multi-token entities, add each sub-token with its label
-                sub_tokens = word.split("##")
-                for i, sub_token in enumerate(sub_tokens):
-                    token_entities.append({"token": sub_token, "label": label, "entity_text": entity_text})
             else:
-                # For single-token entities, add the token with its label
-                token_entities.append({"token": word, "label": label, "entity_text": entity_text})
-        # Print the identified token-level entities
-        print(f"Token-level Entities: {token_entities}")
-        return {"entities": token_entities}  # Return a dictionary with the specified output component

         # Perform named entity recognition on the input text
         entities = ner_analyzer(text)
+        # Prepare a list to store word-level entities
+        word_entities = []
+        # Initialize variables to track the current word and its label
+        current_word = ""
+        current_label = None
         for entity in entities:
             label = entity.get("entity", "UNKNOWN")
             # Check for multi-token entities
             if "##" in word:
+                # Concatenate sub-tokens to form the complete word
+                current_word += entity_text
+                current_label = label
             else:
+                # If it's the first token of a new word, add the previous word to the list
+                if current_word:
+                    word_entities.append({"word": current_word, "label": current_label, "entity_text": current_word})
+                    current_word = ""
+                    current_label = None
+                # Add the current token as a new word
+                word_entities.append({"word": word, "label": label, "entity_text": entity_text})
+        # Check for any remaining word
+        if current_word:
+            word_entities.append({"word": current_word, "label": current_label, "entity_text": current_word})
+        # Print the identified word-level entities
+        print(f"Word-level Entities: {word_entities}")
+        return {"entities": word_entities}  # Return a dictionary with the specified output component