Chris4K committed on
Commit
28094fc
·
1 Parent(s): cf00609

Update ner_tool.py

Browse files
Files changed (1) hide show
  1. ner_tool.py +24 -11
ner_tool.py CHANGED
@@ -16,8 +16,12 @@ class NamedEntityRecognitionTool(Tool):
16
  # Perform named entity recognition on the input text
17
  entities = ner_analyzer(text)
18
 
19
- # Prepare a list to store token-level entities
20
- token_entities = []
 
 
 
 
21
 
22
  for entity in entities:
23
  label = entity.get("entity", "UNKNOWN")
@@ -30,15 +34,24 @@ class NamedEntityRecognitionTool(Tool):
30
 
31
  # Check for multi-token entities
32
  if "##" in word:
33
- # For multi-token entities, add each sub-token with its label
34
- sub_tokens = word.split("##")
35
- for i, sub_token in enumerate(sub_tokens):
36
- token_entities.append({"token": sub_token, "label": label, "entity_text": entity_text})
37
  else:
38
- # For single-token entities, add the token with its label
39
- token_entities.append({"token": word, "label": label, "entity_text": entity_text})
 
 
 
 
 
 
 
 
 
 
40
 
41
- # Print the identified token-level entities
42
- print(f"Token-level Entities: {token_entities}")
43
 
44
- return {"entities": token_entities} # Return a dictionary with the specified output component
 
16
  # Perform named entity recognition on the input text
17
  entities = ner_analyzer(text)
18
 
19
+ # Prepare a list to store word-level entities
20
+ word_entities = []
21
+
22
+ # Initialize variables to track the current word and its label
23
+ current_word = ""
24
+ current_label = None
25
 
26
  for entity in entities:
27
  label = entity.get("entity", "UNKNOWN")
 
34
 
35
  # Check for multi-token entities
36
  if "##" in word:
37
+ # Concatenate sub-tokens to form the complete word
38
+ current_word += entity_text
39
+ current_label = label
 
40
  else:
41
+ # If it's the first token of a new word, add the previous word to the list
42
+ if current_word:
43
+ word_entities.append({"word": current_word, "label": current_label, "entity_text": current_word})
44
+ current_word = ""
45
+ current_label = None
46
+
47
+ # Add the current token as a new word
48
+ word_entities.append({"word": word, "label": label, "entity_text": entity_text})
49
+
50
+ # Check for any remaining word
51
+ if current_word:
52
+ word_entities.append({"word": current_word, "label": current_label, "entity_text": current_word})
53
 
54
+ # Print the identified word-level entities
55
+ print(f"Word-level Entities: {word_entities}")
56
 
57
+ return {"entities": word_entities} # Return a dictionary with the specified output component