File size: 2,522 Bytes
c5922b9
 
 
 
 
d25649c
c5922b9
507724b
c5922b9
 
 
 
 
 
 
 
d25649c
 
 
 
 
 
 
 
 
 
 
 
 
fb510e6
d25649c
 
 
 
 
 
e607aa8
d25649c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c5922b9
fb510e6
d25649c
c5922b9
d25649c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
from transformers import pipeline
from transformers import Tool

class NamedEntityRecognitionTool(Tool):
    """Tool that runs a ``transformers`` NER pipeline and groups hits by type.

    Calling the tool returns ``{"entities": {...}}`` where the inner dict maps
    each category name (``"persons"``, ``"organizations"``, ...) to the list
    of matching text spans, in the order the pipeline reported them.
    """

    name = "ner_tool"
    description = "Identifies and labels various entities in a given text."
    inputs = ["text"]
    outputs = ["text"]

    # Result buckets, in the order they appear in the returned dictionary.
    _CATEGORIES = (
        "persons",
        "organizations",
        "locations",
        "dates",
        "times",
        "money",
        "percentages",
        "numbers",
        "ordinals",
        "miscellaneous",
    )

    # (entity-type prefix, bucket) pairs, tested in order against the label
    # once its "B-"/"I-" tag has been removed. "PERCENT" has to come before
    # "PER": otherwise every percent label would be filed under "persons".
    _LABEL_PREFIXES = (
        ("PERCENT", "percentages"),
        ("PER", "persons"),
        ("ORG", "organizations"),
        ("LOC", "locations"),
        ("DATE", "dates"),
        ("TIME", "times"),
        ("MONEY", "money"),
        ("CARDINAL", "numbers"),
        ("ORDINAL", "ordinals"),
    )

    @classmethod
    def _category_for(cls, label):
        """Return the result bucket for a pipeline label such as ``"I-PER"``.

        Both begin (``B-``) and inside (``I-``) tags are accepted, as are
        labels without a tag. Anything unrecognised maps to
        ``"miscellaneous"``.
        """
        label = str(label or "")
        entity_type = label[2:] if label[:2] in ("B-", "I-") else label
        for prefix, category in cls._LABEL_PREFIXES:
            if entity_type.startswith(prefix):
                return category
        return "miscellaneous"

    @staticmethod
    def _entity_text(text, entity):
        """Return the surface text of one pipeline hit.

        The span ``text[start:end]`` is used when the hit carries usable
        character offsets; otherwise the hit's ``"word"`` value is returned.
        """
        start = entity.get("start")
        end = entity.get("end")

        span = ""
        # Offsets can be absent or None (presumably when the tokenizer cannot
        # provide them -- confirm against the pipeline docs). Slicing with
        # None would return the whole input, so only slice with real indices.
        if hasattr(start, "__index__") and hasattr(end, "__index__"):
            if 0 <= start < end:
                span = text[start:end].strip()

        # NOTE(review): "word" may be a sub-word piece rather than the exact
        # source text; it is only a fallback for hits without offsets.
        return span or str(entity.get("word") or "").strip()

    def __call__(self, text: str) -> dict:
        """Find entities in *text* and return them grouped by category.

        Args:
            text: The text to analyse.

        Returns:
            ``{"entities": categorized}`` where ``categorized`` maps every
            category name to a (possibly empty) list of entity strings.
        """
        # Build the pipeline once per instance instead of on every call.
        ner_analyzer = getattr(self, "_ner_analyzer", None)
        if ner_analyzer is None:
            ner_analyzer = pipeline("ner")
            self._ner_analyzer = ner_analyzer

        entities = ner_analyzer(text)

        categorized_entities = {category: [] for category in self._CATEGORIES}

        for entity in entities:
            # Un-aggregated hits carry "entity"; "entity_group" is accepted
            # too so the tool keeps working if aggregation is enabled.
            label = entity.get("entity") or entity.get("entity_group") or "UNKNOWN"
            category = self._category_for(label)
            categorized_entities[category].append(self._entity_text(text, entity))

        # Print the identified entities
        print(f"Categorized Entities: {categorized_entities}")

        return {"entities": categorized_entities}  # Return a dictionary with the specified output component