File size: 6,901 Bytes
240e0a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
import re
from os import path

from collections import Counter

from loguru import logger

# from langdetect import detect
import spacy
import en_core_web_sm
import zh_core_web_sm

from magic_pdf.libs.language import detect_lang


class NLPModels:
    r"""
    Holds the spaCy English and Chinese small pipelines together with helpers
    for language detection and entity-category detection.

    Both ``en_core_web_sm`` and ``zh_core_web_sm`` are loaded in ``__init__``,
    so creating an instance is expensive; build one and reuse it.

    How to upload local models to s3:
        - config aws cli:
            doc\SETUP-CLI.md
            doc\setup_cli.sh
            app\config\__init__.py
        - $ cd {local_dir_storing_models}
        - $ ls models
            en_core_web_sm-3.7.1/
            zh_core_web_sm-3.7.0/
        - $ aws s3 sync models/ s3://llm-infra/models --profile=p_project_norm
        - $ aws s3 --profile=p_project_norm ls  s3://llm-infra/models/
            PRE en_core_web_sm-3.7.1/
            PRE zh_core_web_sm-3.7.0/
    """

    # Minimum share of non-punctuation tokens that must belong to named
    # entities before any category is reported. This gate is independent of
    # the ``threshold`` argument of ``detect_entity_catgr_using_nlp``.
    _MIN_ENTITY_COVERAGE = 0.5

    def __init__(self):
        """Set default model locations, the supported-model table, and load both pipelines."""
        # NOTE: the original comment here said that on Windows "TMP_DIR" should
        # be set to "D:/tmp"; no code in this class does that.

        home_dir = path.expanduser("~")
        # Candidate storage locations for models. They are only recorded here;
        # nothing else in this class reads them.
        self.default_local_path = path.join(home_dir, ".nlp_models")
        self.default_shared_path = "/share/pdf_processor/nlp_models"
        self.default_hdfs_path = "hdfs://pdf_processor/nlp_models"
        self.default_s3_path = "s3://llm-infra/models"

        # Supported models: name -> {"type", "version"}; consulted by load_model().
        self.nlp_models = {
            "en_core_web_sm": {
                "type": "spacy",
                "version": "3.7.1",
            },
            "en_core_web_md": {
                "type": "spacy",
                "version": "3.7.1",
            },
            "en_core_web_lg": {
                "type": "spacy",
                "version": "3.7.1",
            },
            "zh_core_web_sm": {
                "type": "spacy",
                "version": "3.7.0",
            },
            "zh_core_web_md": {
                "type": "spacy",
                "version": "3.7.0",
            },
            "zh_core_web_lg": {
                "type": "spacy",
                "version": "3.7.0",
            },
        }
        self.en_core_web_sm_model = en_core_web_sm.load()
        self.zh_core_web_sm_model = zh_core_web_sm.load()

    def load_model(self, model_name, model_type, model_version):
        """
        Load a supported spaCy model by name.

        Parameters
        ----------
        model_name : str
            Key into ``self.nlp_models`` (e.g. ``"en_core_web_sm"``).
        model_type : str
            Must equal the registered ``"type"`` of the model.
        model_version : str
            Must equal the registered ``"version"`` of the model.

        Returns
        -------
        The loaded spaCy pipeline, or None when the name/type/version does not
        match the table (an error is logged) or when the model is not an
        installed package.
        """
        model_info = self.nlp_models.get(model_name)
        if (
            model_info is None
            or model_info["type"] != model_type
            or model_info["version"] != model_version
        ):
            logger.error(f"Unsupported model name or version: {model_name} {model_version}")
            return None

        if not spacy.util.is_package(model_name):
            return None
        return spacy.load(model_name)

    def detect_language(self, text, use_langdetect=False):
        """
        Classify ``text`` as English or Chinese.

        Parameters
        ----------
        text : str
            Text to classify.
        use_langdetect : bool
            When True, delegate to ``detect_lang`` and map every result other
            than ``"zh"`` to ``"en"``. When False, compare the number of ASCII
            letters with the number of characters in U+4E00..U+9FFF.

        Returns
        -------
        str or None
            ``"en"``, ``"zh"``, or None when ``text`` is empty/None or (in
            counting mode) the two counts are equal.
        """
        if not text:
            return None

        if use_langdetect:
            return "zh" if detect_lang(text) == "zh" else "en"

        en_count = len(re.findall(r"[a-zA-Z]", text))
        cn_count = len(re.findall(r"[\u4e00-\u9fff]", text))

        if en_count > cn_count:
            return "en"
        if cn_count > en_count:
            return "zh"
        # Tie (including text with neither kind of character): undecided.
        return None

    def detect_entity_catgr_using_nlp(self, text, threshold=0.5):
        """
        Detect entity categories using NLP models and return the most frequent entity type.

        Parameters
        ----------
        text : str
            Text to be processed.
        threshold : float
            Minimum share of entity words that the most common entity label
            must account for in order to be returned.

        Returns
        -------
        str or None
            The dominant entity label, or None when the language cannot be
            determined, entities cover less than half of the non-punctuation
            tokens, no label reaches ``threshold``, or the model raises.
        """
        lang = self.detect_language(text, use_langdetect=True)

        if lang == "en":
            nlp_model = self.en_core_web_sm_model
        elif lang == "zh":
            nlp_model = self.zh_core_web_sm_model
        else:
            # No usable language (e.g. empty text). Return None like every
            # other "no result" path instead of an empty dict.
            return None

        # Splitting text into smaller parts
        text_parts = re.split(r"[,;,;、\s & |]+", text)

        # Drop parts that *start* with a digit or non-word character
        # (re.match anchors at the beginning only).
        text_parts = [part for part in text_parts if not re.match(r"[\d\W]+", part)]
        text_combined = " ".join(text_parts)

        try:
            doc = nlp_model(text_combined)

            # Whitespace-separated word count per entity label.
            word_counts_in_entities = Counter()
            for ent in doc.ents:
                word_counts_in_entities[ent.label_] += len(ent.text.split())

            total_words_in_entities = sum(word_counts_in_entities.values())
            total_words = len([token for token in doc if not token.is_punct])

            if total_words_in_entities == 0 or total_words == 0:
                return None

            # Gate 1: entities must cover enough of the text overall.
            entity_percentage = total_words_in_entities / total_words
            if entity_percentage < self._MIN_ENTITY_COVERAGE:
                return None

            # Gate 2: one label must dominate among the entity words.
            most_common_entity, word_count = word_counts_in_entities.most_common(1)[0]
            entity_percentage = word_count / total_words_in_entities

            if entity_percentage >= threshold:
                return most_common_entity
            return None
        except Exception as e:
            # Best-effort: a model failure is logged and reported as "no entity".
            logger.error(f"Error in entity detection: {e}")
            return None


def __main__():
    """Run the entity-category detector over sample strings and print each result."""
    samples = [
        "张三",
        "张三, 李四,王五; 赵六",
        "John Doe",
        "Jane Smith",
        "Lee, John",
        "John Doe, Jane Smith; Alice Johnson,Bob Lee",
        "孙七, Michael Jordan;赵八",
        "David Smith  Michael O'Connor; Kevin ßáçøñ",
        "李雷·韩梅梅, 张三·李四",
        "Charles Robert Darwin, Isaac Newton",
        "莱昂纳多·迪卡普里奥, 杰克·吉伦哈尔",
        "John Doe, Jane Smith; Alice Johnson",
        "张三, 李四,王五; 赵六",
        "Lei Wang, Jia Li, and Xiaojun Chen, LINKE YANG OU, and YUAN ZHANG",
        "Rachel Mills  &  William Barry  &  Susanne B. Haga",
        "Claire Chabut* and Jean-François Bussières",
        "1 Department of Chemistry, Northeastern University, Shenyang 110004, China 2 State Key Laboratory of Polymer Physics and Chemistry, Changchun Institute of Applied Chemistry, Chinese Academy of Sciences, Changchun 130022, China",
        "Changchun",
        "china",
        "Rongjun Song, 1,2 Baoyan Zhang, 1 Baotong Huang, 2 Tao Tang 2",
        "Synergistic Effect of Supported Nickel Catalyst with Intumescent Flame-Retardants on Flame Retardancy and Thermal Stability of Polypropylene",
        "Synergistic Effect of Supported Nickel Catalyst with",
        "Intumescent Flame-Retardants on Flame Retardancy",
        "and Thermal Stability of Polypropylene",
    ]

    # Models are loaded once and shared across all samples.
    detector = NLPModels()

    for sample in samples:
        # Blank line separates the output of consecutive samples.
        print()
        print(f"Original String: {sample}")
        print(f"Detected entities: {detector.detect_entity_catgr_using_nlp(sample)}")

# Run the demo driver only when this file is executed as a script, not on import.
if __name__ == "__main__":
    __main__()