|
import re |
|
from os import path |
|
|
|
from collections import Counter |
|
|
|
from loguru import logger |
|
|
|
|
|
import spacy |
|
import en_core_web_sm |
|
import zh_core_web_sm |
|
|
|
from magic_pdf.libs.language import detect_lang |
|
|
|
|
|
class NLPModels: |
|
""" |
|
How to upload local models to s3: |
|
- config aws cli: |
|
doc\SETUP-CLI.md |
|
doc\setup_cli.sh |
|
app\config\__init__.py |
|
- $ cd {local_dir_storing_models} |
|
- $ ls models |
|
en_core_web_sm-3.7.1/ |
|
zh_core_web_sm-3.7.0/ |
|
- $ aws s3 sync models/ s3://llm-infra/models --profile=p_project_norm |
|
- $ aws s3 --profile=p_project_norm ls s3://llm-infra/models/ |
|
PRE en_core_web_sm-3.7.1/ |
|
PRE zh_core_web_sm-3.7.0/ |
|
""" |
|
|
|
def __init__(self): |
|
|
|
|
|
home_dir = path.expanduser("~") |
|
self.default_local_path = path.join(home_dir, ".nlp_models") |
|
self.default_shared_path = "/share/pdf_processor/nlp_models" |
|
self.default_hdfs_path = "hdfs://pdf_processor/nlp_models" |
|
self.default_s3_path = "s3://llm-infra/models" |
|
self.nlp_models = self.nlp_models = { |
|
"en_core_web_sm": { |
|
"type": "spacy", |
|
"version": "3.7.1", |
|
}, |
|
"en_core_web_md": { |
|
"type": "spacy", |
|
"version": "3.7.1", |
|
}, |
|
"en_core_web_lg": { |
|
"type": "spacy", |
|
"version": "3.7.1", |
|
}, |
|
"zh_core_web_sm": { |
|
"type": "spacy", |
|
"version": "3.7.0", |
|
}, |
|
"zh_core_web_md": { |
|
"type": "spacy", |
|
"version": "3.7.0", |
|
}, |
|
"zh_core_web_lg": { |
|
"type": "spacy", |
|
"version": "3.7.0", |
|
}, |
|
} |
|
self.en_core_web_sm_model = en_core_web_sm.load() |
|
self.zh_core_web_sm_model = zh_core_web_sm.load() |
|
|
|
def load_model(self, model_name, model_type, model_version): |
|
if ( |
|
model_name in self.nlp_models |
|
and self.nlp_models[model_name]["type"] == model_type |
|
and self.nlp_models[model_name]["version"] == model_version |
|
): |
|
return spacy.load(model_name) if spacy.util.is_package(model_name) else None |
|
|
|
else: |
|
logger.error(f"Unsupported model name or version: {model_name} {model_version}") |
|
return None |
|
|
|
def detect_language(self, text, use_langdetect=False): |
|
if len(text) == 0: |
|
return None |
|
if use_langdetect: |
|
|
|
|
|
|
|
if detect_lang(text) == "zh": |
|
return "zh" |
|
else: |
|
return "en" |
|
|
|
if not use_langdetect: |
|
en_count = len(re.findall(r"[a-zA-Z]", text)) |
|
cn_count = len(re.findall(r"[\u4e00-\u9fff]", text)) |
|
|
|
if en_count > cn_count: |
|
return "en" |
|
|
|
if cn_count > en_count: |
|
return "zh" |
|
|
|
def detect_entity_catgr_using_nlp(self, text, threshold=0.5): |
|
""" |
|
Detect entity categories using NLP models and return the most frequent entity types. |
|
|
|
Parameters |
|
---------- |
|
text : str |
|
Text to be processed. |
|
|
|
Returns |
|
------- |
|
str |
|
The most frequent entity type. |
|
""" |
|
lang = self.detect_language(text, use_langdetect=True) |
|
|
|
if lang == "en": |
|
nlp_model = self.en_core_web_sm_model |
|
elif lang == "zh": |
|
nlp_model = self.zh_core_web_sm_model |
|
else: |
|
|
|
return {} |
|
|
|
|
|
text_parts = re.split(r"[,;,;、\s & |]+", text) |
|
|
|
text_parts = [part for part in text_parts if not re.match(r"[\d\W]+", part)] |
|
text_combined = " ".join(text_parts) |
|
|
|
try: |
|
doc = nlp_model(text_combined) |
|
entity_counts = Counter([ent.label_ for ent in doc.ents]) |
|
word_counts_in_entities = Counter() |
|
|
|
for ent in doc.ents: |
|
word_counts_in_entities[ent.label_] += len(ent.text.split()) |
|
|
|
total_words_in_entities = sum(word_counts_in_entities.values()) |
|
total_words = len([token for token in doc if not token.is_punct]) |
|
|
|
if total_words_in_entities == 0 or total_words == 0: |
|
return None |
|
|
|
entity_percentage = total_words_in_entities / total_words |
|
if entity_percentage < 0.5: |
|
return None |
|
|
|
most_common_entity, word_count = word_counts_in_entities.most_common(1)[0] |
|
entity_percentage = word_count / total_words_in_entities |
|
|
|
if entity_percentage >= threshold: |
|
return most_common_entity |
|
else: |
|
return None |
|
except Exception as e: |
|
logger.error(f"Error in entity detection: {e}") |
|
return None |
|
|
|
|
|
def __main__(): |
|
nlpModel = NLPModels() |
|
|
|
test_strings = [ |
|
"张三", |
|
"张三, 李四,王五; 赵六", |
|
"John Doe", |
|
"Jane Smith", |
|
"Lee, John", |
|
"John Doe, Jane Smith; Alice Johnson,Bob Lee", |
|
"孙七, Michael Jordan;赵八", |
|
"David Smith Michael O'Connor; Kevin ßáçøñ", |
|
"李雷·韩梅梅, 张三·李四", |
|
"Charles Robert Darwin, Isaac Newton", |
|
"莱昂纳多·迪卡普里奥, 杰克·吉伦哈尔", |
|
"John Doe, Jane Smith; Alice Johnson", |
|
"张三, 李四,王五; 赵六", |
|
"Lei Wang, Jia Li, and Xiaojun Chen, LINKE YANG OU, and YUAN ZHANG", |
|
"Rachel Mills & William Barry & Susanne B. Haga", |
|
"Claire Chabut* and Jean-François Bussières", |
|
"1 Department of Chemistry, Northeastern University, Shenyang 110004, China 2 State Key Laboratory of Polymer Physics and Chemistry, Changchun Institute of Applied Chemistry, Chinese Academy of Sciences, Changchun 130022, China", |
|
"Changchun", |
|
"china", |
|
"Rongjun Song, 1,2 Baoyan Zhang, 1 Baotong Huang, 2 Tao Tang 2", |
|
"Synergistic Effect of Supported Nickel Catalyst with Intumescent Flame-Retardants on Flame Retardancy and Thermal Stability of Polypropylene", |
|
"Synergistic Effect of Supported Nickel Catalyst with", |
|
"Intumescent Flame-Retardants on Flame Retardancy", |
|
"and Thermal Stability of Polypropylene", |
|
] |
|
|
|
for test in test_strings: |
|
print() |
|
print(f"Original String: {test}") |
|
|
|
result = nlpModel.detect_entity_catgr_using_nlp(test) |
|
print(f"Detected entities: {result}") |
|
|
|
|
|
if __name__ == "__main__": |
|
__main__() |
|
|