Spaces:
Sleeping
Sleeping
File size: 1,516 Bytes
14e19a5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 |
from config import LANGUAGE_IDENTIFICATION_LIBRARY
# Lower-cased name of the configured language-identification backend;
# classify_language() compares it against "fastlid", "fasttext" and "langid".
module = LANGUAGE_IDENTIFICATION_LIBRARY.lower()
def classify_language(text: str, target_languages: list = None) -> str:
    """Identify the language of *text* using the configured backend.

    The backend is selected by the module-level ``module`` value:
    ``"fastlid"`` or ``"fasttext"`` use the ``fastlid`` package and
    ``"langid"`` uses the ``langid`` package. The backend is imported inside
    the function, so only the configured library is ever imported.

    Args:
        text: The text to classify.
        target_languages: Optional list of candidate languages handed to the
            backend. When ``None`` the backend's language set is not touched.

    Returns:
        The first element of the backend's classification result (per the
        fastlid/langid documentation, the language code).

    Raises:
        ValueError: If ``module`` does not name a supported backend.
    """
    if module in ("fastlid", "fasttext"):
        from fastlid import fastlid

        classifier = fastlid
        if target_languages is not None:
            # fastlid is restricted by assigning to this attribute rather
            # than by calling it.
            fastlid.set_languages = target_languages
    elif module == "langid":
        import langid

        classifier = langid.classify
        if target_languages is not None:
            langid.set_languages(target_languages)
    else:
        raise ValueError("Wrong LANGUAGE_IDENTIFICATION_LIBRARY in config.py")

    # NOTE(review): the language restriction is set on the backend itself, so
    # it presumably stays in effect for later calls that pass
    # target_languages=None -- confirm whether a reset is wanted.
    return classifier(text)[0]
def _is_kana(char: str) -> bool:
    """Return True if *char* lies in the Hiragana or Katakana block."""
    # Hiragana (U+3040-U+309F) and Katakana (U+30A0-U+30FF) are adjacent, so
    # one range check covers both blocks.
    return 0x3040 <= ord(char) <= 0x30FF


def classify_zh_ja(text: str) -> str:
    """Decide whether *text* is Japanese or Chinese by looking for kana.

    Han characters are shared by both languages, so only kana are decisive:
    the text is labelled Japanese as soon as any Hiragana or Katakana
    character is found, and Chinese otherwise.

    Args:
        text: The text to inspect.

    Returns:
        ``"ja"`` if *text* contains at least one Hiragana or Katakana
        character, otherwise ``"zh"`` (including for an empty string or text
        with no CJK characters at all).
    """
    # A separate "kana follows a Han character" check is unnecessary: the
    # scan visits every character, so such a kana is found on its own.
    return "ja" if any(_is_kana(char) for char in text) else "zh"
if __name__ == "__main__":
    # Manual smoke test: run both classifiers over one Chinese and one
    # Japanese sample and print the detected language codes.
    samples = (
        "这是一个测试文本",  # classify_zh_ja -> "zh"
        "これはテストテキストです",  # classify_zh_ja -> "ja"
    )
    for text in samples:
        print(classify_language(text))
        print(classify_zh_ja(text))
|