|
--- |
|
license: apache-2.0 |
|
language: |
|
- zh |
|
- ja |
|
- ar |
|
- en |
|
- hi |
|
metrics: |
|
- accuracy |
|
library_name: allennlp |
|
--- |
|
## Language Identification |
|
|
|
该模型是基于 AllenNLP 在 [qgyd2021/language_identification](https://huggingface.co/datasets/qgyd2021/language_identification) 数据集上训练的语种识别模型。 |
|
|
|
|
|
|
|
在 valid 验证集上的准确率情况: |
|
|
|
| 语种 | 样本数量 | 准确率 |
| :--- | :----: | ------: |
| af | 6221 | 0.8666 |
| ar | 19808 | 0.9994 |
| bg | 19913 | 0.9958 |
| bn | 7396 | 0.9968 |
| bs | 1653 | 0.8232 |
| cs | 19122 | 0.9615 |
| da | 19500 | 0.9727 |
| de | 19702 | 0.996 |
| el | 19455 | 0.9761 |
| en | 39710 | 0.9942 |
| eo | 18542 | 0.9944 |
| es | 19924 | 0.9937 |
| et | 19482 | 0.9727 |
| fi | 19223 | 0.9554 |
| fo | 4612 | 0.9697 |
| fr | 19990 | 0.9957 |
| ga | 19949 | 0.9973 |
| gl | 508 | 0.822 |
| hi | 19984 | 0.9965 |
| hi_en | 1358 | 0.951 |
| hr | 18840 | 0.9789 |
| hu | 669 | 0.8873 |
| hy | 124 | 0.9688 |
| id | 4669 | 0.9968 |
| is | 19795 | 0.9876 |
| it | 19742 | 0.9941 |
| ja | 20130 | 0.9996 |
| ko | 20098 | 0.9998 |
| lt | 19280 | 0.9721 |
| lv | 19459 | 0.9931 |
| mr | 10300 | 0.9961 |
| mt | 19708 | 0.993 |
| nl | 18452 | 0.9258 |
| no | 19404 | 0.9714 |
| pl | 19920 | 0.9973 |
| pt | 19996 | 0.9946 |
| ro | 19804 | 0.9944 |
| ru | 20003 | 0.9954 |
| sk | 19804 | 0.9861 |
| sl | 19665 | 0.9926 |
| sv | 18941 | 0.95 |
| sw | 19768 | 0.9871 |
| th | 19917 | 0.9991 |
| tl | 19572 | 0.9991 |
| tn | 19883 | 0.9933 |
| tr | 19809 | 0.9939 |
| ts | 19752 | 0.9854 |
| uk | 17643 | 0.9994 |
| ur | 19895 | 0.992 |
| vi | 19836 | 0.9982 |
| yo | 1936 | 0.9827 |
| zh | 40108 | 0.9996 |
| zu | 5406 | 0.9905 |
|
|
|
|
|
|
|
|
|
测试代码: |
|
```python |
|
#!/usr/bin/python3 |
|
# -*- coding: utf-8 -*- |
|
import argparse |
|
import time |
|
|
|
from allennlp.models.archival import archive_model, load_archive |
|
from allennlp.predictors.text_classifier import TextClassifierPredictor |
|
|
|
from project_settings import project_path |
|
|
|
|
|
def get_args():
    """
    Parse the command-line options of this prediction script.

    Usage:
        python3 step_5_predict_by_archive.py

    Options:
        --text          string option, defaults to "hello guy.".
        --archive_file  string option, defaults to the
                        "trained_models/language_identification" entry
                        under ``project_path`` (rendered with
                        ``as_posix()``).

    :return: the namespace produced by ``ArgumentParser.parse_args()``.
    """
    # NOTE(review): project_path is assumed to be a pathlib-style path
    # (it is combined with "/" and rendered via as_posix()) - confirm in
    # project_settings.
    default_archive = project_path / "trained_models/language_identification"

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--text", type=str, default="hello guy.")
    arg_parser.add_argument(
        "--archive_file",
        type=str,
        default=default_archive.as_posix(),
    )
    return arg_parser.parse_args()
|
|
|
|
|
def main():
    """
    Load the archived model, run one prediction and print the result.

    The text given by ``--text`` is sent to the predictor under the
    "sentence" key. Three lines are printed, in this order:

    1. the value of the "label" field of the prediction output;
    2. the largest entry of its "probs" field, rounded to 4 decimals;
    3. ``time cost: <seconds>`` - the duration of the prediction call.

    :return: None
    """
    args = get_args()

    archive = load_archive(archive_file=args.archive_file)

    predictor = TextClassifierPredictor(
        model=archive.model,
        dataset_reader=archive.dataset_reader,
    )

    json_dict = {
        "sentence": args.text
    }

    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring short durations; time.time() can jump when the system
    # clock is adjusted.
    begin_time = time.perf_counter()
    outputs = predictor.predict_json(
        json_dict
    )
    # Stop the clock right after inference so that console I/O from the
    # prints below is not counted as prediction time.
    time_cost = time.perf_counter() - begin_time

    label = outputs["label"]
    prob = round(max(outputs["probs"]), 4)
    print(label)
    print(prob)

    print('time cost: {}'.format(time_cost))
|
|
|
|
|
# Run the prediction only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|
|
``` |
|
|
|
requirements.txt |
|
```text |
|
allennlp==2.10.1 |
|
allennlp-models==2.10.1 |
|
torch==1.12.1 |
|
overrides==1.9.0 |
|
pytorch_pretrained_bert==0.6.2 |
|
``` |