GitHub Actions commited on
Commit
1364963
·
1 Parent(s): b746f44

Auto-deploy from GitHub (binary files removed)

Browse files
Files changed (33) hide show
  1. space/results/output.log +88 -0
  2. space/results/output.py +116 -0
  3. space/space/.gitignore +0 -2
  4. space/space/requirements.txt +0 -0
  5. space/space/space/space/space/space/.github/workflows/main.yml +47 -0
  6. space/space/space/space/space/space/.gitignore +23 -0
  7. space/space/space/space/space/space/LICENSE +201 -0
  8. space/space/space/space/space/space/configs/config.yaml +1 -0
  9. space/space/space/space/space/space/environment.yml +9 -0
  10. space/space/space/space/space/space/models/best_epoch_16.pt +3 -0
  11. space/space/space/space/space/space/notebooks/Duc_Notebook.ipynb +0 -0
  12. space/space/space/space/space/space/notebooks/Kien_RF_lightgbm.ipynb +741 -0
  13. space/space/space/space/space/space/notebooks/Kien_Rule_base.ipynb +0 -0
  14. space/space/space/space/space/space/notebooks/Softmax_PhoBERT.ipynb +0 -0
  15. space/space/space/space/space/space/requirements.txt +0 -0
  16. space/space/space/space/space/space/run.py +73 -0
  17. space/space/space/space/space/space/space/.gitattributes +35 -0
  18. space/space/space/space/space/space/space/README.md +87 -0
  19. space/space/space/space/space/space/src/__init__.py +4 -0
  20. space/space/space/space/space/space/src/app.py +64 -0
  21. space/space/space/space/space/space/src/configs.py +15 -0
  22. space/space/space/space/space/space/src/data_set.py +31 -0
  23. space/space/space/space/space/space/src/evaluate.py +21 -0
  24. space/space/space/space/space/space/src/front.py +32 -0
  25. space/space/space/space/space/space/src/model.py +16 -0
  26. space/space/space/space/space/space/src/predict.py +46 -0
  27. space/space/space/space/space/space/src/preprocessing.py +171 -0
  28. space/space/space/space/space/space/src/torchcrf/__init__.py +340 -0
  29. space/space/space/space/space/space/src/train.py +98 -0
  30. space/space/space/src/app.py +96 -14
  31. space/space/space/src/predict.py +1 -1
  32. space/space/space/st.py +98 -0
  33. src/predict.py +5 -1
space/results/output.log ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Train Epoch 1/20: 100%|██████████| 736/736 [00:22<00:00, 32.46it/s, avg_loss=2.69, batch_loss=0.947]
2
+ Epoch 1: train_loss=2.6912, train_f1=0.8224, val_loss=1.0848, val_f1=0.8273
3
+ Saved imporved model to ./models/best_epoch_1.pt
4
+ Train Epoch 2/20: 100%|██████████| 736/736 [00:21<00:00, 33.55it/s, avg_loss=0.806, batch_loss=0.998]
5
+
6
+ Epoch 2: train_loss=0.8061, train_f1=0.8674, val_loss=0.7191, val_f1=0.8613
7
+ Saved imporved model to ./models/best_epoch_2.pt
8
+ Train Epoch 3/20: 100%|██████████| 736/736 [00:22<00:00, 32.59it/s, avg_loss=0.584, batch_loss=0.0527]
9
+
10
+ Epoch 3: train_loss=0.5842, train_f1=0.8996, val_loss=0.5643, val_f1=0.8895
11
+ Saved imporved model to ./models/best_epoch_3.pt
12
+ Train Epoch 4/20: 100%|██████████| 736/736 [00:23<00:00, 31.34it/s, avg_loss=0.478, batch_loss=1.06]
13
+
14
+ Epoch 4: train_loss=0.4782, train_f1=0.9122, val_loss=0.4838, val_f1=0.8994
15
+ Saved imporved model to ./models/best_epoch_4.pt
16
+ Train Epoch 5/20: 100%|██████████| 736/736 [00:22<00:00, 32.59it/s, avg_loss=0.406, batch_loss=0.421]
17
+
18
+ Epoch 5: train_loss=0.4056, train_f1=0.9254, val_loss=0.4281, val_f1=0.9101
19
+ Saved imporved model to ./models/best_epoch_5.pt
20
+ Train Epoch 6/20: 100%|██████████| 736/736 [00:21<00:00, 34.15it/s, avg_loss=0.36, batch_loss=1.01]
21
+
22
+ Epoch 6: train_loss=0.3599, train_f1=0.9343, val_loss=0.3934, val_f1=0.9190
23
+ Saved imporved model to ./models/best_epoch_6.pt
24
+ Train Epoch 7/20: 100%|██████████| 736/736 [00:22<00:00, 33.08it/s, avg_loss=0.322, batch_loss=0.392]
25
+
26
+ Epoch 7: train_loss=0.3218, train_f1=0.9383, val_loss=0.3751, val_f1=0.9192
27
+ Saved imporved model to ./models/best_epoch_7.pt
28
+ Train Epoch 8/20: 100%|██████████| 736/736 [00:22<00:00, 32.66it/s, avg_loss=0.294, batch_loss=0.468]
29
+
30
+ Epoch 8: train_loss=0.2942, train_f1=0.9424, val_loss=0.3560, val_f1=0.9189
31
+ Train Epoch 9/20: 100%|██████████| 736/736 [00:23<00:00, 31.68it/s, avg_loss=0.27, batch_loss=0.681]
32
+
33
+ Epoch 9: train_loss=0.2699, train_f1=0.9429, val_loss=0.3521, val_f1=0.9177
34
+ Train Epoch 10/20: 100%|██████████| 736/736 [00:21<00:00, 33.46it/s, avg_loss=0.252, batch_loss=0.525]
35
+
36
+ Epoch 10: train_loss=0.2517, train_f1=0.9493, val_loss=0.3413, val_f1=0.9222
37
+ Saved imporved model to ./models/best_epoch_10.pt
38
+ Train Epoch 11/20: 100%|██████████| 736/736 [00:22<00:00, 32.92it/s, avg_loss=0.238, batch_loss=0.022]
39
+
40
+ Epoch 11: train_loss=0.2383, train_f1=0.9551, val_loss=0.3292, val_f1=0.9232
41
+ Saved imporved model to ./models/best_epoch_11.pt
42
+ Train Epoch 12/20: 100%|██████████| 736/736 [00:23<00:00, 31.72it/s, avg_loss=0.222, batch_loss=0.529]
43
+
44
+ Epoch 12: train_loss=0.2223, train_f1=0.9543, val_loss=0.3305, val_f1=0.9207
45
+ Train Epoch 13/20: 100%|██████████| 736/736 [00:23<00:00, 31.74it/s, avg_loss=0.213, batch_loss=0.381]
46
+
47
+ Epoch 13: train_loss=0.2127, train_f1=0.9593, val_loss=0.3244, val_f1=0.9221
48
+ Train Epoch 14/20: 100%|██████████| 736/736 [00:23<00:00, 31.69it/s, avg_loss=0.203, batch_loss=0.279]
49
+
50
+ Epoch 14: train_loss=0.2026, train_f1=0.9609, val_loss=0.3213, val_f1=0.9224
51
+ Train Epoch 15/20: 100%|██████████| 736/736 [00:23<00:00, 31.84it/s, avg_loss=0.193, batch_loss=0.0462]
52
+
53
+ Epoch 15: train_loss=0.1925, train_f1=0.9574, val_loss=0.3392, val_f1=0.9117
54
+ Train Epoch 16/20: 100%|██████████| 736/736 [00:22<00:00, 32.11it/s, avg_loss=0.186, batch_loss=0.943]
55
+
56
+ Epoch 16: train_loss=0.1863, train_f1=0.9654, val_loss=0.3169, val_f1=0.9250
57
+ Saved imporved model to ./models/best_epoch_16.pt
58
+ Train Epoch 17/20: 100%|██████████| 736/736 [00:22<00:00, 32.38it/s, avg_loss=0.18, batch_loss=0.113]
59
+
60
+ Epoch 17: train_loss=0.1795, train_f1=0.9677, val_loss=0.3187, val_f1=0.9237
61
+ Train Epoch 18/20: 100%|██████████| 736/736 [00:22<00:00, 33.30it/s, avg_loss=0.173, batch_loss=0.00558]
62
+
63
+ Epoch 18: train_loss=0.1728, train_f1=0.9692, val_loss=0.3219, val_f1=0.9173
64
+ Train Epoch 19/20: 100%|██████████| 736/736 [00:23<00:00, 31.48it/s, avg_loss=0.167, batch_loss=0.115]
65
+
66
+ Epoch 19: train_loss=0.1673, train_f1=0.9681, val_loss=0.3261, val_f1=0.9195
67
+ Train Epoch 20/20: 100%|██████████| 736/736 [00:22<00:00, 32.17it/s, avg_loss=0.164, batch_loss=0.0463]
68
+
69
+ Epoch 20: train_loss=0.1640, train_f1=0.9715, val_loss=0.3230, val_f1=0.9185
70
+
71
+ Loading best model from ./models/best_epoch_16.pt for final evaluation...
72
+ Done
73
+
74
+ Evaluation on test set ...
75
+ Test_loss=0.2967, Test_f1=0.9087
76
+ precision recall f1-score support
77
+
78
+ 0 1.00 1.00 1.00 51036
79
+ 1 0.99 0.98 0.99 1112
80
+ 2 0.97 0.99 0.98 506
81
+ 3 0.86 0.79 0.82 180
82
+ 4 0.84 0.80 0.82 291
83
+ 5 0.89 0.91 0.90 939
84
+ 6 0.87 0.84 0.86 428
85
+
86
+ accuracy 0.99 54492
87
+ macro avg 0.92 0.90 0.91 54492
88
+ weighted avg 0.99 0.99 0.99 54492
space/results/output.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model Results
2
+ training_log = {
3
+ "epoch": list(range(1, 21)),
4
+ "train_loss": [
5
+ 2.6912, 0.8061, 0.5842, 0.4782, 0.4056,
6
+ 0.3599, 0.3218, 0.2942, 0.2699, 0.2517,
7
+ 0.2383, 0.2223, 0.2127, 0.2026, 0.1925,
8
+ 0.1863, 0.1795, 0.1728, 0.1673, 0.1640
9
+ ],
10
+ "val_loss": [
11
+ 1.0848, 0.7191, 0.5643, 0.4838, 0.4281,
12
+ 0.3934, 0.3751, 0.3560, 0.3521, 0.3413,
13
+ 0.3292, 0.3305, 0.3244, 0.3213, 0.3392,
14
+ 0.3169, 0.3187, 0.3219, 0.3261, 0.3230
15
+ ],
16
+ "train_f1": [
17
+ 0.8224, 0.8674, 0.8996, 0.9122, 0.9254,
18
+ 0.9343, 0.9383, 0.9424, 0.9429, 0.9493,
19
+ 0.9551, 0.9543, 0.9593, 0.9609, 0.9574,
20
+ 0.9654, 0.9677, 0.9692, 0.9681, 0.9715
21
+ ],
22
+ "val_f1": [
23
+ 0.8273, 0.8613, 0.8895, 0.8994, 0.9101,
24
+ 0.9190, 0.9192, 0.9189, 0.9177, 0.9222,
25
+ 0.9232, 0.9207, 0.9221, 0.9224, 0.9117,
26
+ 0.9250, 0.9237, 0.9173, 0.9195, 0.9185
27
+ ]
28
+ }
29
+
30
+ report_dict = {
31
+ 'O': {"precision": 1.00, "recall": 1.00, "f1-score": 1.00, "support": 51036},
32
+ 'B-PER': {"precision": 0.99, "recall": 0.98, "f1-score": 0.98, "support": 1112},
33
+ 'I-PER': {"precision": 0.97, "recall": 0.99, "f1-score": 0.98, "support": 506},
34
+ 'B-ORG': {"precision": 0.93, "recall": 0.95, "f1-score": 0.94, "support": 939},
35
+ 'I-ORG': {"precision": 0.93, "recall": 0.91, "f1-score": 0.92, "support": 428},
36
+ 'B-LOC': {"precision": 0.83, "recall": 0.84, "f1-score": 0.84, "support": 180},
37
+ 'I-LOC': {"precision": 0.88, "recall": 0.84, "f1-score": 0.86, "support": 291},
38
+ "accuracy": 0.99,
39
+ "macro avg": {"precision": 0.93, "recall": 0.93, "f1-score": 0.93, "support": 54492},
40
+ "weighted avg": {"precision": 0.99, "recall": 0.99, "f1-score": 0.99, "support": 54492}
41
+ }
42
+
43
+
44
+ report_dict_2 = {
45
+ 'O': {"precision": 1.00, "recall": 1.00, "f1-score": 1.00, "support": 68476},
46
+ 'B-PER': {"precision": 0.99, "recall": 0.98, "f1-score": 0.98, "support": 1464},
47
+ 'I-PER': {"precision": 0.98, "recall": 0.98, "f1-score": 0.98, "support": 686},
48
+ 'B-ORG': {"precision": 0.77, "recall": 0.82, "f1-score": 0.80, "support": 257},
49
+ 'I-ORG': {"precision": 0.80, "recall": 0.77, "f1-score": 0.78, "support": 430},
50
+ 'B-LOC': {"precision": 0.88, "recall": 0.90, "f1-score": 0.89, "support": 1241},
51
+ 'I-LOC': {"precision": 0.83, "recall": 0.82, "f1-score": 0.82, "support": 554},
52
+ "accuracy": 0.99,
53
+ "macro avg": {"precision": 0.89, "recall": 0.89, "f1-score": 0.89, "support": 73108},
54
+ "weighted avg": {"precision": 0.99, "recall": 0.99, "f1-score": 0.99, "support": 73108}
55
+ }
56
+
57
+
58
+ model_compare = {
59
+ "Header": ["Model", "F1", "Accuracy"],
60
+ "Data": {
61
+ "PhoBERT + CRF": {"F1": 0.93, "Accuracy": 0.99},
62
+ "CRF": {"F1": 0.91, "Accuracy": 0.99},
63
+ "Softmax": {"F1": 0.89, "Accuracy": 0.99},
64
+ "Random Forest": {"F1": 0.78, "Accuracy": 0.98}
65
+ }
66
+ }
67
+
68
+ data_compare = {
69
+ "Header": ["Data Preprocessing Strategy", "F1"],
70
+ "Data": {
71
+ "Raw": 0.93,
72
+ "Crawl for Balance": 0.91,
73
+ "Remove Sentences with Only 'O' Tags": 0.91
74
+ }
75
+ }
76
+
77
+
78
+
79
+ # EDA
80
+ data_aug_count_sorted = {
81
+ 'B-PER': 474,
82
+ 'I-PER': 121,
83
+ 'B-LOC': 874,
84
+ 'I-LOC': 289,
85
+ 'B-ORG': 1110,
86
+ 'I-ORG': 761
87
+ }
88
+
89
+ raw_data_count_sorted = {
90
+ 'B-PER': 7479,
91
+ 'I-PER': 3522,
92
+ 'B-LOC': 6244,
93
+ 'I-LOC': 2783,
94
+ 'B-ORG': 1212,
95
+ 'I-ORG': 2055,
96
+ 'B-NAT': 282,
97
+ 'I-NAT': 279
98
+ }
99
+
100
+ raw_data_count_withoutNAT_sorted = {
101
+ 'B-PER': 7479,
102
+ 'I-PER': 3522,
103
+ 'B-LOC': 6244,
104
+ 'I-LOC': 2783,
105
+ 'B-ORG': 1212,
106
+ 'I-ORG': 2055
107
+ }
108
+
109
+ combined_count_sorted = {
110
+ 'B-PER': 7953,
111
+ 'I-PER': 3643,
112
+ 'B-LOC': 7118,
113
+ 'I-LOC': 3072,
114
+ 'B-ORG': 2322,
115
+ 'I-ORG': 2816
116
+ }
space/space/.gitignore CHANGED
@@ -10,8 +10,6 @@ __pycache__/
10
 
11
  # Dataset and results folders
12
  data/
13
- results/
14
- outputs/
15
  logs/
16
 
17
  # Large files
 
10
 
11
  # Dataset and results folders
12
  data/
 
 
13
  logs/
14
 
15
  # Large files
space/space/requirements.txt CHANGED
Binary files a/space/space/requirements.txt and b/space/space/requirements.txt differ
 
space/space/space/space/space/space/.github/workflows/main.yml ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Deploy to Hugging Face Space
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main # hoặc branch bạn dùng
7
+
8
+ jobs:
9
+ deploy:
10
+ runs-on: ubuntu-latest
11
+
12
+ steps:
13
+ - name: Checkout repo
14
+ uses: actions/checkout@v3
15
+
16
+ - name: Set up Git
17
+ run: |
18
+ git config --global user.email "[email protected]"
19
+ git config --global user.name "GitHub Actions"
20
+
21
+ - name: Push to Hugging Face Spaces
22
+ env:
23
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
24
+ run: |
25
+ git clone https://huggingface.co/spaces/DucLai/Vietnamese_NER space
26
+
27
+ # Đồng bộ code vào repo Space (không copy .git)
28
+ rsync -av --exclude '.git' ./ space/
29
+
30
+ # Xoá file binary ra khỏi Git index trước khi commit
31
+ cd space
32
+ find . -type f \( \
33
+ -iname "*.png" -o \
34
+ -iname "*.jpg" -o \
35
+ -iname "*.jpeg" -o \
36
+ -iname "*.mp4" -o \
37
+ -iname "*.zip" -o \
38
+ -iname "*.pth" -o \
39
+ -iname "*.h5" -o \
40
+ -iname "*.tar.gz" -o \
41
+ -iname "*.wav" \
42
+ \) -exec git rm --cached {} \; || true
43
+
44
+ # Commit và push
45
+ git add .
46
+ git commit -m "Auto-deploy from GitHub (binary files removed)" || echo "No changes to commit"
47
+ git push https://DucLai:${HF_TOKEN}@huggingface.co/spaces/DucLai/Vietnamese_NER HEAD
space/space/space/space/space/space/.gitignore ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python cache
2
+ __pycache__/
3
+ *.pyc
4
+ *.pyo
5
+
6
+ # Hugging Face binary/model outputs
7
+ *.pth
8
+ *.h5
9
+ *.ckpt
10
+
11
+ # Dataset and results folders
12
+ data/
13
+ results/
14
+ outputs/
15
+ logs/
16
+
17
+ # Large files
18
+ *.zip
19
+ *.tar.gz
20
+ *.mp4
21
+ *.png
22
+ *.jpg
23
+ *.jpeg
space/space/space/space/space/space/LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
space/space/space/space/space/space/configs/config.yaml ADDED
@@ -0,0 +1 @@
 
 
1
+ ECHO is on.
space/space/space/space/space/space/environment.yml ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ name: vnner
2
+ channels:
3
+ - defaults
4
+ - conda-forge
5
+ dependencies:
6
+ - python=3.10
7
+ - pip
8
+ - pip:
9
+ - -r requirements.txt
space/space/space/space/space/space/models/best_epoch_16.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:622cac3a55eec6a245f70c2ec7591d8fbfa8c18e13db7555915405fb57b145a0
3
+ size 24130
space/space/space/space/space/space/notebooks/Duc_Notebook.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
space/space/space/space/space/space/notebooks/Kien_RF_lightgbm.ipynb ADDED
@@ -0,0 +1,741 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "id": "10ec017cb658e125",
6
+ "metadata": {
7
+ "ExecuteTime": {
8
+ "end_time": "2025-06-11T00:21:33.244538Z",
9
+ "start_time": "2025-06-11T00:21:05.317283Z"
10
+ }
11
+ },
12
+ "source": [
13
+ "import pandas as pd\n",
14
+ "\n",
15
+ "splits = {'train': 'data/train-00000-of-00001-b0417886a268b83a.parquet', 'valid': 'data/valid-00000-of-00001-846411c236133ba3.parquet'}\n",
16
+ "df_train = pd.read_parquet(\"hf://datasets/datnth1709/VLSP2016-NER-data/\" + splits[\"train\"])\n",
17
+ "df_valid = pd.read_parquet(\"hf://datasets/datnth1709/VLSP2016-NER-data/\" + splits[\"valid\"])\n",
18
+ "df = pd.concat([df_train, df_valid]).reset_index(drop=True)"
19
+ ],
20
+ "outputs": [],
21
+ "execution_count": 1
22
+ },
23
+ {
24
+ "cell_type": "code",
25
+ "id": "c533c55a2ad7b16e",
26
+ "metadata": {
27
+ "ExecuteTime": {
28
+ "end_time": "2025-06-11T00:21:33.499341Z",
29
+ "start_time": "2025-06-11T00:21:33.262933Z"
30
+ }
31
+ },
32
+ "source": [
33
+ "# Tạo thêm các cột khác\n",
34
+ "def join_tokens(tokens):\n",
35
+ " text = ' '.join(tokens)\n",
36
+ " return text\n",
37
+ "\n",
38
+ "def reform_raw_text(tokens):\n",
39
+ " text = ' '.join(tokens)\n",
40
+ " return text.replace(\"_\", \" \")\n",
41
+ "\n",
42
+ "def label(x):\n",
43
+ " return [id_tag[int(i)] for i in x]\n",
44
+ "\n",
45
+ "def replace_7_8(lst):\n",
46
+ " return [0 if x in (7, 8) else x for x in lst]\n",
47
+ "\n",
48
+ "\n",
49
+ "tag_id = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6}\n",
50
+ "id_tag = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}\n",
51
+ "\n",
52
+ "\n",
53
+ "df['ner_tags'] = df['ner_tags'].apply(replace_7_8)\n",
54
+ "df['text_withseg'] = df['tokens'].apply(join_tokens)\n",
55
+ "df['text_raw'] = df['tokens'].apply(reform_raw_text)\n",
56
+ "df[\"ner_labels\"] = df.ner_tags.apply(label)\n",
57
+ "df.columns = ['tokens', 'id', 'seg_text', 'raw_text', 'labels']\n",
58
+ "df\n"
59
+ ],
60
+ "outputs": [
61
+ {
62
+ "data": {
63
+ "text/plain": [
64
+ " tokens \\\n",
65
+ "0 [Không_khí, thật, náo_nhiệt, .] \n",
66
+ "1 [Chị, Lãnh, và, Xăng, ra, đi, ,, mình, đứng, n... \n",
67
+ "2 [Suy_tính, mãi, ,, khóc, mãi, rồi, Phúc, lấy, ... \n",
68
+ "3 [Hoà, bảo, hồi, mới, qua, đâu, có, biết, nấu_n... \n",
69
+ "4 [Nhật_ký, của, thuyền_viên, .] \n",
70
+ "... ... \n",
71
+ "16853 [Nghe, thấy, đã, ghê_ghê, nhưng, Nhiêu, chưa, ... \n",
72
+ "16854 [Nhưng, mọi, chuyện, không, dừng, ở, đó, .] \n",
73
+ "16855 [Hoà, bảo, thời_gian, đầu, mặc_cảm, lắm, ,, ở,... \n",
74
+ "16856 [Biết_bao, người, đã, tình_nguyện, hiến_dâng, ... \n",
75
+ "16857 [Trên, đây, mới, là, “, thành_tích, ”, tiêu, t... \n",
76
+ "\n",
77
+ " id \\\n",
78
+ "0 [0, 0, 0, 0] \n",
79
+ "1 [0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n",
80
+ "2 [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ... \n",
81
+ "3 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, ... \n",
82
+ "4 [0, 0, 0, 0] \n",
83
+ "... ... \n",
84
+ "16853 [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, ... \n",
85
+ "16854 [0, 0, 0, 0, 0, 0, 0, 0] \n",
86
+ "16855 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n",
87
+ "16856 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] \n",
88
+ "16857 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n",
89
+ "\n",
90
+ " seg_text \\\n",
91
+ "0 Không_khí thật náo_nhiệt . \n",
92
+ "1 Chị Lãnh và Xăng ra đi , mình đứng nhìn hai ch... \n",
93
+ "2 Suy_tính mãi , khóc mãi rồi Phúc lấy ra tờ giấ... \n",
94
+ "3 Hoà bảo hồi mới qua đâu có biết nấu_nướng gì ,... \n",
95
+ "4 Nhật_ký của thuyền_viên . \n",
96
+ "... ... \n",
97
+ "16853 Nghe thấy đã ghê_ghê nhưng Nhiêu chưa được tườ... \n",
98
+ "16854 Nhưng mọi chuyện không dừng ở đó . \n",
99
+ "16855 Hoà bảo thời_gian đầu mặc_cảm lắm , ở trong nh... \n",
100
+ "16856 Biết_bao người đã tình_nguyện hiến_dâng cả cuộ... \n",
101
+ "16857 Trên đây mới là “ thành_tích ” tiêu tiền của m... \n",
102
+ "\n",
103
+ " raw_text \\\n",
104
+ "0 Không khí thật náo nhiệt . \n",
105
+ "1 Chị Lãnh và Xăng ra đi , mình đứng nhìn hai ch... \n",
106
+ "2 Suy tính mãi , khóc mãi rồi Phúc lấy ra tờ giấ... \n",
107
+ "3 Hoà bảo hồi mới qua đâu có biết nấu nướng gì ,... \n",
108
+ "4 Nhật ký của thuyền viên . \n",
109
+ "... ... \n",
110
+ "16853 Nghe thấy đã ghê ghê nhưng Nhiêu chưa được tườ... \n",
111
+ "16854 Nhưng mọi chuyện không dừng ở đó . \n",
112
+ "16855 Hoà bảo thời gian đầu mặc cảm lắm , ở trong nh... \n",
113
+ "16856 Biết bao người đã tình nguyện hiến dâng cả cuộ... \n",
114
+ "16857 Trên đây mới là “ thành tích ” tiêu tiền của m... \n",
115
+ "\n",
116
+ " labels \n",
117
+ "0 [O, O, O, O] \n",
118
+ "1 [O, B-PER, O, B-PER, O, O, O, O, O, O, O, O, O... \n",
119
+ "2 [O, O, O, O, O, O, B-PER, O, O, O, O, O, O, O,... \n",
120
+ "3 [B-PER, O, O, O, O, O, O, O, O, O, O, O, O, B-... \n",
121
+ "4 [O, O, O, O] \n",
122
+ "... ... \n",
123
+ "16853 [O, O, O, O, O, B-PER, O, O, O, O, O, O, O, O,... \n",
124
+ "16854 [O, O, O, O, O, O, O, O] \n",
125
+ "16855 [B-PER, O, O, O, O, O, O, O, O, O, O, O, O, O,... \n",
126
+ "16856 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O] \n",
127
+ "16857 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ... \n",
128
+ "\n",
129
+ "[16858 rows x 5 columns]"
130
+ ],
131
+ "text/html": [
132
+ "<div>\n",
133
+ "<style scoped>\n",
134
+ " .dataframe tbody tr th:only-of-type {\n",
135
+ " vertical-align: middle;\n",
136
+ " }\n",
137
+ "\n",
138
+ " .dataframe tbody tr th {\n",
139
+ " vertical-align: top;\n",
140
+ " }\n",
141
+ "\n",
142
+ " .dataframe thead th {\n",
143
+ " text-align: right;\n",
144
+ " }\n",
145
+ "</style>\n",
146
+ "<table border=\"1\" class=\"dataframe\">\n",
147
+ " <thead>\n",
148
+ " <tr style=\"text-align: right;\">\n",
149
+ " <th></th>\n",
150
+ " <th>tokens</th>\n",
151
+ " <th>id</th>\n",
152
+ " <th>seg_text</th>\n",
153
+ " <th>raw_text</th>\n",
154
+ " <th>labels</th>\n",
155
+ " </tr>\n",
156
+ " </thead>\n",
157
+ " <tbody>\n",
158
+ " <tr>\n",
159
+ " <th>0</th>\n",
160
+ " <td>[Không_khí, thật, náo_nhiệt, .]</td>\n",
161
+ " <td>[0, 0, 0, 0]</td>\n",
162
+ " <td>Không_khí thật náo_nhiệt .</td>\n",
163
+ " <td>Không khí thật náo nhiệt .</td>\n",
164
+ " <td>[O, O, O, O]</td>\n",
165
+ " </tr>\n",
166
+ " <tr>\n",
167
+ " <th>1</th>\n",
168
+ " <td>[Chị, Lãnh, và, Xăng, ra, đi, ,, mình, đứng, n...</td>\n",
169
+ " <td>[0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n",
170
+ " <td>Chị Lãnh và Xăng ra đi , mình đứng nhìn hai ch...</td>\n",
171
+ " <td>Chị Lãnh và Xăng ra đi , mình đứng nhìn hai ch...</td>\n",
172
+ " <td>[O, B-PER, O, B-PER, O, O, O, O, O, O, O, O, O...</td>\n",
173
+ " </tr>\n",
174
+ " <tr>\n",
175
+ " <th>2</th>\n",
176
+ " <td>[Suy_tính, mãi, ,, khóc, mãi, rồi, Phúc, lấy, ...</td>\n",
177
+ " <td>[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n",
178
+ " <td>Suy_tính mãi , khóc mãi rồi Phúc lấy ra tờ giấ...</td>\n",
179
+ " <td>Suy tính mãi , khóc mãi rồi Phúc lấy ra tờ giấ...</td>\n",
180
+ " <td>[O, O, O, O, O, O, B-PER, O, O, O, O, O, O, O,...</td>\n",
181
+ " </tr>\n",
182
+ " <tr>\n",
183
+ " <th>3</th>\n",
184
+ " <td>[Hoà, bảo, hồi, mới, qua, đâu, có, biết, nấu_n...</td>\n",
185
+ " <td>[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, ...</td>\n",
186
+ " <td>Hoà bảo hồi mới qua đâu có biết nấu_nướng gì ,...</td>\n",
187
+ " <td>Hoà bảo hồi mới qua đâu có biết nấu nướng gì ,...</td>\n",
188
+ " <td>[B-PER, O, O, O, O, O, O, O, O, O, O, O, O, B-...</td>\n",
189
+ " </tr>\n",
190
+ " <tr>\n",
191
+ " <th>4</th>\n",
192
+ " <td>[Nhật_ký, của, thuyền_viên, .]</td>\n",
193
+ " <td>[0, 0, 0, 0]</td>\n",
194
+ " <td>Nhật_ký của thuyền_viên .</td>\n",
195
+ " <td>Nhật ký của thuyền viên .</td>\n",
196
+ " <td>[O, O, O, O]</td>\n",
197
+ " </tr>\n",
198
+ " <tr>\n",
199
+ " <th>...</th>\n",
200
+ " <td>...</td>\n",
201
+ " <td>...</td>\n",
202
+ " <td>...</td>\n",
203
+ " <td>...</td>\n",
204
+ " <td>...</td>\n",
205
+ " </tr>\n",
206
+ " <tr>\n",
207
+ " <th>16853</th>\n",
208
+ " <td>[Nghe, thấy, đã, ghê_ghê, nhưng, Nhiêu, chưa, ...</td>\n",
209
+ " <td>[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...</td>\n",
210
+ " <td>Nghe thấy đã ghê_ghê nhưng Nhiêu chưa được tườ...</td>\n",
211
+ " <td>Nghe thấy đã ghê ghê nhưng Nhiêu chưa được tườ...</td>\n",
212
+ " <td>[O, O, O, O, O, B-PER, O, O, O, O, O, O, O, O,...</td>\n",
213
+ " </tr>\n",
214
+ " <tr>\n",
215
+ " <th>16854</th>\n",
216
+ " <td>[Nhưng, mọi, chuyện, không, dừng, ở, đó, .]</td>\n",
217
+ " <td>[0, 0, 0, 0, 0, 0, 0, 0]</td>\n",
218
+ " <td>Nhưng mọi chuyện không dừng ở đó .</td>\n",
219
+ " <td>Nhưng mọi chuyện không dừng ở đó .</td>\n",
220
+ " <td>[O, O, O, O, O, O, O, O]</td>\n",
221
+ " </tr>\n",
222
+ " <tr>\n",
223
+ " <th>16855</th>\n",
224
+ " <td>[Hoà, bảo, thời_gian, đầu, mặc_cảm, lắm, ,, ở,...</td>\n",
225
+ " <td>[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n",
226
+ " <td>Hoà bảo thời_gian đầu mặc_cảm lắm , ở trong nh...</td>\n",
227
+ " <td>Hoà bảo thời gian đầu mặc cảm lắm , ở trong nh...</td>\n",
228
+ " <td>[B-PER, O, O, O, O, O, O, O, O, O, O, O, O, O,...</td>\n",
229
+ " </tr>\n",
230
+ " <tr>\n",
231
+ " <th>16856</th>\n",
232
+ " <td>[Biết_bao, người, đã, tình_nguyện, hiến_dâng, ...</td>\n",
233
+ " <td>[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]</td>\n",
234
+ " <td>Biết_bao người đã tình_nguyện hiến_dâng cả cuộ...</td>\n",
235
+ " <td>Biết bao người đã tình nguyện hiến dâng cả cuộ...</td>\n",
236
+ " <td>[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]</td>\n",
237
+ " </tr>\n",
238
+ " <tr>\n",
239
+ " <th>16857</th>\n",
240
+ " <td>[Trên, đây, mới, là, “, thành_tích, ”, tiêu, t...</td>\n",
241
+ " <td>[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n",
242
+ " <td>Trên đây mới là “ thành_tích ” tiêu tiền của m...</td>\n",
243
+ " <td>Trên đây mới là “ thành tích ” tiêu tiền của m...</td>\n",
244
+ " <td>[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...</td>\n",
245
+ " </tr>\n",
246
+ " </tbody>\n",
247
+ "</table>\n",
248
+ "<p>16858 rows × 5 columns</p>\n",
249
+ "</div>"
250
+ ]
251
+ },
252
+ "execution_count": 2,
253
+ "metadata": {},
254
+ "output_type": "execute_result"
255
+ }
256
+ ],
257
+ "execution_count": 2
258
+ },
259
+ {
260
+ "cell_type": "code",
261
+ "id": "14d9b9fae58b7173",
262
+ "metadata": {
263
+ "ExecuteTime": {
264
+ "end_time": "2025-06-11T00:21:59.373985Z",
265
+ "start_time": "2025-06-11T00:21:34.524025Z"
266
+ }
267
+ },
268
+ "source": [
269
+ "import torch\n",
270
+ "from transformers import AutoTokenizer, AutoModel\n",
271
+ "from tqdm import tqdm\n",
272
+ "\n",
273
+ "# Load PhoBERT tokenizer và model\n",
274
+ "tokenizer = AutoTokenizer.from_pretrained(\"vinai/phobert-base\", use_fast=False)\n",
275
+ "model = AutoModel.from_pretrained(\"vinai/phobert-base\")\n",
276
+ "model.eval()"
277
+ ],
278
+ "outputs": [
279
+ {
280
+ "name": "stdout",
281
+ "output_type": "stream",
282
+ "text": [
283
+ "cuda\n"
284
+ ]
285
+ },
286
+ {
287
+ "data": {
288
+ "text/plain": [
289
+ "RobertaModel(\n",
290
+ " (embeddings): RobertaEmbeddings(\n",
291
+ " (word_embeddings): Embedding(64001, 768, padding_idx=1)\n",
292
+ " (position_embeddings): Embedding(258, 768, padding_idx=1)\n",
293
+ " (token_type_embeddings): Embedding(1, 768)\n",
294
+ " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
295
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
296
+ " )\n",
297
+ " (encoder): RobertaEncoder(\n",
298
+ " (layer): ModuleList(\n",
299
+ " (0-11): 12 x RobertaLayer(\n",
300
+ " (attention): RobertaAttention(\n",
301
+ " (self): RobertaSdpaSelfAttention(\n",
302
+ " (query): Linear(in_features=768, out_features=768, bias=True)\n",
303
+ " (key): Linear(in_features=768, out_features=768, bias=True)\n",
304
+ " (value): Linear(in_features=768, out_features=768, bias=True)\n",
305
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
306
+ " )\n",
307
+ " (output): RobertaSelfOutput(\n",
308
+ " (dense): Linear(in_features=768, out_features=768, bias=True)\n",
309
+ " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
310
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
311
+ " )\n",
312
+ " )\n",
313
+ " (intermediate): RobertaIntermediate(\n",
314
+ " (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
315
+ " (intermediate_act_fn): GELUActivation()\n",
316
+ " )\n",
317
+ " (output): RobertaOutput(\n",
318
+ " (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
319
+ " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
320
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
321
+ " )\n",
322
+ " )\n",
323
+ " )\n",
324
+ " )\n",
325
+ " (pooler): RobertaPooler(\n",
326
+ " (dense): Linear(in_features=768, out_features=768, bias=True)\n",
327
+ " (activation): Tanh()\n",
328
+ " )\n",
329
+ ")"
330
+ ]
331
+ },
332
+ "execution_count": 3,
333
+ "metadata": {},
334
+ "output_type": "execute_result"
335
+ }
336
+ ],
337
+ "execution_count": 3
338
+ },
339
+ {
340
+ "cell_type": "code",
341
+ "id": "a47ec382649c3036",
342
+ "metadata": {
343
+ "ExecuteTime": {
344
+ "end_time": "2025-06-11T00:23:23.888583Z",
345
+ "start_time": "2025-06-11T00:23:23.885204Z"
346
+ }
347
+ },
348
+ "source": [
349
+ "# Hàm gộp các embedding vectors của token bị tách ra khi qua SentencePiece\n",
350
+ "def group_embeddings(tokens, embeddings):\n",
351
+ " word_embeddings = []\n",
352
+ " current_vecs = []\n",
353
+ "\n",
354
+ " for token, emb in zip(tokens, embeddings):\n",
355
+ " if token in [\"<s>\", \"</s>\"]:\n",
356
+ " continue\n",
357
+ "\n",
358
+ " if token.endswith(\"@@\"):\n",
359
+ " current_vecs.append(emb)\n",
360
+ " else:\n",
361
+ " current_vecs.append(emb)\n",
362
+ " word_emb = torch.mean(torch.stack(current_vecs), dim=0)\n",
363
+ " word_embeddings.append(word_emb)\n",
364
+ " current_vecs = []\n",
365
+ "\n",
366
+ " if current_vecs: # Trong trường hợp sót lại cuối câu\n",
367
+ " word_emb = torch.mean(torch.stack(current_vecs), dim=0)\n",
368
+ " word_embeddings.append(word_emb)\n",
369
+ "\n",
370
+ " return word_embeddings"
371
+ ],
372
+ "outputs": [],
373
+ "execution_count": 4
374
+ },
375
+ {
376
+ "cell_type": "code",
377
+ "id": "f8c0ad89ae81b0c",
378
+ "metadata": {
379
+ "ExecuteTime": {
380
+ "end_time": "2025-06-11T00:25:52.567135Z",
381
+ "start_time": "2025-06-11T00:23:56.155322Z"
382
+ }
383
+ },
384
+ "source": [
385
+ "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
386
+ "model.to(device)\n",
387
+ "\n",
388
+ "all_embeddings = [] # list of [seq_len_i, 768] tensors\n",
389
+ "all_labels = [] # list of [seq_len_i,] tensors\n",
390
+ "len_em = []\n",
391
+ "\n",
392
+ "# count = 0\n",
393
+ "\n",
394
+ "for i, row in df.iterrows():\n",
395
+ "\n",
396
+ " # count += 1\n",
397
+ " # if count == 500:\n",
398
+ " # break\n",
399
+ "\n",
400
+ " # Truy cập phần tử từng dòng\n",
401
+ " sentence = row['seg_text']\n",
402
+ " gold_labels = row[\"id\"]\n",
403
+ "\n",
404
+ " # Cho sentence đi qua SentencePiece\n",
405
+ " input_ids = tokenizer.encode(sentence, return_tensors=\"pt\").to(device)\n",
406
+ "\n",
407
+ " tokens = tokenizer.convert_ids_to_tokens(input_ids[0].to(device))\n",
408
+ "\n",
409
+ " # Encode tạo embeddings\n",
410
+ " with torch.no_grad():\n",
411
+ " outputs = model(input_ids)\n",
412
+ " last_hidden_state = outputs.last_hidden_state.squeeze(0)\n",
413
+ "\n",
414
+ " # Gộp các embeddings đã bị tách khi đi qua SentencePiece\n",
415
+ " word_embeds = group_embeddings(tokens, last_hidden_state)\n",
416
+ "\n",
417
+ " # Kiểm tra số lượng embeddings và số lượng labels\n",
418
+ " if len(word_embeds) != len(gold_labels):\n",
419
+ " continue\n",
420
+ "\n",
421
+ " # Thêm vào list tổng / Tới đây là data đã sẵn sàng cho training\n",
422
+ " all_embeddings.append(torch.stack(word_embeds))\n",
423
+ " all_labels.append(torch.tensor(gold_labels))"
424
+ ],
425
+ "outputs": [],
426
+ "execution_count": 6
427
+ },
428
+ {
429
+ "metadata": {
430
+ "ExecuteTime": {
431
+ "end_time": "2025-06-11T00:35:23.255306Z",
432
+ "start_time": "2025-06-11T00:35:23.252026Z"
433
+ }
434
+ },
435
+ "cell_type": "code",
436
+ "source": "# We skip 43 data since they aren't convertable",
437
+ "id": "c3e406ad994802be",
438
+ "outputs": [
439
+ {
440
+ "name": "stdout",
441
+ "output_type": "stream",
442
+ "text": [
443
+ "-43\n"
444
+ ]
445
+ }
446
+ ],
447
+ "execution_count": 15
448
+ },
449
+ {
450
+ "cell_type": "code",
451
+ "id": "cadc3a861025b3b9",
452
+ "metadata": {
453
+ "ExecuteTime": {
454
+ "end_time": "2025-06-11T00:36:18.857012Z",
455
+ "start_time": "2025-06-11T00:36:08.257408Z"
456
+ }
457
+ },
458
+ "source": [
459
+ "import numpy as np\n",
460
+ "from sklearn.model_selection import train_test_split\n",
461
+ "\n",
462
+ "X_flat = []\n",
463
+ "y_flat = []\n",
464
+ "\n",
465
+ "for emb_seq, label_seq in zip(all_embeddings, all_labels):\n",
466
+ " for emb, label in zip(emb_seq, label_seq):\n",
467
+ " X_flat.append(emb.cpu().numpy()) # emb: [768]\n",
468
+ " y_flat.append(label.item()) # label: int\n",
469
+ "\n",
470
+ "X_flat = np.array(X_flat) # [N, 768]\n",
471
+ "y_flat = np.array(y_flat) # [N]\n"
472
+ ],
473
+ "outputs": [],
474
+ "execution_count": 16
475
+ },
476
+ {
477
+ "cell_type": "code",
478
+ "id": "52a0fe72a50d4f73",
479
+ "metadata": {
480
+ "ExecuteTime": {
481
+ "end_time": "2025-06-11T00:39:58.211159Z",
482
+ "start_time": "2025-06-11T00:39:58.208074Z"
483
+ }
484
+ },
485
+ "source": [
486
+ "print(X_flat[0].shape)\n",
487
+ "print(y_flat.shape)"
488
+ ],
489
+ "outputs": [
490
+ {
491
+ "name": "stdout",
492
+ "output_type": "stream",
493
+ "text": [
494
+ "(768,)\n",
495
+ "(368172,)\n"
496
+ ]
497
+ }
498
+ ],
499
+ "execution_count": 19
500
+ },
501
+ {
502
+ "cell_type": "code",
503
+ "id": "d6275df555f0c4c3",
504
+ "metadata": {
505
+ "ExecuteTime": {
506
+ "end_time": "2025-06-11T00:42:00.129778Z",
507
+ "start_time": "2025-06-11T00:42:00.096986Z"
508
+ }
509
+ },
510
+ "source": [
511
+ "# Kiểm tra độ lệch data\n",
512
+ "unique_values, counts = np.unique(y_flat, return_counts=True)\n",
513
+ "\n",
514
+ "# In ra từng giá trị và số lần xuất hiện\n",
515
+ "for val, count in zip(unique_values, counts):\n",
516
+ " print(f\"Label {val}: {count} times\")\n"
517
+ ],
518
+ "outputs": [
519
+ {
520
+ "name": "stdout",
521
+ "output_type": "stream",
522
+ "text": [
523
+ "Label 0: 344986 times\n",
524
+ "Label 1: 7450 times\n",
525
+ "Label 2: 3504 times\n",
526
+ "Label 3: 1204 times\n",
527
+ "Label 4: 2050 times\n",
528
+ "Label 5: 6211 times\n",
529
+ "Label 6: 2767 times\n"
530
+ ]
531
+ }
532
+ ],
533
+ "execution_count": 24
534
+ },
535
+ {
536
+ "cell_type": "code",
537
+ "id": "664020977ba9a1e2",
538
+ "metadata": {
539
+ "ExecuteTime": {
540
+ "end_time": "2025-06-11T00:42:03.350616Z",
541
+ "start_time": "2025-06-11T00:42:02.915680Z"
542
+ }
543
+ },
544
+ "source": [
545
+ "X_train, X_test, y_train, y_test = train_test_split(\n",
546
+ " X_flat, y_flat, test_size=0.2, random_state=42, stratify=y_flat)\n"
547
+ ],
548
+ "outputs": [],
549
+ "execution_count": 25
550
+ },
551
+ {
552
+ "cell_type": "code",
553
+ "id": "d4acda9c7cae3214",
554
+ "metadata": {
555
+ "ExecuteTime": {
556
+ "end_time": "2025-06-11T00:42:25.235471Z",
557
+ "start_time": "2025-06-11T00:42:16.769480Z"
558
+ }
559
+ },
560
+ "source": [
561
+ "import lightgbm as lgb\n",
562
+ "from sklearn.metrics import accuracy_score, f1_score, classification_report\n",
563
+ "\n",
564
+ "\n",
565
+ "# Tạo Dataset cho LightGBM\n",
566
+ "train_data = lgb.Dataset(X_train, label=y_train)\n",
567
+ "test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)\n",
568
+ "\n",
569
+ "# Cấu hình tham số LightGBM (Random Forest mode)\n",
570
+ "params = {\n",
571
+ " \"objective\": \"multiclass\", # nếu multiclass classification\n",
572
+ " \"num_class\": len(np.unique(y_train)),\n",
573
+ " \"metric\": \"multi_logloss\",\n",
574
+ " \"boosting_type\": \"rf\", # random forest mode trong LightGBM\n",
575
+ " \"num_leaves\": 31,\n",
576
+ " \"bagging_freq\": 1,\n",
577
+ " \"bagging_fraction\": 0.8,\n",
578
+ " \"feature_fraction\": 0.8,\n",
579
+ " \"bagging_seed\": 42,\n",
580
+ " \"verbose\": -1,\n",
581
+ " \"seed\": 42,\n",
582
+ " \"is_unbalance\": True\n",
583
+ "}\n",
584
+ "\n",
585
+ "\n",
586
+ "\n",
587
+ "# Train model, tích hợp wandb callback để log metrics\n",
588
+ "model = lgb.train(\n",
589
+ " params,\n",
590
+ " train_data,\n",
591
+ " num_boost_round=2,\n",
592
+ " valid_sets=[train_data, test_data],\n",
593
+ " valid_names=[\"train\", \"test\"]\n",
594
+ ")\n",
595
+ "\n",
596
+ "# Dự đoán trên test set\n",
597
+ "y_pred_prob = model.predict(X_test)\n",
598
+ "y_pred = np.argmax(y_pred_prob, axis=1)\n",
599
+ "\n",
600
+ "# Ánh xạ số về nhãn tên entity\n",
601
+ "label_map = {\n",
602
+ " 0: 'O',\n",
603
+ " 1: 'B-PER',\n",
604
+ " 2: 'I-PER',\n",
605
+ " 3: 'B-ORG',\n",
606
+ " 4: 'I-ORG',\n",
607
+ " 5: 'B-LOC',\n",
608
+ " 6: 'I-LOC'\n",
609
+ "}\n",
610
+ "\n",
611
+ "# Chuyển y_test và y_pred sang nhãn gốc\n",
612
+ "y_test_labels = [label_map[i] for i in y_test]\n",
613
+ "y_pred_labels = [label_map[i] for i in y_pred]\n",
614
+ "\n",
615
+ "# In classification report với nhãn thật\n",
616
+ "print(\"\\nClassification Report (theo label gốc):\")\n",
617
+ "print(classification_report(y_test_labels, y_pred_labels, digits=4))\n",
618
+ "\n",
619
+ "\n"
620
+ ],
621
+ "outputs": [
622
+ {
623
+ "name": "stdout",
624
+ "output_type": "stream",
625
+ "text": [
626
+ "\n",
627
+ "Classification Report (theo label gốc):\n",
628
+ " precision recall f1-score support\n",
629
+ "\n",
630
+ " B-LOC 0.3679 0.5000 0.4239 1242\n",
631
+ " B-ORG 0.2639 0.3942 0.3161 241\n",
632
+ " B-PER 0.4395 0.7490 0.5540 1490\n",
633
+ " I-LOC 0.2321 0.4448 0.3050 553\n",
634
+ " I-ORG 0.1532 0.2878 0.2000 410\n",
635
+ " I-PER 0.4304 0.5863 0.4964 701\n",
636
+ " O 0.9869 0.9478 0.9669 68998\n",
637
+ "\n",
638
+ " accuracy 0.9235 73635\n",
639
+ " macro avg 0.4106 0.5586 0.4660 73635\n",
640
+ "weighted avg 0.9474 0.9235 0.9336 73635\n",
641
+ "\n"
642
+ ]
643
+ }
644
+ ],
645
+ "execution_count": 26
646
+ },
647
+ {
648
+ "metadata": {
649
+ "ExecuteTime": {
650
+ "end_time": "2025-06-11T00:45:00.649942Z",
651
+ "start_time": "2025-06-11T00:45:00.646595Z"
652
+ }
653
+ },
654
+ "cell_type": "code",
655
+ "source": "print(model.feature_importance().shape)",
656
+ "id": "b1cf76bc3e58bc93",
657
+ "outputs": [
658
+ {
659
+ "name": "stdout",
660
+ "output_type": "stream",
661
+ "text": [
662
+ "(768,)\n"
663
+ ]
664
+ }
665
+ ],
666
+ "execution_count": 35
667
+ },
668
+ {
669
+ "metadata": {
670
+ "ExecuteTime": {
671
+ "end_time": "2025-06-11T00:52:36.844604Z",
672
+ "start_time": "2025-06-11T00:52:36.827018Z"
673
+ }
674
+ },
675
+ "cell_type": "code",
676
+ "source": [
677
+ "correct = 0\n",
678
+ "for i in range(73635):\n",
679
+ " if y_pred[i] == y_test[i]:\n",
680
+ " correct += 1\n",
681
+ "correct"
682
+ ],
683
+ "id": "39d391e67a51211c",
684
+ "outputs": [
685
+ {
686
+ "data": {
687
+ "text/plain": [
688
+ "68001"
689
+ ]
690
+ },
691
+ "execution_count": 58,
692
+ "metadata": {},
693
+ "output_type": "execute_result"
694
+ }
695
+ ],
696
+ "execution_count": 58
697
+ },
698
+ {
699
+ "metadata": {
700
+ "ExecuteTime": {
701
+ "end_time": "2025-06-11T00:57:45.109129Z",
702
+ "start_time": "2025-06-11T00:57:45.105078Z"
703
+ }
704
+ },
705
+ "cell_type": "code",
706
+ "source": "print(y_test.shape)",
707
+ "id": "1a0ba8f0410c5589",
708
+ "outputs": [
709
+ {
710
+ "name": "stdout",
711
+ "output_type": "stream",
712
+ "text": [
713
+ "(73635,)\n"
714
+ ]
715
+ }
716
+ ],
717
+ "execution_count": 61
718
+ }
719
+ ],
720
+ "metadata": {
721
+ "kernelspec": {
722
+ "display_name": "Python 3",
723
+ "language": "python",
724
+ "name": "python3"
725
+ },
726
+ "language_info": {
727
+ "codemirror_mode": {
728
+ "name": "ipython",
729
+ "version": 2
730
+ },
731
+ "file_extension": ".py",
732
+ "mimetype": "text/x-python",
733
+ "name": "python",
734
+ "nbconvert_exporter": "python",
735
+ "pygments_lexer": "ipython2",
736
+ "version": "2.7.6"
737
+ }
738
+ },
739
+ "nbformat": 4,
740
+ "nbformat_minor": 5
741
+ }
space/space/space/space/space/space/notebooks/Kien_Rule_base.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
space/space/space/space/space/space/notebooks/Softmax_PhoBERT.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
space/space/space/space/space/space/requirements.txt ADDED
Binary file (2.43 kB). View file
 
space/space/space/space/space/space/run.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.preprocessing import download_raw_data, preprocess_data_for_EDA, load_phoBERT_model_and_tokenizer, create_embeddings, split_dataset
2
+ from src.data_set import NerDataset, collate_fn
3
+ from src.configs import configs
4
+ from src.model import CRF_Tagger
5
+ from src.train import train_model
6
+
7
+ import torch
8
+ from torch.utils.data import DataLoader
9
+
10
+ import warnings
11
+ warnings.filterwarnings("ignore")
12
+
13
+
14
+ def main():
15
+
16
+ # Download VLSP2016 from hgface
17
+ print("Download raw data ...")
18
+ df = download_raw_data()
19
+
20
+ # Save raw data
21
+ df.to_csv(r".\data\raw_data.csv", index=False)
22
+ print("Save at data\raw_data.csv \n")
23
+
24
+ # Process data for EDA
25
+ print("Process data for EDA ...")
26
+ df = preprocess_data_for_EDA(df)
27
+ df.to_csv(r".\data\processed_data_EDA.csv", index=False)
28
+ print("Save at data\processed_data_EDA.csv \n")
29
+
30
+ # Init PhoBERT Tokenizer and PhoBERT Model
31
+ print("Embedding data ...")
32
+ model, tokenizer = load_phoBERT_model_and_tokenizer()
33
+
34
+ # Embeddings data
35
+ processed_data = create_embeddings(df, model, tokenizer)
36
+ torch.save(processed_data, r".\data\processed_data_full.pt")
37
+ print("Save at data\processed_data_full.pt \n")
38
+
39
+ # Split data into train/valid/test
40
+ print("Train/Valid/Test Split ...")
41
+ X_train, Y_train, X_val, Y_val, X_test, Y_test = split_dataset(processed_data)
42
+ print("Done \n")
43
+
44
+ # Data Agumentation for training set
45
+ # Pass
46
+
47
+ # Init DataLoader
48
+ print("Init DataLoader ...")
49
+ datasets = {
50
+ 'train': NerDataset(X_train, Y_train),
51
+ 'val': NerDataset(X_val, Y_val),
52
+ 'test': NerDataset(X_test, Y_test)
53
+ }
54
+
55
+ loaders = {
56
+ split: DataLoader(dataset, batch_size=configs["batch_size"], shuffle=(split=='train'), collate_fn=collate_fn)
57
+ for split, dataset in datasets.items()
58
+ }
59
+ print("Done \n")
60
+
61
+ # Init sequence label model
62
+ print("Init Model ...")
63
+ NUM_TAGS = 7
64
+ model = CRF_Tagger(input_dim=X_train[0].size(1), num_tags=NUM_TAGS)
65
+ optimizer = torch.optim.Adam(model.parameters(), lr=configs["learning_rate"])
66
+ print("Done \n")
67
+
68
+ # Training Model
69
+ print("Start training ...")
70
+ train_model(model, optimizer, configs, loaders)
71
+
72
+ if __name__ == "__main__":
73
+ main()
space/space/space/space/space/space/space/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
space/space/space/space/space/space/space/README.md ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Vietnamese NER Demo
3
+ emoji: 🧠
4
+ colorFrom: indigo
5
+ colorTo: yellow
6
+ sdk: streamlit
7
+ sdk_version: 1.46.1
8
+ app_file: src/app.py
9
+ pinned: false
10
+ ---
11
+ # Vietnamese Named Entity Recognition
12
+
13
+ ## 🛠️ Set Up Your Environment With Conda
14
+
15
+ ### Option 1: Using `requirements.txt`
16
+
17
+ ```bash
18
+ conda create --name vnner python=3.10
19
+ conda activate vnner
20
+ pip install -r requirements.txt
21
+ ```
22
+
23
+ ### Option 2: Using `environment.yml`
24
+
25
+ ```bash
26
+ conda env create -f environment.yml
27
+ conda activate vnner
28
+ ```
29
+
30
+ ## Run
31
+ ```bash
32
+ python run.py
33
+ ```
34
+ ---
35
+
36
+ ## 📂 Project Structure
37
+
38
+ ```
39
+ my_ai_project/
40
+
41
+ ├── data/
42
+ │ ├── raw_data.csv # Dữ liệu gốc
43
+ │ ├── processed_data_EDA.csv # Dữ liệu sau khi tiền xử lý
44
+ │ └── processed_data_full.csv # Dữ liệu sẵn sàng training
45
+
46
+ ├── notebooks/ # Thử nghiệm và khám phá dữ liệu
47
+ │ ├── Duc_Notebook.ipynb # CRF + RandomForest
48
+ │ ├── Softmax_PhoBERT.ipynb # Softmax
49
+
50
+ ├── src/ # Mã nguồn chính của dự án
51
+ │ ├── __init__.py
52
+ │ ├── data_loader.py # Nạp và xử lý dữ liệu
53
+ │ ├── preprocessing.py # Hàm tiền xử lý dữ liệu
54
+ │ ├── model.py # Định nghĩa kiến trúc mô hình
55
+ │ ├── train.py # Huấn luyện mô hình
56
+ │ ├── evaluate.py # Đánh giá mô hình
57
+ │ └── predict.py # Dự đoán với mô hình đã huấn luyện
58
+
59
+ ├── models/ # Mô hình đã lưu sau khi huấn luyện
60
+ │ └── best_model.pth # File trọng số mô hình
61
+
62
+ ├── outputs/ # Kết quả, biểu đồ, log, metrics
63
+ │ ├── logs/ # Nhật ký huấn luyện (tensorboard/logging)
64
+ │ └── figures/ # Biểu đồ trực quan hóa
65
+
66
+ ├── configs/ # File cấu hình cho mô hình, huấn luyện
67
+ │ └── config.yaml
68
+
69
+ ├── tests/ # Unit test cho các hàm chính
70
+
71
+ ├── requirements.txt # Thư viện cần cài đặt
72
+ ├── environment.yml # Môi trường Conda
73
+ ├── README.md # Giới thiệu dự án
74
+ └── run.py # Script chính để chạy toàn bộ pipeline
75
+ ```
76
+
77
+ ---
78
+
79
+ ## 📚 Additional Resources (Optional)
80
+
81
+ If you have any questions about the project structure, consider reading these helpful articles first:
82
+
83
+ * [Understanding `__init__.py`](https://zetcode.com/python/init-file/)
84
+ * [Markdown Basic Syntax](https://www.markdownguide.org/basic-syntax/#escaping-characters)
85
+ * [Difference Between `requirements.txt` and `environment.yml`](https://www.reddit.com/r/learnpython/comments/xvlpdz/why_do_people_provide_a_requirementstxt_or/)
86
+
87
+ These resources could be useful for you!
space/space/space/space/space/space/src/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ """Marks the directory as a Python package."""
2
+ __version__ = "1.0.0"
3
+ __author__ = "Duc Lai"
4
+ PACKAGE_NAME = "src"
space/space/space/space/space/space/src/app.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ from src.predict import predict_demo
4
+ from src.front import render_html
5
+
6
+ st.set_page_config(page_title="Vietnamese NER", layout="wide")
7
+
8
+ # ===== Tiêu đề chính =====
9
+ st.title("🔍 Ứng dụng nhận diện thực thể có tên (NER) cho tiếng Việt")
10
+
11
+ # Tabs
12
+ tab1, tab2, tab3 = st.tabs(["📊 Phân tích dữ liệu", "📈 Kết quả huấn luyện", "🧪 Demo mô hình"])
13
+
14
+ # --- Tab 1: PHÂN TÍCH DỮ LIỆU ---
15
+ with tab1:
16
+ st.header("📊 Phân tích dữ liệu")
17
+
18
+ df = pd.DataFrame({
19
+ "Loại thực thể": ["PER", "LOC", "ORG", "MISC"],
20
+ "Số lượng": [3200, 2500, 1800, 900]
21
+ })
22
+
23
+ st.bar_chart(df.set_index("Loại thực thể"))
24
+
25
+ # --- Tab 2: KẾT QUẢ HUẤN LUYỆN ---
26
+ with tab2:
27
+ st.header("📈 Kết quả huấn luyện")
28
+
29
+ loss = [0.9, 0.7, 0.5, 0.35, 0.28]
30
+ epoch = [1, 2, 3, 4, 5]
31
+ df_loss = pd.DataFrame({"Epoch": epoch, "Loss": loss})
32
+ st.line_chart(df_loss.set_index("Epoch"))
33
+
34
+ st.subheader("Đánh giá mô hình")
35
+ df_eval = pd.DataFrame({
36
+ "Phiên bản": ["v1", "v2", "v3"],
37
+ "F1-score": [0.78, 0.83, 0.86],
38
+ "Accuracy": [0.81, 0.85, 0.88]
39
+ })
40
+ st.dataframe(df_eval)
41
+
42
+ # --- Tab 3: DEMO MÔ HÌNH ---
43
+ with tab3:
44
+ st.header("🧪 Vietnamese Named Entity Recognition")
45
+
46
+ text = st.text_input("Nhập văn bản tiếng Việt:", "Nguyễn Văn A đang làm việc tại Hà Nội")
47
+
48
+ if st.button("Phân tích"):
49
+ if not text.strip():
50
+ st.warning("Vui lòng nhập văn bản!")
51
+ else:
52
+ tokens, labels = predict_demo(text)
53
+
54
+ st.subheader("Thực thể được phát hiện")
55
+ entities = [(tok, lab) for tok, lab in zip(tokens, labels) if lab != "O"]
56
+
57
+ if entities:
58
+ for tok, lab in entities:
59
+ st.markdown(f"🔹 **{tok}** — *{lab}*")
60
+ else:
61
+ st.info("Không phát hiện thực thể.")
62
+
63
+ st.subheader("Highlight trong văn bản:")
64
+ st.markdown(render_html(tokens, labels), unsafe_allow_html=True)
space/space/space/space/space/space/src/configs.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ configs = {
2
+ # Init
3
+ "project": "NER",
4
+ "name": "CRF_VLSP2016_Ultra",
5
+ "model": "Linear/CRF",
6
+
7
+ # Hyperparameters
8
+ "optim": "Adam",
9
+ "learning_rate": 1e-3,
10
+ "batch_size": 16,
11
+ "epochs": 20,
12
+ "train_ratio": 0.7,
13
+ "val_ratio": 0.15,
14
+ "test_ratio": 0.15
15
+ }
space/space/space/space/space/space/src/data_set.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from torch.utils.data import Dataset
2
+ import torch
3
+
4
+ class NerDataset(Dataset):
5
+ def __init__(self, embeddings, labels):
6
+ super().__init__()
7
+ self.embeddings = embeddings
8
+ self.labels = labels
9
+
10
+ def __len__(self):
11
+ return len(self.embeddings)
12
+
13
+ def __getitem__(self, idx):
14
+ return self.embeddings[idx], self.labels[idx]
15
+
16
+ def collate_fn(batch): # Batch_size x Seq_length x 768
17
+ embeddings, labels = zip(*batch)
18
+ lengths = [e.size(0) for e in embeddings]
19
+ max_len = max(lengths)
20
+
21
+ padded_embs = torch.stack([
22
+ torch.cat([e, torch.zeros(max_len - e.size(0), e.size(1))]) for e in embeddings
23
+ ])
24
+
25
+ padded_labels = torch.stack([
26
+ torch.cat([l, torch.full((max_len - l.size(0),), -1, dtype=torch.long)]) for l in labels
27
+ ])
28
+
29
+ return padded_embs, padded_labels, lengths
30
+
31
+
space/space/space/space/space/space/src/evaluate.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.predict import predict
2
+ from sklearn.metrics import precision_recall_fscore_support, accuracy_score, classification_report
3
+
4
+ def evaluate(model, loader, count_loss=True, report=False):
5
+
6
+ # Model Preidction (Inference)
7
+ all_preds, all_true, loss = predict(model, loader, count_loss)
8
+ class_report = None
9
+
10
+ # Get evaluation metric
11
+ precision, recall, f1, _ = precision_recall_fscore_support(all_true, all_preds, average='macro', zero_division=0)
12
+ acc = accuracy_score(all_true, all_preds)
13
+
14
+ # Get classification report
15
+ if report:
16
+ class_report = classification_report(all_true, all_preds)
17
+
18
+ return precision, recall, f1, acc, loss, class_report
19
+
20
+ def evaluate_ignore_O(model, loader):
21
+ pass
space/space/space/space/space/space/src/front.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def render_html(tokens, labels):
    """Render tokens as HTML with IOB entity spans highlighted.

    PER/ORG/LOC spans each get their own background colour; any other entity
    type falls back to light gray.
    """
    label_colors = {
        "PER": "lightcoral",   # soft red
        "ORG": "lightblue",    # soft blue
        "LOC": "lightgreen",   # soft green
    }

    parts = []
    current_label = None

    for tok, label in zip(tokens, labels):
        if label.startswith("B-"):
            # Close any span still open before opening a new one.
            if current_label:
                parts.append("</span> ")
            current_label = label[2:]
            color = label_colors.get(current_label, "lightgray")
            parts.append(
                f"<span style='background-color:{color};padding:2px;border-radius:4px;' title='{current_label}'>{tok}"
            )
        elif label.startswith("I-") and current_label:
            # Continuation of the currently open entity span.
            parts.append(f" {tok}")
        else:
            # Plain token; close an open span if there is one.
            if current_label:
                parts.append("</span> ")
                current_label = None
            parts.append(f"{tok} ")

    if current_label:
        parts.append("</span>")

    html = "".join(parts)
    return f"<div style='font-family:monospace;font-size:16px'>{html.strip()}</div>"
space/space/space/space/space/space/src/model.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from torchcrf import CRF
2
+ import torch.nn as nn
3
+
4
class CRF_Tagger(nn.Module):
    """Linear projection from contextual embeddings to tag emissions + a CRF.

    forward() returns the negative mean log-likelihood (a minimizable loss);
    decode() returns the Viterbi-best tag sequence for each sentence.
    Attribute names (embed2tag, crf) are part of the checkpoint state_dict
    and must not be renamed.
    """

    def __init__(self, input_dim, num_tags):
        super().__init__()
        self.embed2tag = nn.Linear(input_dim, num_tags)
        self.crf = CRF(num_tags, batch_first=True)

    def forward(self, x, labels, mask):
        # CRF returns a log-likelihood; negate it to obtain a loss.
        return -self.crf(self.embed2tag(x), labels, mask=mask, reduction="mean")

    def decode(self, x, mask=None):
        return self.crf.decode(self.embed2tag(x), mask)
space/space/space/space/space/space/src/predict.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from src.model import CRF_Tagger
3
+ from src.preprocessing import process_demo_sentence
4
+
5
def predict(model, loader, count_loss=True):
    """Run the model over `loader` and collect flat prediction/gold lists.

    Padding positions (label == -1) are excluded via the mask. Returns
    (all_preds, all_true, avg_loss); avg_loss stays 0.0 when count_loss
    is False.
    """
    model.eval()  # inference mode: disables dropout, batchnorm updates, ...
    all_preds, all_true = [], []
    total_loss = 0.0

    with torch.no_grad():  # no gradient tracking needed for inference
        for x, y, _ in loader:
            mask = (y != -1)

            # Accumulate the batch loss if requested
            if count_loss:
                total_loss += model(x, y, mask).item()

            # Viterbi-decode the batch
            decoded = model.decode(x, mask)

            # Flatten each sentence's predictions / gold labels
            for pred_seq, gold_seq, m in zip(decoded, y, mask):
                all_preds.extend(pred_seq)
                all_true.extend(gold_seq[m].tolist())  # tensor[boolean mask]

    return all_preds, all_true, total_loss / len(loader)
29
+
30
def predict_demo(text):
    """Tag one raw Vietnamese sentence and return (tokens, IOB label strings)."""
    id_tag = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}

    x, tokens = process_demo_sentence(text)  # 1 x seq_length x 768
    NUM_TAGS = 7

    model = CRF_Tagger(input_dim=x.size(2), num_tags=NUM_TAGS)
    # BUG FIX: the previous path ".\models\best_epoch_16.pt" contained "\b",
    # which Python parses as a backspace character, so the checkpoint file
    # could never be found. Forward slashes are portable on every OS; the
    # explicit map_location also lets CPU-only hosts load GPU-saved weights.
    model.load_state_dict(torch.load("./models/best_epoch_16.pt", map_location="cpu"))
    model.eval()
    with torch.no_grad():
        preds = model.decode(x)

    # decode() returns one sequence per batch item; we only have one sentence.
    labels = [id_tag[lab] for lab in preds[0]]

    return tokens, labels
space/space/space/space/space/space/src/preprocessing.py ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import torch
3
+ from transformers import AutoTokenizer, AutoModel
4
+ from tqdm import tqdm
5
+ from sklearn.model_selection import train_test_split
6
+ from src.configs import configs
7
+ from pyvi import ViTokenizer
8
+
9
def join_tokens(tokens):
    """Join word-segmented tokens into a single space-separated string."""
    return " ".join(tokens)
12
+
13
def reform_raw_text(tokens):
    """Rebuild raw text: join tokens, then undo the underscore word-joining."""
    return " ".join(tokens).replace("_", " ")
16
+
17
def label(x):
    """Convert a sequence of numeric tag ids to their IOB string labels.

    FIX: removed the stray trailing comma from the signature (`def label(x, )`).
    Values are coerced with int() so numpy/float ids also map correctly.
    """
    id_tag = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}
    return [id_tag[int(i)] for i in x]
20
+
21
def replace_7_8(lst):
    """Collapse the out-of-range tag ids 7 and 8 down to 0 (the 'O' tag)."""
    return [0 if tag_id in (7, 8) else tag_id for tag_id in lst]
23
+
24
+ # Hàm gộp các embedding vectors của token bị tách ra khi qua SentencePiece
25
def group_embeddings(tokens, embeddings):
    """Merge subword-piece embeddings back into word-level embeddings.

    Non-final subword pieces carry a trailing "@@" marker; a word's embedding
    is the mean of its pieces. The special tokens <s> and </s> are dropped.
    Returns a list of per-word embedding tensors.
    """
    word_embeddings = []
    pending = []  # pieces of the word currently being assembled

    for token, emb in zip(tokens, embeddings):
        if token in ("<s>", "</s>"):
            continue

        pending.append(emb)
        if not token.endswith("@@"):
            # Final piece of the word: average everything collected so far.
            word_embeddings.append(torch.mean(torch.stack(pending), dim=0))
            pending = []

    if pending:  # dangling pieces left at the end of the sentence
        word_embeddings.append(torch.mean(torch.stack(pending), dim=0))

    return word_embeddings
46
+
47
+
48
+ # Download the dataset form Hugging Face
49
def download_raw_data():
    """Download the VLSP2016-NER train/valid parquet splits from Hugging Face
    and concatenate them into one DataFrame.

    Requires network access (hf:// protocol). Both splits are merged because
    the project performs its own train/val/test split later (split_dataset).
    """
    splits = {'train': 'data/train-00000-of-00001-b0417886a268b83a.parquet', 'valid': 'data/valid-00000-of-00001-846411c236133ba3.parquet'}
    df_train = pd.read_parquet("hf://datasets/datnth1709/VLSP2016-NER-data/" + splits["train"])
    df_valid = pd.read_parquet("hf://datasets/datnth1709/VLSP2016-NER-data/" + splits["valid"])
    # Re-index so the merged frame has a clean 0..N-1 index.
    df = pd.concat([df_train, df_valid]).reset_index(drop=True)

    return df
56
+
57
+ # Process dataframe for EDA
58
def preprocess_data_for_EDA(df):
    """Clean and enrich the raw dataframe for exploratory analysis.

    Collapses the out-of-range tag ids 7/8 to 'O', derives segmented/raw text
    columns and string labels, and renames the columns. Mutates `df` in place
    and also returns it.

    FIX: removed the unused local dicts `tag_id`/`id_tag` — the label mapping
    lives inside label() and these copies were dead code.
    """
    # Remove inappropriate tags and derive helper columns
    df['ner_tags'] = df['ner_tags'].apply(replace_7_8)
    df['text_withseg'] = df['tokens'].apply(join_tokens)
    df['text_raw'] = df['tokens'].apply(reform_raw_text)
    df["ner_labels"] = df.ner_tags.apply(label)
    # Positional rename: relies on the original column order (tokens, ner_tags)
    # plus the three columns added above.
    df.columns = ['tokens', 'id_labels', 'seg_text', 'raw_text', 'labels']

    return df
71
+
72
+
73
+
74
+
75
def load_phoBERT_model_and_tokenizer():
    """Load the pretrained PhoBERT-base encoder and its tokenizer.

    Returns (model, tokenizer). The model is put in eval mode because it is
    used only as a frozen feature extractor. Weights are downloaded from the
    Hugging Face hub on first call.
    """
    # use_fast=False — presumably required by PhoBERT's tokenizer; confirm.
    tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base", use_fast=False)
    model = AutoModel.from_pretrained("vinai/phobert-base")
    model.eval()
    return model, tokenizer
81
+
82
+
83
+ # Embedding text
84
def create_embeddings(df, model, tokenizer):
    """Encode every sentence in `df` with PhoBERT and collect word embeddings.

    Rows whose merged embedding count does not match the label count are
    skipped (their indices are gathered in remove_index but not returned).

    Returns:
        dict with "embeddings" (list of (seq_len_i, 768) tensors) and
        "labels" (list of (seq_len_i,) long tensors), ready for training.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    all_embeddings = []  # list of [seq_len_i, 768] tensors
    all_labels = []      # list of [seq_len_i,] tensors
    remove_index = []    # indices of skipped rows — NOTE: collected but never returned

    for i, row in tqdm(df.iterrows(), total=len(df)):

        # Per-row fields
        sentence = row['seg_text']
        gold_labels = row["id_labels"]

        # Run the sentence through the SentencePiece tokenizer
        input_ids = tokenizer.encode(sentence, return_tensors="pt").to(device)

        tokens = tokenizer.convert_ids_to_tokens(input_ids[0].cpu())

        # Encode into contextual embeddings (frozen model, no gradients)
        with torch.no_grad():
            outputs = model(input_ids)
            last_hidden_state = outputs.last_hidden_state.squeeze(0).cpu()

        # Merge the embeddings that SentencePiece split into subword pieces
        word_embeds = group_embeddings(tokens, last_hidden_state)

        # If embedding and label counts conflict, drop the row
        if len(word_embeds) != len(gold_labels):
            # print(f"Warning: Skip row {i} - length mismatch")
            remove_index.append(i)
            continue

        # Append to the global lists; data is now training-ready
        all_embeddings.append(torch.stack(word_embeds))
        all_labels.append(torch.tensor(gold_labels))

    # Bundle results into a dict
    processed_data = {
        "embeddings": all_embeddings,
        "labels": all_labels
    }

    return processed_data
128
+
129
+
130
def split_dataset(data):
    """Split embeddings/labels into train/val/test using the ratios in `configs`.

    Returns (X_train, Y_train, X_val, Y_val, X_test, Y_test). Fixed
    random_state keeps the split reproducible across runs.
    """
    # First carve the test set off the full data
    X_train_val, X_test, Y_train_val, Y_test = train_test_split(
        data["embeddings"], data["labels"],
        test_size=configs["test_ratio"], random_state=42)

    # Then split the remainder into train and validation
    val_share = configs["val_ratio"] / (configs["val_ratio"] + configs["train_ratio"])
    X_train, X_val, Y_train, Y_val = train_test_split(
        X_train_val, Y_train_val, test_size=val_share, random_state=42)

    return X_train, Y_train, X_val, Y_val, X_test, Y_test
140
+
141
+
142
+ # TODO: Refactor hàm process_demo_sentence, và hàm predict demo, warning nếu độ dài tokens_word không bằng độ dài sau group_embeddings
143
+
144
def process_demo_sentence(text):
    """
    Embed a single raw sentence for inference.

    Returns:
        (embeddings, tokens_word): a tensor of shape 1 x seq_length x 768
        and the pyvi word-segmented tokens of the input.
    """
    # Word-segment with pyvi (multi-word compounds are joined by underscores)
    segmented_text = ViTokenizer.tokenize(text)
    tokens_word = segmented_text.strip().split(" ")

    # NOTE: reloads PhoBERT on every call — acceptable for a demo, slow otherwise
    model, tokenizer = load_phoBERT_model_and_tokenizer()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    input_ids = tokenizer.encode(segmented_text, return_tensors="pt").to(device)

    tokens = tokenizer.convert_ids_to_tokens(input_ids[0].cpu())

    with torch.no_grad():
        outputs = model(input_ids)
        last_hidden_state = outputs.last_hidden_state.squeeze(0).cpu()

    # Merge subword pieces back into word-level vectors
    word_embeds = group_embeddings(tokens, last_hidden_state)

    all_embeddings = torch.stack(word_embeds)  # seq_length x 768

    all_embeddings = all_embeddings.unsqueeze(0)  # add batch dim of 1 -> 1 x seq_length x 768

    # NOTE(review): len(word_embeds) can differ from len(tokens_word); callers
    # zip the two outputs together, so a mismatch silently misaligns labels —
    # see the TODO above about emitting a warning. Confirm before relying on it.
    return all_embeddings, tokens_word
171
+
space/space/space/space/space/space/src/torchcrf/__init__.py ADDED
@@ -0,0 +1,340 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __version__ = '0.7.2'
2
+
3
+ from typing import List, Optional
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+
8
+
9
class CRF(nn.Module):
    """Conditional random field.

    This module implements a conditional random field [LMP01]_. The forward computation
    of this class computes the log likelihood of the given sequence of tags and
    emission score tensor. This class also has `~CRF.decode` method which finds
    the best tag sequence given an emission score tensor using `Viterbi algorithm`_.

    NOTE (review): this appears to be a vendored copy of the ``pytorch-crf``
    package (version 0.7.2) — confirm provenance before modifying.

    Args:
        num_tags: Number of tags.
        batch_first: Whether the first dimension corresponds to the size of a minibatch.

    Attributes:
        start_transitions (`~torch.nn.Parameter`): Start transition score tensor of size
            ``(num_tags,)``.
        end_transitions (`~torch.nn.Parameter`): End transition score tensor of size
            ``(num_tags,)``.
        transitions (`~torch.nn.Parameter`): Transition score tensor of size
            ``(num_tags, num_tags)``.


    .. [LMP01] Lafferty, J., McCallum, A., Pereira, F. (2001).
        "Conditional random fields: Probabilistic models for segmenting and
        labeling sequence data". *Proc. 18th International Conf. on Machine
        Learning*. Morgan Kaufmann. pp. 282–289.

    .. _Viterbi algorithm: https://en.wikipedia.org/wiki/Viterbi_algorithm
    """

    def __init__(self, num_tags: int, batch_first: bool = False) -> None:
        if num_tags <= 0:
            raise ValueError(f'invalid number of tags: {num_tags}')
        super().__init__()
        self.num_tags = num_tags
        self.batch_first = batch_first
        self.start_transitions = nn.Parameter(torch.empty(num_tags))
        self.end_transitions = nn.Parameter(torch.empty(num_tags))
        self.transitions = nn.Parameter(torch.empty(num_tags, num_tags))

        self.reset_parameters()

    def reset_parameters(self) -> None:
        """Initialize the transition parameters.

        The parameters will be initialized randomly from a uniform distribution
        between -0.1 and 0.1.
        """
        nn.init.uniform_(self.start_transitions, -0.1, 0.1)
        nn.init.uniform_(self.end_transitions, -0.1, 0.1)
        nn.init.uniform_(self.transitions, -0.1, 0.1)

    def __repr__(self) -> str:
        return f'{self.__class__.__name__}(num_tags={self.num_tags})'

    def forward(
            self,
            emissions: torch.Tensor,
            tags: torch.LongTensor,
            mask: Optional[torch.ByteTensor] = None,
            reduction: str = 'sum',
    ) -> torch.Tensor:
        """Compute the conditional log likelihood of a sequence of tags given emission scores.

        Args:
            emissions (`~torch.Tensor`): Emission score tensor of size
                ``(seq_length, batch_size, num_tags)`` if ``batch_first`` is ``False``,
                ``(batch_size, seq_length, num_tags)`` otherwise.
            tags (`~torch.LongTensor`): Sequence of tags tensor of size
                ``(seq_length, batch_size)`` if ``batch_first`` is ``False``,
                ``(batch_size, seq_length)`` otherwise.
            mask (`~torch.ByteTensor`): Mask tensor of size ``(seq_length, batch_size)``
                if ``batch_first`` is ``False``, ``(batch_size, seq_length)`` otherwise.
            reduction: Specifies the reduction to apply to the output:
                ``none|sum|mean|token_mean``. ``none``: no reduction will be applied.
                ``sum``: the output will be summed over batches. ``mean``: the output will be
                averaged over batches. ``token_mean``: the output will be averaged over tokens.

        Returns:
            `~torch.Tensor`: The log likelihood. This will have size ``(batch_size,)`` if
            reduction is ``none``, ``()`` otherwise.
        """
        self._validate(emissions, tags=tags, mask=mask)
        if reduction not in ('none', 'sum', 'mean', 'token_mean'):
            raise ValueError(f'invalid reduction: {reduction}')
        if mask is None:
            # No mask supplied: treat every position as valid.
            mask = torch.ones_like(tags, dtype=torch.uint8)

        if self.batch_first:
            # Internally everything is computed time-major.
            emissions = emissions.transpose(0, 1)
            tags = tags.transpose(0, 1)
            mask = mask.transpose(0, 1)

        # shape: (batch_size,)
        numerator = self._compute_score(emissions, tags, mask)
        # shape: (batch_size,)
        denominator = self._compute_normalizer(emissions, mask)
        # shape: (batch_size,)
        llh = numerator - denominator

        if reduction == 'none':
            return llh
        if reduction == 'sum':
            return llh.sum()
        if reduction == 'mean':
            return llh.mean()
        assert reduction == 'token_mean'
        return llh.sum() / mask.type_as(emissions).sum()

    @torch.jit.export
    def decode(self, emissions: torch.Tensor,
               mask: Optional[torch.ByteTensor] = None) -> List[List[int]]:
        """Find the most likely tag sequence using Viterbi algorithm.

        Args:
            emissions (`~torch.Tensor`): Emission score tensor of size
                ``(seq_length, batch_size, num_tags)`` if ``batch_first`` is ``False``,
                ``(batch_size, seq_length, num_tags)`` otherwise.
            mask (`~torch.ByteTensor`): Mask tensor of size ``(seq_length, batch_size)``
                if ``batch_first`` is ``False``, ``(batch_size, seq_length)`` otherwise.

        Returns:
            List of list containing the best tag sequence for each batch.
        """
        self._validate(emissions, mask=mask)
        if mask is None:
            # Default mask: every position of every sequence is valid.
            mask = emissions.new_ones(emissions.shape[:2], dtype=torch.uint8)

        if self.batch_first:
            emissions = emissions.transpose(0, 1)
            mask = mask.transpose(0, 1)

        return self._viterbi_decode(emissions, mask)

    def _validate(
            self,
            emissions: torch.Tensor,
            tags: Optional[torch.LongTensor] = None,
            mask: Optional[torch.ByteTensor] = None) -> None:
        # Shape/consistency checks shared by forward() and decode().
        if emissions.dim() != 3:
            raise ValueError(f'emissions must have dimension of 3, got {emissions.dim()}')
        if emissions.size(2) != self.num_tags:
            raise ValueError(
                f'expected last dimension of emissions is {self.num_tags}, '
                f'got {emissions.size(2)}')

        if tags is not None:
            if emissions.shape[:2] != tags.shape:
                raise ValueError(
                    'the first two dimensions of emissions and tags must match, '
                    f'got {(emissions.shape[0], emissions.shape[1])} and {(tags.shape[0], tags.shape[1])}'
                )

        if mask is not None:
            if emissions.shape[:2] != mask.shape:
                raise ValueError(
                    'the first two dimensions of emissions and mask must match, '
                    f'got {(emissions.shape[0], emissions.shape[1])} and {(mask.shape[0], mask.shape[1])}'
                )
            no_empty_seq = not self.batch_first and mask[0].all()
            no_empty_seq_bf = self.batch_first and mask[:, 0].all()
            if not no_empty_seq and not no_empty_seq_bf:
                raise ValueError('mask of the first timestep must all be on')

    def _compute_score(
            self, emissions: torch.Tensor, tags: torch.LongTensor,
            mask: torch.ByteTensor) -> torch.Tensor:
        # Score of the GIVEN tag path (numerator of the log-likelihood).
        # emissions: (seq_length, batch_size, num_tags)
        # tags: (seq_length, batch_size)
        # mask: (seq_length, batch_size)
        assert emissions.dim() == 3 and tags.dim() == 2
        assert emissions.shape[:2] == tags.shape
        assert emissions.size(2) == self.num_tags
        assert mask.shape == tags.shape
        assert mask[0].all()

        seq_length, batch_size = tags.shape
        mask = mask.type_as(emissions)

        # Start transition score and first emission
        # shape: (batch_size,)
        score = self.start_transitions[tags[0]]
        score += emissions[0, torch.arange(batch_size), tags[0]]

        for i in range(1, seq_length):
            # Transition score to next tag, only added if next timestep is valid (mask == 1)
            # shape: (batch_size,)
            score += self.transitions[tags[i - 1], tags[i]] * mask[i]

            # Emission score for next tag, only added if next timestep is valid (mask == 1)
            # shape: (batch_size,)
            score += emissions[i, torch.arange(batch_size), tags[i]] * mask[i]

        # End transition score
        # shape: (batch_size,)
        seq_ends = mask.long().sum(dim=0) - 1
        # shape: (batch_size,)
        last_tags = tags[seq_ends, torch.arange(batch_size)]
        # shape: (batch_size,)
        score += self.end_transitions[last_tags]

        return score

    def _compute_normalizer(
            self, emissions: torch.Tensor, mask: torch.ByteTensor) -> torch.Tensor:
        # Log partition function over ALL tag paths (denominator), via the
        # forward algorithm in log space.
        # emissions: (seq_length, batch_size, num_tags)
        # mask: (seq_length, batch_size)
        assert emissions.dim() == 3 and mask.dim() == 2
        assert emissions.shape[:2] == mask.shape
        assert emissions.size(2) == self.num_tags
        assert mask[0].all()

        seq_length = emissions.size(0)

        # Start transition score and first emission; score has size of
        # (batch_size, num_tags) where for each batch, the j-th column stores
        # the score that the first timestep has tag j
        # shape: (batch_size, num_tags)
        score = self.start_transitions + emissions[0]

        for i in range(1, seq_length):
            # Broadcast score for every possible next tag
            # shape: (batch_size, num_tags, 1)
            broadcast_score = score.unsqueeze(2)

            # Broadcast emission score for every possible current tag
            # shape: (batch_size, 1, num_tags)
            broadcast_emissions = emissions[i].unsqueeze(1)

            # Compute the score tensor of size (batch_size, num_tags, num_tags) where
            # for each sample, entry at row i and column j stores the sum of scores of all
            # possible tag sequences so far that end with transitioning from tag i to tag j
            # and emitting
            # shape: (batch_size, num_tags, num_tags)
            next_score = broadcast_score + self.transitions + broadcast_emissions

            # Sum over all possible current tags, but we're in score space, so a sum
            # becomes a log-sum-exp: for each sample, entry i stores the sum of scores of
            # all possible tag sequences so far, that end in tag i
            # shape: (batch_size, num_tags)
            next_score = torch.logsumexp(next_score, dim=1)

            # Set score to the next score if this timestep is valid (mask == 1)
            # shape: (batch_size, num_tags)
            score = torch.where(mask[i].unsqueeze(1), next_score, score)

        # End transition score
        # shape: (batch_size, num_tags)
        score += self.end_transitions

        # Sum (log-sum-exp) over all possible tags
        # shape: (batch_size,)
        return torch.logsumexp(score, dim=1)

    def _viterbi_decode(self, emissions: torch.FloatTensor,
                        mask: torch.ByteTensor) -> List[List[int]]:
        # emissions: (seq_length, batch_size, num_tags)
        # mask: (seq_length, batch_size)
        assert emissions.dim() == 3 and mask.dim() == 2
        assert emissions.shape[:2] == mask.shape
        assert emissions.size(2) == self.num_tags
        assert mask[0].all()

        seq_length, batch_size = mask.shape

        # Start transition and first emission
        # shape: (batch_size, num_tags)
        score = self.start_transitions + emissions[0]
        history: List[torch.Tensor] = []

        # score is a tensor of size (batch_size, num_tags) where for every batch,
        # value at column j stores the score of the best tag sequence so far that ends
        # with tag j
        # history saves where the best tags candidate transitioned from; this is used
        # when we trace back the best tag sequence

        # Viterbi algorithm recursive case: we compute the score of the best tag sequence
        # for every possible next tag
        for i in range(1, seq_length):
            # Broadcast viterbi score for every possible next tag
            # shape: (batch_size, num_tags, 1)
            broadcast_score = score.unsqueeze(2)

            # Broadcast emission score for every possible current tag
            # shape: (batch_size, 1, num_tags)
            broadcast_emission = emissions[i].unsqueeze(1)

            # Compute the score tensor of size (batch_size, num_tags, num_tags) where
            # for each sample, entry at row i and column j stores the score of the best
            # tag sequence so far that ends with transitioning from tag i to tag j and emitting
            # shape: (batch_size, num_tags, num_tags)
            next_score = broadcast_score + self.transitions + broadcast_emission

            # Find the maximum score over all possible current tag
            # shape: (batch_size, num_tags)
            next_score, indices = next_score.max(dim=1)

            # Set score to the next score if this timestep is valid (mask == 1)
            # and save the index that produces the next score
            # shape: (batch_size, num_tags)
            score = torch.where(mask[i].unsqueeze(1), next_score, score)
            history.append(indices)

        # End transition score
        # shape: (batch_size, num_tags)
        score += self.end_transitions

        # Now, compute the best path for each sample

        # shape: (batch_size,)
        seq_ends = mask.long().sum(dim=0) - 1
        best_tags_list: List[List[int]] = []

        for idx in range(batch_size):
            # Find the tag which maximizes the score at the last timestep; this is our best tag
            # for the last timestep
            _, best_last_tag = score[idx].max(dim=0)
            best_tags: List[int] = []
            best_tags.append(best_last_tag.item())

            # We trace back where the best last tag comes from, append that to our best tag
            # sequence, and trace it back again, and so on
            # NOTE: reversed() cannot be used here because it is not supported by TorchScript,
            # see https://github.com/pytorch/pytorch/issues/31772.
            for hist in history[:seq_ends[idx]][::-1]:
                best_last_tag = hist[idx][best_tags[-1]]
                best_tags.append(best_last_tag.item())

            # Reverse the order because we start from the last timestep
            best_tags.reverse()
            best_tags_list.append(best_tags)

        return best_tags_list
space/space/space/space/space/space/src/train.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import wandb
2
+ from tqdm import tqdm
3
+ from src.evaluate import evaluate
4
+ import torch
5
+
6
def train_model(model, optimizer, configs, loaders):
    """Train `model` with Weights & Biases logging.

    Checkpoints the model whenever validation F1 improves, reloads the best
    checkpoint after training, and evaluates it on the test set.

    Args:
        model: CRF tagger called as model(x, y, mask) -> loss, decoded via evaluate().
        optimizer: torch optimizer over model parameters.
        configs: dict with at least "project", "name", "epochs".
        loaders: dict with 'train'/'val'/'test' loaders yielding
            (embeddings, labels, lengths) batches; labels padded with -1.

    FIXES: `ckpt_path` was unbound (NameError) when no epoch ever improved
    validation F1; "imporved" typo in the checkpoint message.
    """
    # Login wandb
    wandb.login()

    # Init W&B run for tracking the training phase
    wandb.init(
        project=configs["project"],
        name=configs["name"],
        config=configs
    )

    # Log parameter gradients
    wandb.watch(model, log="all")

    # Track the best checkpoint by validation F1
    best_val_f1 = 0.0
    best_ckpt_path = None  # FIX: was unbound if no epoch ever improved

    # Training loop
    for epoch in range(1, configs["epochs"] + 1):
        model.train()
        total_loss = 0.0

        # Progress bar over training batches
        train_bar = tqdm(loaders['train'], desc=f"Train Epoch {epoch}/{configs['epochs']}")

        for batch_idx, (x, y, _) in enumerate(train_bar, start=1):
            mask = (y != -1)  # padding positions carry label -1
            loss = model(x, y, mask)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

            train_bar.set_postfix(batch_loss=loss.item(), avg_loss=total_loss / batch_idx)

        # Evaluate model after each epoch
        avg_train_loss = total_loss / len(loaders['train'])
        train_precision, train_recall, train_f1, train_acc, _, _ = evaluate(model, loaders['train'], count_loss=False)
        val_precision, val_recall, val_f1, val_acc, avg_val_loss, _ = evaluate(model, loaders['val'], count_loss=True)

        # Log metrics for train and val sets
        print(f"Epoch {epoch}: train_loss={avg_train_loss:.4f}, train_f1={train_f1:.4f}, val_loss={avg_val_loss:.4f}, val_f1={val_f1:.4f}")
        wandb.log({
            "epoch": epoch,

            # Group: Training metrics
            "Train/Loss": avg_train_loss,
            "Train/Precision": train_precision,
            "Train/Recall": train_recall,
            "Train/F1": train_f1,
            "Train/Accuracy": train_acc,

            # Group: Validation metrics
            "Val/Loss": avg_val_loss,
            "Val/Precision": val_precision,
            "Val/Recall": val_recall,
            "Val/F1": val_f1,
            "Val/Accuracy": val_acc
        })

        # Save best model based on val_f1
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            best_ckpt_path = f"./models/best_epoch_{epoch}.pt"
            torch.save(model.state_dict(), best_ckpt_path)
            wandb.save(best_ckpt_path)
            print(f"Saved improved model to {best_ckpt_path}")  # FIX: typo "imporved"

        print()

    # Load best model before test (guard against no improvement at all)
    if best_ckpt_path is not None:
        print(f"Loading best model from {best_ckpt_path} for final evaluation...")
        model.load_state_dict(torch.load(best_ckpt_path))
        print("Done \n")
    else:
        print("No checkpoint improved on validation F1; evaluating final weights.")

    # Log metrics for the test set
    print("Evaluation on test set ...")
    test_precision, test_recall, test_f1, test_acc, avg_test_loss, report = evaluate(model, loaders['test'], count_loss=True, report=True)
    wandb.log({
        "Test/Loss": avg_test_loss,
        "Test/Precision": test_precision,
        "Test/Recall": test_recall,
        "Test/F1": test_f1,
        "Test/Accuracy": test_acc,
    })
    print(f"Test_loss={avg_test_loss:.4f}, Test_f1={test_f1:.4f}")
    print(report)

    # Finish W&B run
    wandb.finish()
space/space/space/src/app.py CHANGED
@@ -1,7 +1,10 @@
1
  import streamlit as st
2
  import pandas as pd
 
 
3
  from src.predict import predict_demo
4
  from src.front import render_html
 
5
 
6
  st.set_page_config(page_title="Vietnamese NER", layout="wide")
7
 
@@ -24,20 +27,99 @@ with tab1:
24
 
25
  # --- Tab 2: KẾT QUẢ HUẤN LUYỆN ---
26
  with tab2:
27
- st.header("📈 Kết quả huấn luyện")
28
-
29
- loss = [0.9, 0.7, 0.5, 0.35, 0.28]
30
- epoch = [1, 2, 3, 4, 5]
31
- df_loss = pd.DataFrame({"Epoch": epoch, "Loss": loss})
32
- st.line_chart(df_loss.set_index("Epoch"))
33
-
34
- st.subheader("Đánh giá mô hình")
35
- df_eval = pd.DataFrame({
36
- "Phiên bản": ["v1", "v2", "v3"],
37
- "F1-score": [0.78, 0.83, 0.86],
38
- "Accuracy": [0.81, 0.85, 0.88]
39
- })
40
- st.dataframe(df_eval)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
  # --- Tab 3: DEMO MÔ HÌNH ---
43
  with tab3:
 
1
  import streamlit as st
2
  import pandas as pd
3
+ import plotly.graph_objects as go
4
+
5
  from src.predict import predict_demo
6
  from src.front import render_html
7
+ from results.output import training_log, report_dict, report_dict_2, model_compare, data_compare
8
 
9
  st.set_page_config(page_title="Vietnamese NER", layout="wide")
10
 
 
27
 
28
  # --- Tab 2: KẾT QUẢ HUẤN LUYỆN ---
29
  with tab2:
30
+ st.set_page_config(
31
+ page_title="My NER App",
32
+ layout="wide",
33
+ initial_sidebar_state="expanded"
34
+ )
35
+
36
+ # ==== TẠO FIGURES ====
37
+
38
+ # 1️⃣ Loss
39
+ fig_loss = go.Figure()
40
+ fig_loss.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["train_loss"],
41
+ mode='lines+markers', name='Train Loss'))
42
+ fig_loss.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["val_loss"],
43
+ mode='lines+markers', name='Val Loss'))
44
+ fig_loss.update_layout(title="Loss Curve", xaxis_title="Epoch", yaxis_title="Loss")
45
+
46
+ # 2️⃣ F1-Score
47
+ fig_f1 = go.Figure()
48
+ fig_f1.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["train_f1"],
49
+ mode='lines+markers', name='Train F1'))
50
+ fig_f1.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["val_f1"],
51
+ mode='lines+markers', name='Val F1'))
52
+ fig_f1.update_layout(title="F1-Score Curve", xaxis_title="Epoch", yaxis_title="F1-Score")
53
+
54
+ # 3️⃣ Classification Report Table & Bar
55
+ labels = [k for k in report_dict.keys() if k not in ["accuracy", "macro avg", "weighted avg"]]
56
+ report_data = [[lbl,
57
+ report_dict[lbl]["precision"],
58
+ report_dict[lbl]["recall"],
59
+ report_dict[lbl]["f1-score"]]
60
+ for lbl in labels]
61
+ df_report = pd.DataFrame(report_data,
62
+ columns=["Label", "Precision", "Recall", "F1-Score"])
63
+
64
+ fig_report = go.Figure()
65
+ for col in ["Precision", "Recall", "F1-Score"]:
66
+ fig_report.add_trace(go.Bar(x=df_report["Label"], y=df_report[col], name=col))
67
+ fig_report.update_layout(barmode='group',
68
+ title="Class Report Metrics of PhoBert + CRF",
69
+ xaxis_title="Label", yaxis_title="Score",
70
+ yaxis=dict(range=[0,1.0]))
71
+
72
+ labels2 = [k for k in report_dict_2.keys() if k not in ["accuracy", "macro avg", "weighted avg"]]
73
+ report_data2 = [[lbl,
74
+ report_dict_2[lbl]["precision"],
75
+ report_dict_2[lbl]["recall"],
76
+ report_dict_2[lbl]["f1-score"]]
77
+ for lbl in labels2]
78
+ df_report2 = pd.DataFrame(report_data2,
79
+ columns=["Label", "Precision", "Recall", "F1-Score"])
80
+
81
+ fig_report2 = go.Figure()
82
+ for col in ["Precision", "Recall", "F1-Score"]:
83
+ fig_report2.add_trace(go.Bar(x=df_report2["Label"], y=df_report2[col], name=col))
84
+ fig_report2.update_layout(barmode='group',
85
+ title="Class Report Metrics of PhoBert + Softmax",
86
+ xaxis_title="Label", yaxis_title="Score",
87
+ yaxis=dict(range=[0,1.0]))
88
+
89
+ # 4️⃣ Model & Data Comparison Tables
90
+ df_model = pd.DataFrame(
91
+ [[m, v["F1"], v["Accuracy"]] for m, v in model_compare["Data"].items()],
92
+ columns=["Model", "F1-Score", "Accuracy"]
93
+ )
94
+ df_data = pd.DataFrame(
95
+ [[s, f1] for s, f1 in data_compare["Data"].items()],
96
+ columns=["Preprocessing", "F1-Score"]
97
+ )
98
+
99
+ # ==== LAYOUT RAO GỌN VỚI COLUMNS ====
100
+
101
+ # Row 1: Loss | F1
102
+ col1, col2 = st.columns(2)
103
+ with col1:
104
+ st.plotly_chart(fig_loss, use_container_width=True)
105
+ with col2:
106
+ st.plotly_chart(fig_f1, use_container_width=True)
107
+
108
+ # Row 2: Class Report Table | Bar Chart
109
+ col3, col4 = st.columns(2)
110
+ with col3:
111
+ st.plotly_chart(fig_report2, use_container_width=True)
112
+ with col4:
113
+ st.plotly_chart(fig_report, use_container_width=True)
114
+
115
+ # Row 3: Model Compare | Data Compare
116
+ col5, col6 = st.columns(2)
117
+ with col5:
118
+ st.markdown("**Model Comparison**")
119
+ st.dataframe(df_model, use_container_width=True)
120
+ with col6:
121
+ st.markdown("**Data Preprocessing Comparison**")
122
+ st.dataframe(df_data, use_container_width=True)
123
 
124
  # --- Tab 3: DEMO MÔ HÌNH ---
125
  with tab3:
space/space/space/src/predict.py CHANGED
@@ -36,7 +36,7 @@ def predict_demo(text):
36
  NUM_TAGS = 7
37
 
38
  model = CRF_Tagger(input_dim=x.size(2), num_tags=NUM_TAGS)
39
- model.load_state_dict(torch.load(".\models\best_epoch_16.pt"))
40
  model.eval()
41
  with torch.no_grad():
42
  preds = model.decode(x)
 
36
  NUM_TAGS = 7
37
 
38
  model = CRF_Tagger(input_dim=x.size(2), num_tags=NUM_TAGS)
39
+ model.load_state_dict(torch.load("../models/best_epoch_16.pt"))
40
  model.eval()
41
  with torch.no_grad():
42
  preds = model.decode(x)
space/space/space/st.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Streamlit dashboard for NER model results.

Renders training curves (loss, F1), per-class precision/recall/F1 bar
charts for two model variants (PhoBERT + CRF and PhoBERT + Softmax),
and model/preprocessing comparison tables, laid out as a three-row,
two-column grid.
"""
import streamlit as st
import pandas as pd
import plotly.graph_objects as go
from results.output import training_log, report_dict, report_dict_2, model_compare, data_compare

st.set_page_config(
    page_title="My NER App",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Aggregate rows in an sklearn-style classification-report dict that are
# not per-class labels and must be excluded from the bar charts.
_SUMMARY_KEYS = ("accuracy", "macro avg", "weighted avg")


def _curve_figure(title, y_title, series):
    """Build a lines+markers figure of per-epoch metrics.

    series: iterable of (trace_name, y_values) pairs, each plotted
    against training_log["epoch"]. Returns the plotly Figure.
    """
    fig = go.Figure()
    for trace_name, y_values in series:
        fig.add_trace(go.Scatter(x=training_log["epoch"], y=y_values,
                                 mode='lines+markers', name=trace_name))
    fig.update_layout(title=title, xaxis_title="Epoch", yaxis_title=y_title)
    return fig


def _report_figure(report, title):
    """Build a grouped precision/recall/F1 bar chart from a report dict.

    report: sklearn-style classification-report dict mapping each label
    to a metrics dict with "precision", "recall", and "f1-score" keys;
    aggregate rows (accuracy / macro avg / weighted avg) are skipped.
    Returns the plotly Figure with the y-axis fixed to [0, 1].
    """
    rows = [[lbl,
             report[lbl]["precision"],
             report[lbl]["recall"],
             report[lbl]["f1-score"]]
            for lbl in report if lbl not in _SUMMARY_KEYS]
    df = pd.DataFrame(rows, columns=["Label", "Precision", "Recall", "F1-Score"])

    fig = go.Figure()
    for col in ["Precision", "Recall", "F1-Score"]:
        fig.add_trace(go.Bar(x=df["Label"], y=df[col], name=col))
    fig.update_layout(barmode='group',
                      title=title,
                      xaxis_title="Label", yaxis_title="Score",
                      yaxis=dict(range=[0, 1.0]))
    return fig


# ==== BUILD FIGURES ====

# 1) Loss curve and 2) F1 curve over training epochs.
fig_loss = _curve_figure("Loss Curve", "Loss",
                         [("Train Loss", training_log["train_loss"]),
                          ("Val Loss", training_log["val_loss"])])
fig_f1 = _curve_figure("F1-Score Curve", "F1-Score",
                       [("Train F1", training_log["train_f1"]),
                        ("Val F1", training_log["val_f1"])])

# 3) Per-class metric bar charts for the two model variants.
fig_report = _report_figure(report_dict, "Class Report Metrics of PhoBert + CRF")
fig_report2 = _report_figure(report_dict_2, "Class Report Metrics of PhoBert + Softmax")

# 4) Model & data-preprocessing comparison tables.
df_model = pd.DataFrame(
    [[m, v["F1"], v["Accuracy"]] for m, v in model_compare["Data"].items()],
    columns=["Model", "F1-Score", "Accuracy"]
)
df_data = pd.DataFrame(
    [[s, f1] for s, f1 in data_compare["Data"].items()],
    columns=["Preprocessing", "F1-Score"]
)

# ==== COMPACT LAYOUT WITH COLUMNS ====

# Row 1: loss curve | F1 curve.
col1, col2 = st.columns(2)
with col1:
    st.plotly_chart(fig_loss, use_container_width=True)
with col2:
    st.plotly_chart(fig_f1, use_container_width=True)

# Row 2: Softmax report chart | CRF report chart.
col3, col4 = st.columns(2)
with col3:
    st.plotly_chart(fig_report2, use_container_width=True)
with col4:
    st.plotly_chart(fig_report, use_container_width=True)

# Row 3: model comparison table | preprocessing comparison table.
col5, col6 = st.columns(2)
with col5:
    st.markdown("**Model Comparison**")
    st.dataframe(df_model, use_container_width=True)
with col6:
    st.markdown("**Data Preprocessing Comparison**")
    st.dataframe(df_data, use_container_width=True)
src/predict.py CHANGED
@@ -1,6 +1,7 @@
1
  import torch
2
  from src.model import CRF_Tagger
3
  from src.preprocessing import process_demo_sentence
 
4
 
5
  def predict(model, loader, count_loss=True):
6
 
@@ -29,6 +30,9 @@ def predict(model, loader, count_loss=True):
29
 
30
  def predict_demo(text):
31
 
 
 
 
32
 
33
  id_tag = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}
34
 
@@ -36,7 +40,7 @@ def predict_demo(text):
36
  NUM_TAGS = 7
37
 
38
  model = CRF_Tagger(input_dim=x.size(2), num_tags=NUM_TAGS)
39
- model.load_state_dict(torch.load("../models/best_epoch_16.pt"))
40
  model.eval()
41
  with torch.no_grad():
42
  preds = model.decode(x)
 
1
  import torch
2
  from src.model import CRF_Tagger
3
  from src.preprocessing import process_demo_sentence
4
+ import os
5
 
6
  def predict(model, loader, count_loss=True):
7
 
 
30
 
31
  def predict_demo(text):
32
 
33
+ BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
34
+ model_path = os.path.join(BASE_DIR, "models", "best_epoch_16.pt")
35
+
36
 
37
  id_tag = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}
38
 
 
40
  NUM_TAGS = 7
41
 
42
  model = CRF_Tagger(input_dim=x.size(2), num_tags=NUM_TAGS)
43
+ model.load_state_dict(torch.load(model_path))
44
  model.eval()
45
  with torch.no_grad():
46
  preds = model.decode(x)