GitHub Actions commited on
Commit
95062a5
·
1 Parent(s): 2f9d738

Auto-deploy from GitHub (binary files removed)

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. README.md +1 -1
  2. requirements.txt +0 -0
  3. space/README.md +1 -1
  4. space/space/space/README.md +1 -1
  5. space/space/space/space/space/requirements.txt +0 -0
  6. space/space/space/space/space/space/README.md +1 -1
  7. space/space/space/space/space/space/space/space/space/space/README.md +156 -52
  8. space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/results/output.log +88 -0
  9. space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/results/output.py +116 -0
  10. space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/.gitignore +0 -2
  11. space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/requirements.txt +0 -0
  12. space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/.github/workflows/main.yml +47 -0
  13. space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/.gitignore +23 -0
  14. space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/LICENSE +201 -0
  15. space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/configs/config.yaml +1 -0
  16. space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/environment.yml +9 -0
  17. space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/models/best_epoch_16.pt +3 -0
  18. space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/notebooks/Duc_Notebook.ipynb +0 -0
  19. space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/notebooks/Kien_RF_lightgbm.ipynb +741 -0
  20. space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/notebooks/Kien_Rule_base.ipynb +0 -0
  21. space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/notebooks/Softmax_PhoBERT.ipynb +0 -0
  22. space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/requirements.txt +0 -0
  23. space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/run.py +73 -0
  24. space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/.gitattributes +35 -0
  25. space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/README.md +87 -0
  26. space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/__init__.py +4 -0
  27. space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/app.py +64 -0
  28. space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/configs.py +15 -0
  29. space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/data_set.py +31 -0
  30. space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/evaluate.py +21 -0
  31. space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/front.py +32 -0
  32. space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/model.py +16 -0
  33. space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/predict.py +46 -0
  34. space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/preprocessing.py +171 -0
  35. space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/torchcrf/__init__.py +340 -0
  36. space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/train.py +98 -0
  37. space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/app.py +96 -14
  38. space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/predict.py +1 -1
  39. space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/st.py +98 -0
  40. space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/predict.py +5 -1
  41. space/space/space/space/space/space/space/space/space/space/space/space/space/st.py +4 -95
  42. space/space/space/space/space/space/space/space/space/space/space/src/app.py +21 -8
  43. space/space/space/space/space/space/space/space/space/space/space/st.py +20 -4
  44. space/space/space/space/space/space/space/space/src/app.py +41 -36
  45. space/space/space/space/space/space/space/src/app.py +6 -13
  46. space/space/space/space/space/src/app.py +0 -157
  47. space/space/space/space/space/src/predict.py +3 -8
  48. space/space/space/space/space/src/preprocessing.py +1 -1
  49. space/space/space/space/space/src/templates/demo.html +349 -0
  50. space/space/space/space/space/src/train.py +1 -1
README.md CHANGED
@@ -17,7 +17,7 @@ A comprehensive Vietnamese Named Entity Recognition system using state-of-the-ar
17
 
18
  Try the interactive demo: **[Vietnamese NER Demo](https://huggingface.co/spaces/DucLai/Vietnamese_NER)**
19
 
20
- ![Demo Screenshot](https://github.com/user-attachments/assets/4fbcdc49-5a8b-47c0-991e-d3ec839cede9)
21
 
22
  ## 🔄 Project Workflow
23
 
 
17
 
18
  Try the interactive demo: **[Vietnamese NER Demo](https://huggingface.co/spaces/DucLai/Vietnamese_NER)**
19
 
20
+ ![image](https://github.com/user-attachments/assets/ac6f0b96-52f2-4e47-b542-e908c02261c4)
21
 
22
  ## 🔄 Project Workflow
23
 
requirements.txt CHANGED
Binary files a/requirements.txt and b/requirements.txt differ
 
space/README.md CHANGED
@@ -17,7 +17,7 @@ A comprehensive Vietnamese Named Entity Recognition system using state-of-the-ar
17
 
18
  Try the interactive demo: **[Vietnamese NER Demo](https://huggingface.co/spaces/DucLai/Vietnamese_NER)**
19
 
20
- ![Demo Screenshot](https://github.com/user-attachments/assets/4fbcdc49-5a8b-47c0-991e-d3ec839cede9)
21
 
22
  ## 🔄 Project Workflow
23
 
 
17
 
18
  Try the interactive demo: **[Vietnamese NER Demo](https://huggingface.co/spaces/DucLai/Vietnamese_NER)**
19
 
20
+ ![image](https://github.com/user-attachments/assets/ac6f0b96-52f2-4e47-b542-e908c02261c4)
21
 
22
  ## 🔄 Project Workflow
23
 
space/space/space/README.md CHANGED
@@ -17,7 +17,7 @@ A comprehensive Vietnamese Named Entity Recognition system using state-of-the-ar
17
 
18
  Try the interactive demo: **[Vietnamese NER Demo](https://huggingface.co/spaces/DucLai/Vietnamese_NER)**
19
 
20
- ![image](https://github.com/user-attachments/assets/ac6f0b96-52f2-4e47-b542-e908c02261c4)
21
 
22
  ## 🔄 Project Workflow
23
 
 
17
 
18
  Try the interactive demo: **[Vietnamese NER Demo](https://huggingface.co/spaces/DucLai/Vietnamese_NER)**
19
 
20
+ ![Demo Screenshot](https://github.com/user-attachments/assets/4fbcdc49-5a8b-47c0-991e-d3ec839cede9)
21
 
22
  ## 🔄 Project Workflow
23
 
space/space/space/space/space/requirements.txt CHANGED
Binary files a/space/space/space/space/space/requirements.txt and b/space/space/space/space/space/requirements.txt differ
 
space/space/space/space/space/space/README.md CHANGED
@@ -17,7 +17,7 @@ A comprehensive Vietnamese Named Entity Recognition system using state-of-the-ar
17
 
18
  Try the interactive demo: **[Vietnamese NER Demo](https://huggingface.co/spaces/DucLai/Vietnamese_NER)**
19
 
20
- ![Demo Screenshot](https://github.com/user-attachments/assets/4fbcdc49-5a8b-47c0-991e-d3ec839cede9)
21
 
22
  ## 🔄 Project Workflow
23
 
 
17
 
18
  Try the interactive demo: **[Vietnamese NER Demo](https://huggingface.co/spaces/DucLai/Vietnamese_NER)**
19
 
20
+ ![image](https://github.com/user-attachments/assets/ac6f0b96-52f2-4e47-b542-e908c02261c4)
21
 
22
  ## 🔄 Project Workflow
23
 
space/space/space/space/space/space/space/space/space/space/README.md CHANGED
@@ -8,80 +8,184 @@ sdk_version: 1.46.1
8
  app_file: src/app.py
9
  pinned: false
10
  ---
11
- # Vietnamese Named Entity Recognition
12
 
13
- ## 🛠️ Set Up Your Environment With Conda
14
 
15
- ### Option 1: Using `requirements.txt`
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  ```bash
 
18
  conda create --name vnner python=3.10
19
  conda activate vnner
 
 
20
  pip install -r requirements.txt
21
  ```
22
 
23
  ### Option 2: Using `environment.yml`
24
-
25
  ```bash
 
26
  conda env create -f environment.yml
27
  conda activate vnner
28
  ```
29
 
30
- ## Run
 
 
31
  ```bash
32
  python run.py
33
  ```
34
- ---
35
 
36
- ## 📂 Project Structure
37
-
38
- ```
39
- my_ai_project/
40
-
41
- ├── data/
42
- │ ├── raw_data.csv # Dữ liệu gốc
43
- │ ├── processed_data_EDA.csv # Dữ liệu sau khi tiền xử lý
44
- │ └── processed_data_full.csv # Dữ liệu sẵn sàng training
45
-
46
- ├── notebooks/ # Thử nghiệm và khám phá dữ liệu
47
- │ ├── Duc_Notebook.ipynb # CRF + RandomForest
48
- │ ├── Softmax_PhoBERT.ipynb # Softmax
49
-
50
- ├── src/ # Mã nguồn chính của dự án
51
- │ ├── __init__.py
52
- │ ├── data_loader.py # Nạp và xử lý dữ liệu
53
- │ ├── preprocessing.py # Hàm tiền xử lý dữ liệu
54
- │ ├── model.py # Định nghĩa kiến trúc mô hình
55
- │ ├── train.py # Huấn luyện mô hình
56
- │ ├── evaluate.py # Đánh giá mô hình
57
- │ └── predict.py # Dự đoán với mô hình đã huấn luyện
58
-
59
- ├── models/ # Mô hình đã lưu sau khi huấn luyện
60
- │ └── best_model.pth # File trọng số mô hình
61
-
62
- ├── outputs/ # Kết quả, biểu đồ, log, metrics
63
- │ ├── logs/ # Nhật ký huấn luyện (tensorboard/logging)
64
- │ └── figures/ # Biểu đồ trực quan hóa
65
-
66
- ├── configs/ # File cấu hình cho mô hình, huấn luyện
67
- │ └── config.yaml
68
-
69
- ├── tests/ # Unit test cho các hàm chính
70
-
71
- ├── requirements.txt # Thư viện cần cài đặt
72
- ├── environment.yml # Môi trường Conda
73
- ├── README.md # Giới thiệu dự án
74
- └── run.py # Script chính để chạy toàn bộ pipeline
75
  ```
76
 
77
- ---
 
 
 
 
 
 
 
78
 
79
- ## 📚 Additional Resources (Optional)
80
 
81
- If you have any questions about the project structure, consider reading these helpful articles first:
82
 
83
- * [Understanding `__init__.py`](https://zetcode.com/python/init-file/)
84
- * [Markdown Basic Syntax](https://www.markdownguide.org/basic-syntax/#escaping-characters)
85
- * [Difference Between `requirements.txt` and `environment.yml`](https://www.reddit.com/r/learnpython/comments/xvlpdz/why_do_people_provide_a_requirementstxt_or/)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
 
87
- These resources could be useful for you!
 
8
  app_file: src/app.py
9
  pinned: false
10
  ---
11
+ # Vietnamese Named Entity Recognition (NER) 🧠
12
 
13
+ A comprehensive Vietnamese Named Entity Recognition system using state-of-the-art deep learning models including PhoBERT, CRF, and ensemble methods.
14
 
 
15
 
16
+ ## 🚀 Live Demo
17
+
18
+ Try the interactive demo: **[Vietnamese NER Demo](https://huggingface.co/spaces/DucLai/Vietnamese_NER)**
19
+
20
+ ![Demo Screenshot](https://github.com/user-attachments/assets/4fbcdc49-5a8b-47c0-991e-d3ec839cede9)
21
+
22
+ ## 🔄 Project Workflow
23
+
24
+ ![Project Flowchart](https://github.com/user-attachments/assets/5b800180-d6c8-44f7-8622-ba188f6cd7be)
25
+
26
+ ## 🎯 Overview
27
+
28
+ This project implements a robust Vietnamese Named Entity Recognition system that can identify and classify entities in Vietnamese text. The system combines multiple approaches including:
29
+
30
+ - **PhoBERT-based embeddings** for contextual understanding
31
+ - **Conditional Random Fields (CRF)** for sequence labeling
32
+ - **Random Forest** with semantic embeddings
33
+ - **Rule-based methods** for enhanced accuracy
34
+
35
+ ## 📂 Project Structure
36
+
37
+ ```
38
+ VIETNAMESE_NER/
39
+
40
+ ├── .github/workflows
41
+ │ └── main.yml # Auto deploy to Hugging Space
42
+
43
+ ├── data/ # Dataset files
44
+ │ └── raw_data.csv # Raw training data
45
+
46
+ ├── notebooks/ # Jupyter notebooks for experimentation
47
+ │ ├── Duc_Notebook.ipynb # CRF + RandomForest experiments
48
+ │ ├── Softmax_PhoBERT.ipynb # Softmax approach
49
+ │ ├── Kien_Rule_base.ipynb # Rule-based method with RF
50
+ │ └── Kien_RF_lightgbm.ipynb # RF with semantic embeddings
51
+
52
+ ├── src/ # Main source code
53
+ │ ├── __init__.py
54
+ │ ├── app.py # Streamlit web application
55
+ │ ├── front.py # Highlight function
56
+ │ ├── config.py # Project configuration
57
+ │ ├── data_loader.py # Data loading utilities
58
+ │ ├── preprocessing.py # Data preprocessing functions
59
+ │ ├── model.py # Model architecture definitions
60
+ │ ├── train.py # Training pipeline
61
+ │ ├── evaluate.py # Model evaluation
62
+ │ └── predict.py # Inference utilities
63
+
64
+ ├── models/ # Saved model artifacts
65
+ │ └── best_model.pt # Best trained model weights
66
+
67
+ ├── outputs/ # Training outputs
68
+ │ ├── output.log # Training logs (TensorBoard)
69
+ │ └── figures/ # Visualization plots
70
+
71
+ ├── tests/ # Unit tests (planned)
72
+
73
+ ├── requirements.txt # Python dependencies
74
+ ├── environment.yml # Conda environment file
75
+ ├── README.md # Project documentation
76
+ └── run.py # Main training script
77
+ ```
78
+
79
+
80
+ ## 🏗️ Model Architecture
81
+
82
+ The system uses a hybrid architecture combining the strengths of different approaches:
83
+
84
+ ![Model Architecture](https://github.com/user-attachments/assets/82d243a2-42fa-4dad-b1af-8946767d4f44)
85
+
86
+ ### Core Components:
87
+ - **PhoBERT-Base**: Generates contextual embeddings for Vietnamese text
88
+ - **Linear + CRF Layer**: Handles sequence labeling with context awareness
89
+ - **Softmax/Random Forest**: Provides single-label prediction capabilities
90
+
91
+ ## 📊 Dataset & Performance
92
+
93
+ ### Dataset: VLSP2016
94
+ The model is trained on the VLSP2016 dataset extracted from Vietnamese news articles.
95
+
96
+ #### Dataset Statistics:
97
+ <table>
98
+ <tr>
99
+ <td><img src="https://github.com/user-attachments/assets/20116929-1556-44b2-86e9-086b72320f22" alt="Entity Frequency" width="600"/></td>
100
+ <td><img src="https://github.com/user-attachments/assets/9cafb068-bbda-4ee1-9fc9-bd4edded1438" alt="Entity Distribution" width="600"/></td>
101
+ </tr>
102
+ <tr>
103
+ <td><img src="https://github.com/user-attachments/assets/db9421c0-4e9c-4654-92d0-d924932384dc" alt="Token Length Distribution" width="600"/></td>
104
+ <td><img src="https://github.com/user-attachments/assets/70871bc5-ccb4-4186-9538-ac479c771415" alt="Sentence Length Distribution" width="600"/></td>
105
+ </tr>
106
+ </table>
107
+
108
+
109
+ ### Model Performance:
110
+ <table>
111
+ <tr>
112
+ <td>
113
+ <img src="https://github.com/user-attachments/assets/9fb24f3a-466c-46f1-94d2-bcb6f26abd72" alt="F1 Score" width="600"/>
114
+ </td>
115
+ <td>
116
+ <img src="https://github.com/user-attachments/assets/11b8080a-38d6-4ea2-b350-21361345fd1e" alt="Training Loss" width="600"/>
117
+ </td>
118
+ </tr>
119
+ </table>
120
+
121
+ ![Results Comparison](https://github.com/user-attachments/assets/e2fecc2c-8b27-4f28-a174-41078b17567c)
122
+
123
+ ## 🛠️ Installation & Setup
124
+
125
+ ### Prerequisites
126
+ - Python 3.10+
127
+ - Conda (recommended)
128
+
129
+ ### Option 1: Using `requirements.txt`
130
  ```bash
131
+ # Create and activate conda environment
132
  conda create --name vnner python=3.10
133
  conda activate vnner
134
+
135
+ # Install dependencies
136
  pip install -r requirements.txt
137
  ```
138
 
139
  ### Option 2: Using `environment.yml`
 
140
  ```bash
141
+ # Create environment from yml file
142
  conda env create -f environment.yml
143
  conda activate vnner
144
  ```
145
 
146
+ ## 🚀 Quick Start
147
+
148
+ ### Training the Model
149
  ```bash
150
  python run.py
151
  ```
 
152
 
153
+ ### Running the Streamlit App
154
+ ```bash
155
+ python src/app.py
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
  ```
157
 
158
+ ## 🧪 Experimental Approaches
159
+
160
+ The project explores multiple methodologies:
161
+
162
+ 1. **PhoBERT + CRF**: Sequential labeling with contextual embeddings
163
+ 2. **PhoBERT + Softmax**: Direct classification approach
164
+ 3. **Random Forest + Rule-based**: Traditional ML with linguistic rules
165
+ 4. **Random Forest + Semantic Embeddings**: Enhanced feature engineering
166
 
167
+ ## 🤝 Contributing
168
 
169
+ Contributions are welcome! Please feel free to submit a Pull Request.
170
 
171
+ ## 📄 License
172
+
173
+ This project is open source. Please check the repository for license details.
174
+
175
+ ## 🙏 Acknowledgments
176
+
177
+ - VLSP2016 dataset providers
178
+ - PhoBERT model creators
179
+ - Hugging Face for hosting the demo
180
+
181
+ ## 📚 Additional Resources
182
+
183
+ For better understanding of the project structure and technologies used:
184
+
185
+ - [Understanding `__init__.py`](https://zetcode.com/python/init-file/)
186
+ - [Markdown Basic Syntax](https://www.markdownguide.org/basic-syntax/#escaping-characters)
187
+ - [Requirements.txt vs Environment.yml](https://www.reddit.com/r/learnpython/comments/xvlpdz/why_do_people_provide_a_requirementstxt_or/)
188
+
189
+ ---
190
 
191
+ **Happy NER-ing! 🎯**
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/results/output.log ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Train Epoch 1/20: 100%|██████████| 736/736 [00:22<00:00, 32.46it/s, avg_loss=2.69, batch_loss=0.947]
2
+ Epoch 1: train_loss=2.6912, train_f1=0.8224, val_loss=1.0848, val_f1=0.8273
3
+ Saved imporved model to ./models/best_epoch_1.pt
4
+ Train Epoch 2/20: 100%|██████████| 736/736 [00:21<00:00, 33.55it/s, avg_loss=0.806, batch_loss=0.998]
5
+
6
+ Epoch 2: train_loss=0.8061, train_f1=0.8674, val_loss=0.7191, val_f1=0.8613
7
+ Saved imporved model to ./models/best_epoch_2.pt
8
+ Train Epoch 3/20: 100%|██████████| 736/736 [00:22<00:00, 32.59it/s, avg_loss=0.584, batch_loss=0.0527]
9
+
10
+ Epoch 3: train_loss=0.5842, train_f1=0.8996, val_loss=0.5643, val_f1=0.8895
11
+ Saved imporved model to ./models/best_epoch_3.pt
12
+ Train Epoch 4/20: 100%|██████████| 736/736 [00:23<00:00, 31.34it/s, avg_loss=0.478, batch_loss=1.06]
13
+
14
+ Epoch 4: train_loss=0.4782, train_f1=0.9122, val_loss=0.4838, val_f1=0.8994
15
+ Saved imporved model to ./models/best_epoch_4.pt
16
+ Train Epoch 5/20: 100%|██████████| 736/736 [00:22<00:00, 32.59it/s, avg_loss=0.406, batch_loss=0.421]
17
+
18
+ Epoch 5: train_loss=0.4056, train_f1=0.9254, val_loss=0.4281, val_f1=0.9101
19
+ Saved imporved model to ./models/best_epoch_5.pt
20
+ Train Epoch 6/20: 100%|██████████| 736/736 [00:21<00:00, 34.15it/s, avg_loss=0.36, batch_loss=1.01]
21
+
22
+ Epoch 6: train_loss=0.3599, train_f1=0.9343, val_loss=0.3934, val_f1=0.9190
23
+ Saved imporved model to ./models/best_epoch_6.pt
24
+ Train Epoch 7/20: 100%|██████████| 736/736 [00:22<00:00, 33.08it/s, avg_loss=0.322, batch_loss=0.392]
25
+
26
+ Epoch 7: train_loss=0.3218, train_f1=0.9383, val_loss=0.3751, val_f1=0.9192
27
+ Saved imporved model to ./models/best_epoch_7.pt
28
+ Train Epoch 8/20: 100%|██████████| 736/736 [00:22<00:00, 32.66it/s, avg_loss=0.294, batch_loss=0.468]
29
+
30
+ Epoch 8: train_loss=0.2942, train_f1=0.9424, val_loss=0.3560, val_f1=0.9189
31
+ Train Epoch 9/20: 100%|██████████| 736/736 [00:23<00:00, 31.68it/s, avg_loss=0.27, batch_loss=0.681]
32
+
33
+ Epoch 9: train_loss=0.2699, train_f1=0.9429, val_loss=0.3521, val_f1=0.9177
34
+ Train Epoch 10/20: 100%|██████████| 736/736 [00:21<00:00, 33.46it/s, avg_loss=0.252, batch_loss=0.525]
35
+
36
+ Epoch 10: train_loss=0.2517, train_f1=0.9493, val_loss=0.3413, val_f1=0.9222
37
+ Saved imporved model to ./models/best_epoch_10.pt
38
+ Train Epoch 11/20: 100%|██████████| 736/736 [00:22<00:00, 32.92it/s, avg_loss=0.238, batch_loss=0.022]
39
+
40
+ Epoch 11: train_loss=0.2383, train_f1=0.9551, val_loss=0.3292, val_f1=0.9232
41
+ Saved imporved model to ./models/best_epoch_11.pt
42
+ Train Epoch 12/20: 100%|██████████| 736/736 [00:23<00:00, 31.72it/s, avg_loss=0.222, batch_loss=0.529]
43
+
44
+ Epoch 12: train_loss=0.2223, train_f1=0.9543, val_loss=0.3305, val_f1=0.9207
45
+ Train Epoch 13/20: 100%|██████████| 736/736 [00:23<00:00, 31.74it/s, avg_loss=0.213, batch_loss=0.381]
46
+
47
+ Epoch 13: train_loss=0.2127, train_f1=0.9593, val_loss=0.3244, val_f1=0.9221
48
+ Train Epoch 14/20: 100%|██████████| 736/736 [00:23<00:00, 31.69it/s, avg_loss=0.203, batch_loss=0.279]
49
+
50
+ Epoch 14: train_loss=0.2026, train_f1=0.9609, val_loss=0.3213, val_f1=0.9224
51
+ Train Epoch 15/20: 100%|██████████| 736/736 [00:23<00:00, 31.84it/s, avg_loss=0.193, batch_loss=0.0462]
52
+
53
+ Epoch 15: train_loss=0.1925, train_f1=0.9574, val_loss=0.3392, val_f1=0.9117
54
+ Train Epoch 16/20: 100%|██████████| 736/736 [00:22<00:00, 32.11it/s, avg_loss=0.186, batch_loss=0.943]
55
+
56
+ Epoch 16: train_loss=0.1863, train_f1=0.9654, val_loss=0.3169, val_f1=0.9250
57
+ Saved imporved model to ./models/best_epoch_16.pt
58
+ Train Epoch 17/20: 100%|██████████| 736/736 [00:22<00:00, 32.38it/s, avg_loss=0.18, batch_loss=0.113]
59
+
60
+ Epoch 17: train_loss=0.1795, train_f1=0.9677, val_loss=0.3187, val_f1=0.9237
61
+ Train Epoch 18/20: 100%|██████████| 736/736 [00:22<00:00, 33.30it/s, avg_loss=0.173, batch_loss=0.00558]
62
+
63
+ Epoch 18: train_loss=0.1728, train_f1=0.9692, val_loss=0.3219, val_f1=0.9173
64
+ Train Epoch 19/20: 100%|██████████| 736/736 [00:23<00:00, 31.48it/s, avg_loss=0.167, batch_loss=0.115]
65
+
66
+ Epoch 19: train_loss=0.1673, train_f1=0.9681, val_loss=0.3261, val_f1=0.9195
67
+ Train Epoch 20/20: 100%|██████████| 736/736 [00:22<00:00, 32.17it/s, avg_loss=0.164, batch_loss=0.0463]
68
+
69
+ Epoch 20: train_loss=0.1640, train_f1=0.9715, val_loss=0.3230, val_f1=0.9185
70
+
71
+ Loading best model from ./models/best_epoch_16.pt for final evaluation...
72
+ Done
73
+
74
+ Evaluation on test set ...
75
+ Test_loss=0.2967, Test_f1=0.9087
76
+ precision recall f1-score support
77
+
78
+ 0 1.00 1.00 1.00 51036
79
+ 1 0.99 0.98 0.99 1112
80
+ 2 0.97 0.99 0.98 506
81
+ 3 0.86 0.79 0.82 180
82
+ 4 0.84 0.80 0.82 291
83
+ 5 0.89 0.91 0.90 939
84
+ 6 0.87 0.84 0.86 428
85
+
86
+ accuracy 0.99 54492
87
+ macro avg 0.92 0.90 0.91 54492
88
+ weighted avg 0.99 0.99 0.99 54492
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/results/output.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model Results
2
+ training_log = {
3
+ "epoch": list(range(1, 21)),
4
+ "train_loss": [
5
+ 2.6912, 0.8061, 0.5842, 0.4782, 0.4056,
6
+ 0.3599, 0.3218, 0.2942, 0.2699, 0.2517,
7
+ 0.2383, 0.2223, 0.2127, 0.2026, 0.1925,
8
+ 0.1863, 0.1795, 0.1728, 0.1673, 0.1640
9
+ ],
10
+ "val_loss": [
11
+ 1.0848, 0.7191, 0.5643, 0.4838, 0.4281,
12
+ 0.3934, 0.3751, 0.3560, 0.3521, 0.3413,
13
+ 0.3292, 0.3305, 0.3244, 0.3213, 0.3392,
14
+ 0.3169, 0.3187, 0.3219, 0.3261, 0.3230
15
+ ],
16
+ "train_f1": [
17
+ 0.8224, 0.8674, 0.8996, 0.9122, 0.9254,
18
+ 0.9343, 0.9383, 0.9424, 0.9429, 0.9493,
19
+ 0.9551, 0.9543, 0.9593, 0.9609, 0.9574,
20
+ 0.9654, 0.9677, 0.9692, 0.9681, 0.9715
21
+ ],
22
+ "val_f1": [
23
+ 0.8273, 0.8613, 0.8895, 0.8994, 0.9101,
24
+ 0.9190, 0.9192, 0.9189, 0.9177, 0.9222,
25
+ 0.9232, 0.9207, 0.9221, 0.9224, 0.9117,
26
+ 0.9250, 0.9237, 0.9173, 0.9195, 0.9185
27
+ ]
28
+ }
29
+
30
+ report_dict = {
31
+ 'O': {"precision": 1.00, "recall": 1.00, "f1-score": 1.00, "support": 51036},
32
+ 'B-PER': {"precision": 0.99, "recall": 0.98, "f1-score": 0.98, "support": 1112},
33
+ 'I-PER': {"precision": 0.97, "recall": 0.99, "f1-score": 0.98, "support": 506},
34
+ 'B-ORG': {"precision": 0.93, "recall": 0.95, "f1-score": 0.94, "support": 939},
35
+ 'I-ORG': {"precision": 0.93, "recall": 0.91, "f1-score": 0.92, "support": 428},
36
+ 'B-LOC': {"precision": 0.83, "recall": 0.84, "f1-score": 0.84, "support": 180},
37
+ 'I-LOC': {"precision": 0.88, "recall": 0.84, "f1-score": 0.86, "support": 291},
38
+ "accuracy": 0.99,
39
+ "macro avg": {"precision": 0.93, "recall": 0.93, "f1-score": 0.93, "support": 54492},
40
+ "weighted avg": {"precision": 0.99, "recall": 0.99, "f1-score": 0.99, "support": 54492}
41
+ }
42
+
43
+
44
+ report_dict_2 = {
45
+ 'O': {"precision": 1.00, "recall": 1.00, "f1-score": 1.00, "support": 68476},
46
+ 'B-PER': {"precision": 0.99, "recall": 0.98, "f1-score": 0.98, "support": 1464},
47
+ 'I-PER': {"precision": 0.98, "recall": 0.98, "f1-score": 0.98, "support": 686},
48
+ 'B-ORG': {"precision": 0.77, "recall": 0.82, "f1-score": 0.80, "support": 257},
49
+ 'I-ORG': {"precision": 0.80, "recall": 0.77, "f1-score": 0.78, "support": 430},
50
+ 'B-LOC': {"precision": 0.88, "recall": 0.90, "f1-score": 0.89, "support": 1241},
51
+ 'I-LOC': {"precision": 0.83, "recall": 0.82, "f1-score": 0.82, "support": 554},
52
+ "accuracy": 0.99,
53
+ "macro avg": {"precision": 0.89, "recall": 0.89, "f1-score": 0.89, "support": 73108},
54
+ "weighted avg": {"precision": 0.99, "recall": 0.99, "f1-score": 0.99, "support": 73108}
55
+ }
56
+
57
+
58
+ model_compare = {
59
+ "Header": ["Model", "F1", "Accuracy"],
60
+ "Data": {
61
+ "PhoBERT + CRF": {"F1": 0.93, "Accuracy": 0.99},
62
+ "CRF": {"F1": 0.91, "Accuracy": 0.99},
63
+ "Softmax": {"F1": 0.89, "Accuracy": 0.99},
64
+ "Random Forest": {"F1": 0.78, "Accuracy": 0.98}
65
+ }
66
+ }
67
+
68
+ data_compare = {
69
+ "Header": ["Data Preprocessing Strategy", "F1"],
70
+ "Data": {
71
+ "Raw": 0.93,
72
+ "Crawl for Balance": 0.91,
73
+ "Remove Sentences with Only 'O' Tags": 0.91
74
+ }
75
+ }
76
+
77
+
78
+
79
+ # EDA
80
+ data_aug_count_sorted = {
81
+ 'B-PER': 474,
82
+ 'I-PER': 121,
83
+ 'B-LOC': 874,
84
+ 'I-LOC': 289,
85
+ 'B-ORG': 1110,
86
+ 'I-ORG': 761
87
+ }
88
+
89
+ raw_data_count_sorted = {
90
+ 'B-PER': 7479,
91
+ 'I-PER': 3522,
92
+ 'B-LOC': 6244,
93
+ 'I-LOC': 2783,
94
+ 'B-ORG': 1212,
95
+ 'I-ORG': 2055,
96
+ 'B-NAT': 282,
97
+ 'I-NAT': 279
98
+ }
99
+
100
+ raw_data_count_withoutNAT_sorted = {
101
+ 'B-PER': 7479,
102
+ 'I-PER': 3522,
103
+ 'B-LOC': 6244,
104
+ 'I-LOC': 2783,
105
+ 'B-ORG': 1212,
106
+ 'I-ORG': 2055
107
+ }
108
+
109
+ combined_count_sorted = {
110
+ 'B-PER': 7953,
111
+ 'I-PER': 3643,
112
+ 'B-LOC': 7118,
113
+ 'I-LOC': 3072,
114
+ 'B-ORG': 2322,
115
+ 'I-ORG': 2816
116
+ }
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/.gitignore CHANGED
@@ -10,8 +10,6 @@ __pycache__/
10
 
11
  # Dataset and results folders
12
  data/
13
- results/
14
- outputs/
15
  logs/
16
 
17
  # Large files
 
10
 
11
  # Dataset and results folders
12
  data/
 
 
13
  logs/
14
 
15
  # Large files
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/requirements.txt CHANGED
Binary files a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/requirements.txt and b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/requirements.txt differ
 
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/.github/workflows/main.yml ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Deploy to Hugging Face Space
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main # hoặc branch bạn dùng
7
+
8
+ jobs:
9
+ deploy:
10
+ runs-on: ubuntu-latest
11
+
12
+ steps:
13
+ - name: Checkout repo
14
+ uses: actions/checkout@v3
15
+
16
+ - name: Set up Git
17
+ run: |
18
+ git config --global user.email "[email protected]"
19
+ git config --global user.name "GitHub Actions"
20
+
21
+ - name: Push to Hugging Face Spaces
22
+ env:
23
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
24
+ run: |
25
+ git clone https://huggingface.co/spaces/DucLai/Vietnamese_NER space
26
+
27
+ # Đồng bộ code vào repo Space (không copy .git)
28
+ rsync -av --exclude '.git' ./ space/
29
+
30
+ # Xoá file binary ra khỏi Git index trước khi commit
31
+ cd space
32
+ find . -type f \( \
33
+ -iname "*.png" -o \
34
+ -iname "*.jpg" -o \
35
+ -iname "*.jpeg" -o \
36
+ -iname "*.mp4" -o \
37
+ -iname "*.zip" -o \
38
+ -iname "*.pth" -o \
39
+ -iname "*.h5" -o \
40
+ -iname "*.tar.gz" -o \
41
+ -iname "*.wav" \
42
+ \) -exec git rm --cached {} \; || true
43
+
44
+ # Commit và push
45
+ git add .
46
+ git commit -m "Auto-deploy from GitHub (binary files removed)" || echo "No changes to commit"
47
+ git push https://DucLai:${HF_TOKEN}@huggingface.co/spaces/DucLai/Vietnamese_NER HEAD
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/.gitignore ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python cache
2
+ __pycache__/
3
+ *.pyc
4
+ *.pyo
5
+
6
+ # Hugging Face binary/model outputs
7
+ *.pth
8
+ *.h5
9
+ *.ckpt
10
+
11
+ # Dataset and results folders
12
+ data/
13
+ results/
14
+ outputs/
15
+ logs/
16
+
17
+ # Large files
18
+ *.zip
19
+ *.tar.gz
20
+ *.mp4
21
+ *.png
22
+ *.jpg
23
+ *.jpeg
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/configs/config.yaml ADDED
@@ -0,0 +1 @@
 
 
1
+ # placeholder — previous content ("ECHO is on.") was an accidental Windows `echo` artifact; add real configuration here
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/environment.yml ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ name: vnner
2
+ channels:
3
+ - defaults
4
+ - conda-forge
5
+ dependencies:
6
+ - python=3.10
7
+ - pip
8
+ - pip:
9
+ - -r requirements.txt
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/models/best_epoch_16.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:622cac3a55eec6a245f70c2ec7591d8fbfa8c18e13db7555915405fb57b145a0
3
+ size 24130
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/notebooks/Duc_Notebook.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/notebooks/Kien_RF_lightgbm.ipynb ADDED
@@ -0,0 +1,741 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "id": "10ec017cb658e125",
6
+ "metadata": {
7
+ "ExecuteTime": {
8
+ "end_time": "2025-06-11T00:21:33.244538Z",
9
+ "start_time": "2025-06-11T00:21:05.317283Z"
10
+ }
11
+ },
12
+ "source": [
13
+ "import pandas as pd\n",
14
+ "\n",
15
+ "splits = {'train': 'data/train-00000-of-00001-b0417886a268b83a.parquet', 'valid': 'data/valid-00000-of-00001-846411c236133ba3.parquet'}\n",
16
+ "df_train = pd.read_parquet(\"hf://datasets/datnth1709/VLSP2016-NER-data/\" + splits[\"train\"])\n",
17
+ "df_valid = pd.read_parquet(\"hf://datasets/datnth1709/VLSP2016-NER-data/\" + splits[\"valid\"])\n",
18
+ "df = pd.concat([df_train, df_valid]).reset_index(drop=True)"
19
+ ],
20
+ "outputs": [],
21
+ "execution_count": 1
22
+ },
23
+ {
24
+ "cell_type": "code",
25
+ "id": "c533c55a2ad7b16e",
26
+ "metadata": {
27
+ "ExecuteTime": {
28
+ "end_time": "2025-06-11T00:21:33.499341Z",
29
+ "start_time": "2025-06-11T00:21:33.262933Z"
30
+ }
31
+ },
32
+ "source": [
33
+ "# Tạo thêm các cột khác\n",
34
+ "def join_tokens(tokens):\n",
35
+ " text = ' '.join(tokens)\n",
36
+ " return text\n",
37
+ "\n",
38
+ "def reform_raw_text(tokens):\n",
39
+ " text = ' '.join(tokens)\n",
40
+ " return text.replace(\"_\", \" \")\n",
41
+ "\n",
42
+ "def label(x):\n",
43
+ " return [id_tag[int(i)] for i in x]\n",
44
+ "\n",
45
+ "def replace_7_8(lst):\n",
46
+ " return [0 if x in (7, 8) else x for x in lst]\n",
47
+ "\n",
48
+ "\n",
49
+ "tag_id = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6}\n",
50
+ "id_tag = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}\n",
51
+ "\n",
52
+ "\n",
53
+ "df['ner_tags'] = df['ner_tags'].apply(replace_7_8)\n",
54
+ "df['text_withseg'] = df['tokens'].apply(join_tokens)\n",
55
+ "df['text_raw'] = df['tokens'].apply(reform_raw_text)\n",
56
+ "df[\"ner_labels\"] = df.ner_tags.apply(label)\n",
57
+ "df.columns = ['tokens', 'id', 'seg_text', 'raw_text', 'labels']\n",
58
+ "df\n"
59
+ ],
60
+ "outputs": [
61
+ {
62
+ "data": {
63
+ "text/plain": [
64
+ " tokens \\\n",
65
+ "0 [Không_khí, thật, náo_nhiệt, .] \n",
66
+ "1 [Chị, Lãnh, và, Xăng, ra, đi, ,, mình, đứng, n... \n",
67
+ "2 [Suy_tính, mãi, ,, khóc, mãi, rồi, Phúc, lấy, ... \n",
68
+ "3 [Hoà, bảo, hồi, mới, qua, đâu, có, biết, nấu_n... \n",
69
+ "4 [Nhật_ký, của, thuyền_viên, .] \n",
70
+ "... ... \n",
71
+ "16853 [Nghe, thấy, đã, ghê_ghê, nhưng, Nhiêu, chưa, ... \n",
72
+ "16854 [Nhưng, mọi, chuyện, không, dừng, ở, đó, .] \n",
73
+ "16855 [Hoà, bảo, thời_gian, đầu, mặc_cảm, lắm, ,, ở,... \n",
74
+ "16856 [Biết_bao, người, đã, tình_nguyện, hiến_dâng, ... \n",
75
+ "16857 [Trên, đây, mới, là, “, thành_tích, ”, tiêu, t... \n",
76
+ "\n",
77
+ " id \\\n",
78
+ "0 [0, 0, 0, 0] \n",
79
+ "1 [0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n",
80
+ "2 [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ... \n",
81
+ "3 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, ... \n",
82
+ "4 [0, 0, 0, 0] \n",
83
+ "... ... \n",
84
+ "16853 [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, ... \n",
85
+ "16854 [0, 0, 0, 0, 0, 0, 0, 0] \n",
86
+ "16855 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n",
87
+ "16856 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] \n",
88
+ "16857 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n",
89
+ "\n",
90
+ " seg_text \\\n",
91
+ "0 Không_khí thật náo_nhiệt . \n",
92
+ "1 Chị Lãnh và Xăng ra đi , mình đứng nhìn hai ch... \n",
93
+ "2 Suy_tính mãi , khóc mãi rồi Phúc lấy ra tờ giấ... \n",
94
+ "3 Hoà bảo hồi mới qua đâu có biết nấu_nướng gì ,... \n",
95
+ "4 Nhật_ký của thuyền_viên . \n",
96
+ "... ... \n",
97
+ "16853 Nghe thấy đã ghê_ghê nhưng Nhiêu chưa được tườ... \n",
98
+ "16854 Nhưng mọi chuyện không dừng ở đó . \n",
99
+ "16855 Hoà bảo thời_gian đầu mặc_cảm lắm , ở trong nh... \n",
100
+ "16856 Biết_bao người đã tình_nguyện hiến_dâng cả cuộ... \n",
101
+ "16857 Trên đây mới là “ thành_tích ” tiêu tiền của m... \n",
102
+ "\n",
103
+ " raw_text \\\n",
104
+ "0 Không khí thật náo nhiệt . \n",
105
+ "1 Chị Lãnh và Xăng ra đi , mình đứng nhìn hai ch... \n",
106
+ "2 Suy tính mãi , khóc mãi rồi Phúc lấy ra tờ giấ... \n",
107
+ "3 Hoà bảo hồi mới qua đâu có biết nấu nướng gì ,... \n",
108
+ "4 Nhật ký của thuyền viên . \n",
109
+ "... ... \n",
110
+ "16853 Nghe thấy đã ghê ghê nhưng Nhiêu chưa được tườ... \n",
111
+ "16854 Nhưng mọi chuyện không dừng ở đó . \n",
112
+ "16855 Hoà bảo thời gian đầu mặc cảm lắm , ở trong nh... \n",
113
+ "16856 Biết bao người đã tình nguyện hiến dâng cả cuộ... \n",
114
+ "16857 Trên đây mới là “ thành tích ” tiêu tiền của m... \n",
115
+ "\n",
116
+ " labels \n",
117
+ "0 [O, O, O, O] \n",
118
+ "1 [O, B-PER, O, B-PER, O, O, O, O, O, O, O, O, O... \n",
119
+ "2 [O, O, O, O, O, O, B-PER, O, O, O, O, O, O, O,... \n",
120
+ "3 [B-PER, O, O, O, O, O, O, O, O, O, O, O, O, B-... \n",
121
+ "4 [O, O, O, O] \n",
122
+ "... ... \n",
123
+ "16853 [O, O, O, O, O, B-PER, O, O, O, O, O, O, O, O,... \n",
124
+ "16854 [O, O, O, O, O, O, O, O] \n",
125
+ "16855 [B-PER, O, O, O, O, O, O, O, O, O, O, O, O, O,... \n",
126
+ "16856 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O] \n",
127
+ "16857 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ... \n",
128
+ "\n",
129
+ "[16858 rows x 5 columns]"
130
+ ],
131
+ "text/html": [
132
+ "<div>\n",
133
+ "<style scoped>\n",
134
+ " .dataframe tbody tr th:only-of-type {\n",
135
+ " vertical-align: middle;\n",
136
+ " }\n",
137
+ "\n",
138
+ " .dataframe tbody tr th {\n",
139
+ " vertical-align: top;\n",
140
+ " }\n",
141
+ "\n",
142
+ " .dataframe thead th {\n",
143
+ " text-align: right;\n",
144
+ " }\n",
145
+ "</style>\n",
146
+ "<table border=\"1\" class=\"dataframe\">\n",
147
+ " <thead>\n",
148
+ " <tr style=\"text-align: right;\">\n",
149
+ " <th></th>\n",
150
+ " <th>tokens</th>\n",
151
+ " <th>id</th>\n",
152
+ " <th>seg_text</th>\n",
153
+ " <th>raw_text</th>\n",
154
+ " <th>labels</th>\n",
155
+ " </tr>\n",
156
+ " </thead>\n",
157
+ " <tbody>\n",
158
+ " <tr>\n",
159
+ " <th>0</th>\n",
160
+ " <td>[Không_khí, thật, náo_nhiệt, .]</td>\n",
161
+ " <td>[0, 0, 0, 0]</td>\n",
162
+ " <td>Không_khí thật náo_nhiệt .</td>\n",
163
+ " <td>Không khí thật náo nhiệt .</td>\n",
164
+ " <td>[O, O, O, O]</td>\n",
165
+ " </tr>\n",
166
+ " <tr>\n",
167
+ " <th>1</th>\n",
168
+ " <td>[Chị, Lãnh, và, Xăng, ra, đi, ,, mình, đứng, n...</td>\n",
169
+ " <td>[0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n",
170
+ " <td>Chị Lãnh và Xăng ra đi , mình đứng nhìn hai ch...</td>\n",
171
+ " <td>Chị Lãnh và Xăng ra đi , mình đứng nhìn hai ch...</td>\n",
172
+ " <td>[O, B-PER, O, B-PER, O, O, O, O, O, O, O, O, O...</td>\n",
173
+ " </tr>\n",
174
+ " <tr>\n",
175
+ " <th>2</th>\n",
176
+ " <td>[Suy_tính, mãi, ,, khóc, mãi, rồi, Phúc, lấy, ...</td>\n",
177
+ " <td>[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n",
178
+ " <td>Suy_tính mãi , khóc mãi rồi Phúc lấy ra tờ giấ...</td>\n",
179
+ " <td>Suy tính mãi , khóc mãi rồi Phúc lấy ra tờ giấ...</td>\n",
180
+ " <td>[O, O, O, O, O, O, B-PER, O, O, O, O, O, O, O,...</td>\n",
181
+ " </tr>\n",
182
+ " <tr>\n",
183
+ " <th>3</th>\n",
184
+ " <td>[Hoà, bảo, hồi, mới, qua, đâu, có, biết, nấu_n...</td>\n",
185
+ " <td>[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, ...</td>\n",
186
+ " <td>Hoà bảo hồi mới qua đâu có biết nấu_nướng gì ,...</td>\n",
187
+ " <td>Hoà bảo hồi mới qua đâu có biết nấu nướng gì ,...</td>\n",
188
+ " <td>[B-PER, O, O, O, O, O, O, O, O, O, O, O, O, B-...</td>\n",
189
+ " </tr>\n",
190
+ " <tr>\n",
191
+ " <th>4</th>\n",
192
+ " <td>[Nhật_ký, của, thuyền_viên, .]</td>\n",
193
+ " <td>[0, 0, 0, 0]</td>\n",
194
+ " <td>Nhật_ký của thuyền_viên .</td>\n",
195
+ " <td>Nhật ký của thuyền viên .</td>\n",
196
+ " <td>[O, O, O, O]</td>\n",
197
+ " </tr>\n",
198
+ " <tr>\n",
199
+ " <th>...</th>\n",
200
+ " <td>...</td>\n",
201
+ " <td>...</td>\n",
202
+ " <td>...</td>\n",
203
+ " <td>...</td>\n",
204
+ " <td>...</td>\n",
205
+ " </tr>\n",
206
+ " <tr>\n",
207
+ " <th>16853</th>\n",
208
+ " <td>[Nghe, thấy, đã, ghê_ghê, nhưng, Nhiêu, chưa, ...</td>\n",
209
+ " <td>[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...</td>\n",
210
+ " <td>Nghe thấy đã ghê_ghê nhưng Nhiêu chưa được tườ...</td>\n",
211
+ " <td>Nghe thấy đã ghê ghê nhưng Nhiêu chưa được tườ...</td>\n",
212
+ " <td>[O, O, O, O, O, B-PER, O, O, O, O, O, O, O, O,...</td>\n",
213
+ " </tr>\n",
214
+ " <tr>\n",
215
+ " <th>16854</th>\n",
216
+ " <td>[Nhưng, mọi, chuyện, không, dừng, ở, đó, .]</td>\n",
217
+ " <td>[0, 0, 0, 0, 0, 0, 0, 0]</td>\n",
218
+ " <td>Nhưng mọi chuyện không dừng ở đó .</td>\n",
219
+ " <td>Nhưng mọi chuyện không dừng ở đó .</td>\n",
220
+ " <td>[O, O, O, O, O, O, O, O]</td>\n",
221
+ " </tr>\n",
222
+ " <tr>\n",
223
+ " <th>16855</th>\n",
224
+ " <td>[Hoà, bảo, thời_gian, đầu, mặc_cảm, lắm, ,, ở,...</td>\n",
225
+ " <td>[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n",
226
+ " <td>Hoà bảo thời_gian đầu mặc_cảm lắm , ở trong nh...</td>\n",
227
+ " <td>Hoà bảo thời gian đầu mặc cảm lắm , ở trong nh...</td>\n",
228
+ " <td>[B-PER, O, O, O, O, O, O, O, O, O, O, O, O, O,...</td>\n",
229
+ " </tr>\n",
230
+ " <tr>\n",
231
+ " <th>16856</th>\n",
232
+ " <td>[Biết_bao, người, đã, tình_nguyện, hiến_dâng, ...</td>\n",
233
+ " <td>[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]</td>\n",
234
+ " <td>Biết_bao người đã tình_nguyện hiến_dâng cả cuộ...</td>\n",
235
+ " <td>Biết bao người đã tình nguyện hiến dâng cả cuộ...</td>\n",
236
+ " <td>[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]</td>\n",
237
+ " </tr>\n",
238
+ " <tr>\n",
239
+ " <th>16857</th>\n",
240
+ " <td>[Trên, đây, mới, là, “, thành_tích, ”, tiêu, t...</td>\n",
241
+ " <td>[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n",
242
+ " <td>Trên đây mới là “ thành_tích ” tiêu tiền của m...</td>\n",
243
+ " <td>Trên đây mới là “ thành tích ” tiêu tiền của m...</td>\n",
244
+ " <td>[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...</td>\n",
245
+ " </tr>\n",
246
+ " </tbody>\n",
247
+ "</table>\n",
248
+ "<p>16858 rows × 5 columns</p>\n",
249
+ "</div>"
250
+ ]
251
+ },
252
+ "execution_count": 2,
253
+ "metadata": {},
254
+ "output_type": "execute_result"
255
+ }
256
+ ],
257
+ "execution_count": 2
258
+ },
259
+ {
260
+ "cell_type": "code",
261
+ "id": "14d9b9fae58b7173",
262
+ "metadata": {
263
+ "ExecuteTime": {
264
+ "end_time": "2025-06-11T00:21:59.373985Z",
265
+ "start_time": "2025-06-11T00:21:34.524025Z"
266
+ }
267
+ },
268
+ "source": [
269
+ "import torch\n",
270
+ "from transformers import AutoTokenizer, AutoModel\n",
271
+ "from tqdm import tqdm\n",
272
+ "\n",
273
+ "# Load PhoBERT tokenizer và model\n",
274
+ "tokenizer = AutoTokenizer.from_pretrained(\"vinai/phobert-base\", use_fast=False)\n",
275
+ "model = AutoModel.from_pretrained(\"vinai/phobert-base\")\n",
276
+ "model.eval()"
277
+ ],
278
+ "outputs": [
279
+ {
280
+ "name": "stdout",
281
+ "output_type": "stream",
282
+ "text": [
283
+ "cuda\n"
284
+ ]
285
+ },
286
+ {
287
+ "data": {
288
+ "text/plain": [
289
+ "RobertaModel(\n",
290
+ " (embeddings): RobertaEmbeddings(\n",
291
+ " (word_embeddings): Embedding(64001, 768, padding_idx=1)\n",
292
+ " (position_embeddings): Embedding(258, 768, padding_idx=1)\n",
293
+ " (token_type_embeddings): Embedding(1, 768)\n",
294
+ " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
295
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
296
+ " )\n",
297
+ " (encoder): RobertaEncoder(\n",
298
+ " (layer): ModuleList(\n",
299
+ " (0-11): 12 x RobertaLayer(\n",
300
+ " (attention): RobertaAttention(\n",
301
+ " (self): RobertaSdpaSelfAttention(\n",
302
+ " (query): Linear(in_features=768, out_features=768, bias=True)\n",
303
+ " (key): Linear(in_features=768, out_features=768, bias=True)\n",
304
+ " (value): Linear(in_features=768, out_features=768, bias=True)\n",
305
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
306
+ " )\n",
307
+ " (output): RobertaSelfOutput(\n",
308
+ " (dense): Linear(in_features=768, out_features=768, bias=True)\n",
309
+ " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
310
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
311
+ " )\n",
312
+ " )\n",
313
+ " (intermediate): RobertaIntermediate(\n",
314
+ " (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
315
+ " (intermediate_act_fn): GELUActivation()\n",
316
+ " )\n",
317
+ " (output): RobertaOutput(\n",
318
+ " (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
319
+ " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
320
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
321
+ " )\n",
322
+ " )\n",
323
+ " )\n",
324
+ " )\n",
325
+ " (pooler): RobertaPooler(\n",
326
+ " (dense): Linear(in_features=768, out_features=768, bias=True)\n",
327
+ " (activation): Tanh()\n",
328
+ " )\n",
329
+ ")"
330
+ ]
331
+ },
332
+ "execution_count": 3,
333
+ "metadata": {},
334
+ "output_type": "execute_result"
335
+ }
336
+ ],
337
+ "execution_count": 3
338
+ },
339
+ {
340
+ "cell_type": "code",
341
+ "id": "a47ec382649c3036",
342
+ "metadata": {
343
+ "ExecuteTime": {
344
+ "end_time": "2025-06-11T00:23:23.888583Z",
345
+ "start_time": "2025-06-11T00:23:23.885204Z"
346
+ }
347
+ },
348
+ "source": [
349
+ "# Hàm gộp các embedding vectors của token bị tách ra khi qua SentencePiece\n",
350
+ "def group_embeddings(tokens, embeddings):\n",
351
+ " word_embeddings = []\n",
352
+ " current_vecs = []\n",
353
+ "\n",
354
+ " for token, emb in zip(tokens, embeddings):\n",
355
+ " if token in [\"<s>\", \"</s>\"]:\n",
356
+ " continue\n",
357
+ "\n",
358
+ " if token.endswith(\"@@\"):\n",
359
+ " current_vecs.append(emb)\n",
360
+ " else:\n",
361
+ " current_vecs.append(emb)\n",
362
+ " word_emb = torch.mean(torch.stack(current_vecs), dim=0)\n",
363
+ " word_embeddings.append(word_emb)\n",
364
+ " current_vecs = []\n",
365
+ "\n",
366
+ " if current_vecs: # Trong trường hợp sót lại cuối câu\n",
367
+ " word_emb = torch.mean(torch.stack(current_vecs), dim=0)\n",
368
+ " word_embeddings.append(word_emb)\n",
369
+ "\n",
370
+ " return word_embeddings"
371
+ ],
372
+ "outputs": [],
373
+ "execution_count": 4
374
+ },
375
+ {
376
+ "cell_type": "code",
377
+ "id": "f8c0ad89ae81b0c",
378
+ "metadata": {
379
+ "ExecuteTime": {
380
+ "end_time": "2025-06-11T00:25:52.567135Z",
381
+ "start_time": "2025-06-11T00:23:56.155322Z"
382
+ }
383
+ },
384
+ "source": [
385
+ "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
386
+ "model.to(device)\n",
387
+ "\n",
388
+ "all_embeddings = [] # list of [seq_len_i, 768] tensors\n",
389
+ "all_labels = [] # list of [seq_len_i,] tensors\n",
390
+ "len_em = []\n",
391
+ "\n",
392
+ "# count = 0\n",
393
+ "\n",
394
+ "for i, row in df.iterrows():\n",
395
+ "\n",
396
+ " # count += 1\n",
397
+ " # if count == 500:\n",
398
+ " # break\n",
399
+ "\n",
400
+ " # Truy cập phần tử từng dòng\n",
401
+ " sentence = row['seg_text']\n",
402
+ " gold_labels = row[\"id\"]\n",
403
+ "\n",
404
+ " # Cho sentence đi qua SentencePiece\n",
405
+ " input_ids = tokenizer.encode(sentence, return_tensors=\"pt\").to(device)\n",
406
+ "\n",
407
+ " tokens = tokenizer.convert_ids_to_tokens(input_ids[0].to(device))\n",
408
+ "\n",
409
+ " # Encode tạo embeddings\n",
410
+ " with torch.no_grad():\n",
411
+ " outputs = model(input_ids)\n",
412
+ " last_hidden_state = outputs.last_hidden_state.squeeze(0)\n",
413
+ "\n",
414
+ " # Gộp các embeddings đã bị tách khi đi qua SentencePiece\n",
415
+ " word_embeds = group_embeddings(tokens, last_hidden_state)\n",
416
+ "\n",
417
+ " # Kiểm tra số lượng embeddings và số lượng labels\n",
418
+ " if len(word_embeds) != len(gold_labels):\n",
419
+ " continue\n",
420
+ "\n",
421
+ " # Thêm vào list tổng / Tới đây là data đã sẵn sàng cho training\n",
422
+ " all_embeddings.append(torch.stack(word_embeds))\n",
423
+ " all_labels.append(torch.tensor(gold_labels))"
424
+ ],
425
+ "outputs": [],
426
+ "execution_count": 6
427
+ },
428
+ {
429
+ "metadata": {
430
+ "ExecuteTime": {
431
+ "end_time": "2025-06-11T00:35:23.255306Z",
432
+ "start_time": "2025-06-11T00:35:23.252026Z"
433
+ }
434
+ },
435
+ "cell_type": "code",
436
+ "source": "# We skip 43 data points since they aren't convertible",
437
+ "id": "c3e406ad994802be",
438
+ "outputs": [
439
+ {
440
+ "name": "stdout",
441
+ "output_type": "stream",
442
+ "text": [
443
+ "-43\n"
444
+ ]
445
+ }
446
+ ],
447
+ "execution_count": 15
448
+ },
449
+ {
450
+ "cell_type": "code",
451
+ "id": "cadc3a861025b3b9",
452
+ "metadata": {
453
+ "ExecuteTime": {
454
+ "end_time": "2025-06-11T00:36:18.857012Z",
455
+ "start_time": "2025-06-11T00:36:08.257408Z"
456
+ }
457
+ },
458
+ "source": [
459
+ "import numpy as np\n",
460
+ "from sklearn.model_selection import train_test_split\n",
461
+ "\n",
462
+ "X_flat = []\n",
463
+ "y_flat = []\n",
464
+ "\n",
465
+ "for emb_seq, label_seq in zip(all_embeddings, all_labels):\n",
466
+ " for emb, label in zip(emb_seq, label_seq):\n",
467
+ " X_flat.append(emb.cpu().numpy()) # emb: [768]\n",
468
+ " y_flat.append(label.item()) # label: int\n",
469
+ "\n",
470
+ "X_flat = np.array(X_flat) # [N, 768]\n",
471
+ "y_flat = np.array(y_flat) # [N]\n"
472
+ ],
473
+ "outputs": [],
474
+ "execution_count": 16
475
+ },
476
+ {
477
+ "cell_type": "code",
478
+ "id": "52a0fe72a50d4f73",
479
+ "metadata": {
480
+ "ExecuteTime": {
481
+ "end_time": "2025-06-11T00:39:58.211159Z",
482
+ "start_time": "2025-06-11T00:39:58.208074Z"
483
+ }
484
+ },
485
+ "source": [
486
+ "print(X_flat[0].shape)\n",
487
+ "print(y_flat.shape)"
488
+ ],
489
+ "outputs": [
490
+ {
491
+ "name": "stdout",
492
+ "output_type": "stream",
493
+ "text": [
494
+ "(768,)\n",
495
+ "(368172,)\n"
496
+ ]
497
+ }
498
+ ],
499
+ "execution_count": 19
500
+ },
501
+ {
502
+ "cell_type": "code",
503
+ "id": "d6275df555f0c4c3",
504
+ "metadata": {
505
+ "ExecuteTime": {
506
+ "end_time": "2025-06-11T00:42:00.129778Z",
507
+ "start_time": "2025-06-11T00:42:00.096986Z"
508
+ }
509
+ },
510
+ "source": [
511
+ "# Kiểm tra độ lệch data\n",
512
+ "unique_values, counts = np.unique(y_flat, return_counts=True)\n",
513
+ "\n",
514
+ "# In ra từng giá trị và số lần xuất hiện\n",
515
+ "for val, count in zip(unique_values, counts):\n",
516
+ " print(f\"Label {val}: {count} times\")\n"
517
+ ],
518
+ "outputs": [
519
+ {
520
+ "name": "stdout",
521
+ "output_type": "stream",
522
+ "text": [
523
+ "Label 0: 344986 times\n",
524
+ "Label 1: 7450 times\n",
525
+ "Label 2: 3504 times\n",
526
+ "Label 3: 1204 times\n",
527
+ "Label 4: 2050 times\n",
528
+ "Label 5: 6211 times\n",
529
+ "Label 6: 2767 times\n"
530
+ ]
531
+ }
532
+ ],
533
+ "execution_count": 24
534
+ },
535
+ {
536
+ "cell_type": "code",
537
+ "id": "664020977ba9a1e2",
538
+ "metadata": {
539
+ "ExecuteTime": {
540
+ "end_time": "2025-06-11T00:42:03.350616Z",
541
+ "start_time": "2025-06-11T00:42:02.915680Z"
542
+ }
543
+ },
544
+ "source": [
545
+ "X_train, X_test, y_train, y_test = train_test_split(\n",
546
+ " X_flat, y_flat, test_size=0.2, random_state=42, stratify=y_flat)\n"
547
+ ],
548
+ "outputs": [],
549
+ "execution_count": 25
550
+ },
551
+ {
552
+ "cell_type": "code",
553
+ "id": "d4acda9c7cae3214",
554
+ "metadata": {
555
+ "ExecuteTime": {
556
+ "end_time": "2025-06-11T00:42:25.235471Z",
557
+ "start_time": "2025-06-11T00:42:16.769480Z"
558
+ }
559
+ },
560
+ "source": [
561
+ "import lightgbm as lgb\n",
562
+ "from sklearn.metrics import accuracy_score, f1_score, classification_report\n",
563
+ "\n",
564
+ "\n",
565
+ "# Tạo Dataset cho LightGBM\n",
566
+ "train_data = lgb.Dataset(X_train, label=y_train)\n",
567
+ "test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)\n",
568
+ "\n",
569
+ "# Cấu hình tham số LightGBM (Random Forest mode)\n",
570
+ "params = {\n",
571
+ " \"objective\": \"multiclass\", # nếu multiclass classification\n",
572
+ " \"num_class\": len(np.unique(y_train)),\n",
573
+ " \"metric\": \"multi_logloss\",\n",
574
+ " \"boosting_type\": \"rf\", # random forest mode trong LightGBM\n",
575
+ " \"num_leaves\": 31,\n",
576
+ " \"bagging_freq\": 1,\n",
577
+ " \"bagging_fraction\": 0.8,\n",
578
+ " \"feature_fraction\": 0.8,\n",
579
+ " \"bagging_seed\": 42,\n",
580
+ " \"verbose\": -1,\n",
581
+ " \"seed\": 42,\n",
582
+ " \"is_unbalance\": True\n",
583
+ "}\n",
584
+ "\n",
585
+ "\n",
586
+ "\n",
587
+ "# Train model, tích hợp wandb callback để log metrics\n",
588
+ "model = lgb.train(\n",
589
+ " params,\n",
590
+ " train_data,\n",
591
+ " num_boost_round=2,\n",
592
+ " valid_sets=[train_data, test_data],\n",
593
+ " valid_names=[\"train\", \"test\"]\n",
594
+ ")\n",
595
+ "\n",
596
+ "# Dự đoán trên test set\n",
597
+ "y_pred_prob = model.predict(X_test)\n",
598
+ "y_pred = np.argmax(y_pred_prob, axis=1)\n",
599
+ "\n",
600
+ "# Ánh xạ số về nhãn tên entity\n",
601
+ "label_map = {\n",
602
+ " 0: 'O',\n",
603
+ " 1: 'B-PER',\n",
604
+ " 2: 'I-PER',\n",
605
+ " 3: 'B-ORG',\n",
606
+ " 4: 'I-ORG',\n",
607
+ " 5: 'B-LOC',\n",
608
+ " 6: 'I-LOC'\n",
609
+ "}\n",
610
+ "\n",
611
+ "# Chuyển y_test và y_pred sang nhãn gốc\n",
612
+ "y_test_labels = [label_map[i] for i in y_test]\n",
613
+ "y_pred_labels = [label_map[i] for i in y_pred]\n",
614
+ "\n",
615
+ "# In classification report với nhãn thật\n",
616
+ "print(\"\\nClassification Report (theo label gốc):\")\n",
617
+ "print(classification_report(y_test_labels, y_pred_labels, digits=4))\n",
618
+ "\n",
619
+ "\n"
620
+ ],
621
+ "outputs": [
622
+ {
623
+ "name": "stdout",
624
+ "output_type": "stream",
625
+ "text": [
626
+ "\n",
627
+ "Classification Report (theo label gốc):\n",
628
+ " precision recall f1-score support\n",
629
+ "\n",
630
+ " B-LOC 0.3679 0.5000 0.4239 1242\n",
631
+ " B-ORG 0.2639 0.3942 0.3161 241\n",
632
+ " B-PER 0.4395 0.7490 0.5540 1490\n",
633
+ " I-LOC 0.2321 0.4448 0.3050 553\n",
634
+ " I-ORG 0.1532 0.2878 0.2000 410\n",
635
+ " I-PER 0.4304 0.5863 0.4964 701\n",
636
+ " O 0.9869 0.9478 0.9669 68998\n",
637
+ "\n",
638
+ " accuracy 0.9235 73635\n",
639
+ " macro avg 0.4106 0.5586 0.4660 73635\n",
640
+ "weighted avg 0.9474 0.9235 0.9336 73635\n",
641
+ "\n"
642
+ ]
643
+ }
644
+ ],
645
+ "execution_count": 26
646
+ },
647
+ {
648
+ "metadata": {
649
+ "ExecuteTime": {
650
+ "end_time": "2025-06-11T00:45:00.649942Z",
651
+ "start_time": "2025-06-11T00:45:00.646595Z"
652
+ }
653
+ },
654
+ "cell_type": "code",
655
+ "source": "print(model.feature_importance().shape)",
656
+ "id": "b1cf76bc3e58bc93",
657
+ "outputs": [
658
+ {
659
+ "name": "stdout",
660
+ "output_type": "stream",
661
+ "text": [
662
+ "(768,)\n"
663
+ ]
664
+ }
665
+ ],
666
+ "execution_count": 35
667
+ },
668
+ {
669
+ "metadata": {
670
+ "ExecuteTime": {
671
+ "end_time": "2025-06-11T00:52:36.844604Z",
672
+ "start_time": "2025-06-11T00:52:36.827018Z"
673
+ }
674
+ },
675
+ "cell_type": "code",
676
+ "source": [
677
+ "correct = 0\n",
678
+ "for i in range(73635):\n",
679
+ " if y_pred[i] == y_test[i]:\n",
680
+ " correct += 1\n",
681
+ "correct"
682
+ ],
683
+ "id": "39d391e67a51211c",
684
+ "outputs": [
685
+ {
686
+ "data": {
687
+ "text/plain": [
688
+ "68001"
689
+ ]
690
+ },
691
+ "execution_count": 58,
692
+ "metadata": {},
693
+ "output_type": "execute_result"
694
+ }
695
+ ],
696
+ "execution_count": 58
697
+ },
698
+ {
699
+ "metadata": {
700
+ "ExecuteTime": {
701
+ "end_time": "2025-06-11T00:57:45.109129Z",
702
+ "start_time": "2025-06-11T00:57:45.105078Z"
703
+ }
704
+ },
705
+ "cell_type": "code",
706
+ "source": "print(y_test.shape)",
707
+ "id": "1a0ba8f0410c5589",
708
+ "outputs": [
709
+ {
710
+ "name": "stdout",
711
+ "output_type": "stream",
712
+ "text": [
713
+ "(73635,)\n"
714
+ ]
715
+ }
716
+ ],
717
+ "execution_count": 61
718
+ }
719
+ ],
720
+ "metadata": {
721
+ "kernelspec": {
722
+ "display_name": "Python 3",
723
+ "language": "python",
724
+ "name": "python3"
725
+ },
726
+ "language_info": {
727
+ "codemirror_mode": {
728
+ "name": "ipython",
729
+ "version": 2
730
+ },
731
+ "file_extension": ".py",
732
+ "mimetype": "text/x-python",
733
+ "name": "python",
734
+ "nbconvert_exporter": "python",
735
+ "pygments_lexer": "ipython2",
736
+ "version": "2.7.6"
737
+ }
738
+ },
739
+ "nbformat": 4,
740
+ "nbformat_minor": 5
741
+ }
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/notebooks/Kien_Rule_base.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/notebooks/Softmax_PhoBERT.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/requirements.txt ADDED
Binary file (2.43 kB). View file
 
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/run.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.preprocessing import download_raw_data, preprocess_data_for_EDA, load_phoBERT_model_and_tokenizer, create_embeddings, split_dataset
2
+ from src.data_set import NerDataset, collate_fn
3
+ from src.configs import configs
4
+ from src.model import CRF_Tagger
5
+ from src.train import train_model
6
+
7
+ import torch
8
+ from torch.utils.data import DataLoader
9
+
10
+ import warnings
11
+ warnings.filterwarnings("ignore")
12
+
13
+
14
+ def main():
15
+
16
+ # Download VLSP2016 from hgface
17
+ print("Download raw data ...")
18
+ df = download_raw_data()
19
+
20
+ # Save raw data
21
+ df.to_csv(r".\data\raw_data.csv", index=False)
22
+ print("Save at data\raw_data.csv \n")
23
+
24
+ # Process data for EDA
25
+ print("Process data for EDA ...")
26
+ df = preprocess_data_for_EDA(df)
27
+ df.to_csv(r".\data\processed_data_EDA.csv", index=False)
28
+ print("Save at data\processed_data_EDA.csv \n")
29
+
30
+ # Init PhoBERT Tokenizer and PhoBERT Model
31
+ print("Embedding data ...")
32
+ model, tokenizer = load_phoBERT_model_and_tokenizer()
33
+
34
+ # Embeddings data
35
+ processed_data = create_embeddings(df, model, tokenizer)
36
+ torch.save(processed_data, r".\data\processed_data_full.pt")
37
+ print("Save at data\processed_data_full.pt \n")
38
+
39
+ # Split data into train/valid/test
40
+ print("Train/Valid/Test Split ...")
41
+ X_train, Y_train, X_val, Y_val, X_test, Y_test = split_dataset(processed_data)
42
+ print("Done \n")
43
+
44
+ # Data Agumentation for training set
45
+ # Pass
46
+
47
+ # Init DataLoader
48
+ print("Init DataLoader ...")
49
+ datasets = {
50
+ 'train': NerDataset(X_train, Y_train),
51
+ 'val': NerDataset(X_val, Y_val),
52
+ 'test': NerDataset(X_test, Y_test)
53
+ }
54
+
55
+ loaders = {
56
+ split: DataLoader(dataset, batch_size=configs["batch_size"], shuffle=(split=='train'), collate_fn=collate_fn)
57
+ for split, dataset in datasets.items()
58
+ }
59
+ print("Done \n")
60
+
61
+ # Init sequence label model
62
+ print("Init Model ...")
63
+ NUM_TAGS = 7
64
+ model = CRF_Tagger(input_dim=X_train[0].size(1), num_tags=NUM_TAGS)
65
+ optimizer = torch.optim.Adam(model.parameters(), lr=configs["learning_rate"])
66
+ print("Done \n")
67
+
68
+ # Training Model
69
+ print("Start training ...")
70
+ train_model(model, optimizer, configs, loaders)
71
+
72
+ if __name__ == "__main__":
73
+ main()
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/README.md ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Vietnamese NER Demo
3
+ emoji: 🧠
4
+ colorFrom: indigo
5
+ colorTo: yellow
6
+ sdk: streamlit
7
+ sdk_version: 1.46.1
8
+ app_file: src/app.py
9
+ pinned: false
10
+ ---
11
+ # Vietnamese Named Entity Recognition
12
+
13
+ ## 🛠️ Set Up Your Environment With Conda
14
+
15
+ ### Option 1: Using `requirements.txt`
16
+
17
+ ```bash
18
+ conda create --name vnner python=3.10
19
+ conda activate vnner
20
+ pip install -r requirements.txt
21
+ ```
22
+
23
+ ### Option 2: Using `environment.yml`
24
+
25
+ ```bash
26
+ conda env create -f environment.yml
27
+ conda activate vnner
28
+ ```
29
+
30
+ ## Run
31
+ ```bash
32
+ python run.py
33
+ ```
34
+ ---
35
+
36
+ ## 📂 Project Structure
37
+
38
+ ```
39
+ my_ai_project/
40
+
41
+ ├── data/
42
+ │ ├── raw_data.csv # Dữ liệu gốc
43
+ │ ├── processed_data_EDA.csv # Dữ liệu sau khi tiền xử lý
44
+ │ └── processed_data_full.csv # Dữ liệu sẵn sàng training
45
+
46
+ ├── notebooks/ # Thử nghiệm và khám phá dữ liệu
47
+ │ ├── Duc_Notebook.ipynb # CRF + RandomForest
48
+ │ ├── Softmax_PhoBERT.ipynb # Softmax
49
+
50
+ ├── src/ # Mã nguồn chính của dự án
51
+ │ ├── __init__.py
52
+ │ ├── data_loader.py # Nạp và xử lý dữ liệu
53
+ │ ├── preprocessing.py # Hàm tiền xử lý dữ liệu
54
+ │ ├── model.py # Định nghĩa kiến trúc mô hình
55
+ │ ├── train.py # Huấn luyện mô hình
56
+ │ ├── evaluate.py # Đánh giá mô hình
57
+ │ └── predict.py # Dự đoán với mô hình đã huấn luyện
58
+
59
+ ├── models/ # Mô hình đã lưu sau khi huấn luyện
60
+ │ └── best_model.pth # File trọng số mô hình
61
+
62
+ ├── outputs/ # Kết quả, biểu đồ, log, metrics
63
+ │ ├── logs/ # Nhật ký huấn luyện (tensorboard/logging)
64
+ │ └── figures/ # Biểu đồ trực quan hóa
65
+
66
+ ├── configs/ # File cấu hình cho mô hình, huấn luyện
67
+ │ └── config.yaml
68
+
69
+ ├── tests/ # Unit test cho các hàm chính
70
+
71
+ ├── requirements.txt # Thư viện cần cài đặt
72
+ ├── environment.yml # Môi trường Conda
73
+ ├── README.md # Giới thiệu dự án
74
+ └── run.py # Script chính để chạy toàn bộ pipeline
75
+ ```
76
+
77
+ ---
78
+
79
+ ## 📚 Additional Resources (Optional)
80
+
81
+ If you have any questions about the project structure, consider reading these helpful articles first:
82
+
83
+ * [Understanding `__init__.py`](https://zetcode.com/python/init-file/)
84
+ * [Markdown Basic Syntax](https://www.markdownguide.org/basic-syntax/#escaping-characters)
85
+ * [Difference Between `requirements.txt` and `environment.yml`](https://www.reddit.com/r/learnpython/comments/xvlpdz/why_do_people_provide_a_requirementstxt_or/)
86
+
87
+ These resources could be useful for you!
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ """Marks the directory as a Python package."""
2
+ __version__ = "1.0.0"
3
+ __author__ = "Duc Lai"
4
+ PACKAGE_NAME = "src"
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/app.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ from src.predict import predict_demo
4
+ from src.front import render_html
5
+
6
+ st.set_page_config(page_title="Vietnamese NER", layout="wide")
7
+
8
+ # ===== Tiêu đề chính =====
9
+ st.title("🔍 Ứng dụng nhận diện thực thể có tên (NER) cho tiếng Việt")
10
+
11
+ # Tabs
12
+ tab1, tab2, tab3 = st.tabs(["📊 Phân tích dữ liệu", "📈 Kết quả huấn luyện", "🧪 Demo mô hình"])
13
+
14
+ # --- Tab 1: PHÂN TÍCH DỮ LIỆU ---
15
+ with tab1:
16
+ st.header("📊 Phân tích dữ liệu")
17
+
18
+ df = pd.DataFrame({
19
+ "Loại thực thể": ["PER", "LOC", "ORG", "MISC"],
20
+ "Số lượng": [3200, 2500, 1800, 900]
21
+ })
22
+
23
+ st.bar_chart(df.set_index("Loại thực thể"))
24
+
25
+ # --- Tab 2: KẾT QUẢ HUẤN LUYỆN ---
26
+ with tab2:
27
+ st.header("📈 Kết quả huấn luyện")
28
+
29
+ loss = [0.9, 0.7, 0.5, 0.35, 0.28]
30
+ epoch = [1, 2, 3, 4, 5]
31
+ df_loss = pd.DataFrame({"Epoch": epoch, "Loss": loss})
32
+ st.line_chart(df_loss.set_index("Epoch"))
33
+
34
+ st.subheader("Đánh giá mô hình")
35
+ df_eval = pd.DataFrame({
36
+ "Phiên bản": ["v1", "v2", "v3"],
37
+ "F1-score": [0.78, 0.83, 0.86],
38
+ "Accuracy": [0.81, 0.85, 0.88]
39
+ })
40
+ st.dataframe(df_eval)
41
+
42
+ # --- Tab 3: DEMO MÔ HÌNH ---
43
+ with tab3:
44
+ st.header("🧪 Vietnamese Named Entity Recognition")
45
+
46
+ text = st.text_input("Nhập văn bản tiếng Việt:", "Nguyễn Văn A đang làm việc tại Hà Nội")
47
+
48
+ if st.button("Phân tích"):
49
+ if not text.strip():
50
+ st.warning("Vui lòng nhập văn bản!")
51
+ else:
52
+ tokens, labels = predict_demo(text)
53
+
54
+ st.subheader("Thực thể được phát hiện")
55
+ entities = [(tok, lab) for tok, lab in zip(tokens, labels) if lab != "O"]
56
+
57
+ if entities:
58
+ for tok, lab in entities:
59
+ st.markdown(f"🔹 **{tok}** — *{lab}*")
60
+ else:
61
+ st.info("Không phát hiện thực thể.")
62
+
63
+ st.subheader("Highlight trong văn bản:")
64
+ st.markdown(render_html(tokens, labels), unsafe_allow_html=True)
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/configs.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ configs = {
2
+ # Init
3
+ "project": "NER",
4
+ "name": "CRF_VLSP2016_Ultra",
5
+ "model": "Linear/CRF",
6
+
7
+ # Hyperparameters
8
+ "optim": "Adam",
9
+ "learning_rate": 1e-3,
10
+ "batch_size": 16,
11
+ "epochs": 20,
12
+ "train_ratio": 0.7,
13
+ "val_ratio": 0.15,
14
+ "test_ratio": 0.15
15
+ }
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/data_set.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from torch.utils.data import Dataset
2
+ import torch
3
+
4
+ class NerDataset(Dataset):
5
+ def __init__(self, embeddings, labels):
6
+ super().__init__()
7
+ self.embeddings = embeddings
8
+ self.labels = labels
9
+
10
+ def __len__(self):
11
+ return len(self.embeddings)
12
+
13
+ def __getitem__(self, idx):
14
+ return self.embeddings[idx], self.labels[idx]
15
+
16
+ def collate_fn(batch): # Batch_size x Seq_length x 768
17
+ embeddings, labels = zip(*batch)
18
+ lengths = [e.size(0) for e in embeddings]
19
+ max_len = max(lengths)
20
+
21
+ padded_embs = torch.stack([
22
+ torch.cat([e, torch.zeros(max_len - e.size(0), e.size(1))]) for e in embeddings
23
+ ])
24
+
25
+ padded_labels = torch.stack([
26
+ torch.cat([l, torch.full((max_len - l.size(0),), -1, dtype=torch.long)]) for l in labels
27
+ ])
28
+
29
+ return padded_embs, padded_labels, lengths
30
+
31
+
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/evaluate.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.predict import predict
2
+ from sklearn.metrics import precision_recall_fscore_support, accuracy_score, classification_report
3
+
4
+ def evaluate(model, loader, count_loss=True, report=False):
5
+
6
+ # Model Preidction (Inference)
7
+ all_preds, all_true, loss = predict(model, loader, count_loss)
8
+ class_report = None
9
+
10
+ # Get evaluation metric
11
+ precision, recall, f1, _ = precision_recall_fscore_support(all_true, all_preds, average='macro', zero_division=0)
12
+ acc = accuracy_score(all_true, all_preds)
13
+
14
+ # Get classification report
15
+ if report:
16
+ class_report = classification_report(all_true, all_preds)
17
+
18
+ return precision, recall, f1, acc, loss, class_report
19
+
20
+ def evaluate_ignore_O(model, loader):
21
+ pass
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/front.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ def render_html(tokens, labels):
2
+ """
3
+ Tô màu highlight theo nhãn IOB, với màu khác nhau cho PER, ORG, LOC
4
+ """
5
+ label_colors = {
6
+ "PER": "lightcoral", # đỏ nhạt
7
+ "ORG": "lightblue", # xanh nhạt
8
+ "LOC": "lightgreen", # xanh lá nhạt
9
+ }
10
+
11
+ html = ""
12
+ current_label = None
13
+
14
+ for tok, label in zip(tokens, labels):
15
+ if label.startswith("B-"):
16
+ if current_label:
17
+ html += "</span> "
18
+ current_label = label[2:]
19
+ color = label_colors.get(current_label, "lightgray")
20
+ html += f"<span style='background-color:{color};padding:2px;border-radius:4px;' title='{current_label}'>{tok}"
21
+ elif label.startswith("I-") and current_label:
22
+ html += f" {tok}"
23
+ else:
24
+ if current_label:
25
+ html += "</span> "
26
+ current_label = None
27
+ html += f"{tok} "
28
+
29
+ if current_label:
30
+ html += "</span>"
31
+
32
+ return f"<div style='font-family:monospace;font-size:16px'>{html.strip()}</div>"
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/model.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from torchcrf import CRF
2
+ import torch.nn as nn
3
+
4
+ class CRF_Tagger(nn.Module):
5
+ def __init__(self, input_dim, num_tags):
6
+ super().__init__()
7
+ self.embed2tag = nn.Linear(input_dim, num_tags)
8
+ self.crf = CRF(num_tags, batch_first=True)
9
+
10
+ def forward(self, x, labels, mask):
11
+ emissions = self.embed2tag(x)
12
+ return -self.crf(emissions, labels, mask=mask, reduction="mean")
13
+
14
+ def decode(self, x, mask=None):
15
+ emissions = self.embed2tag(x)
16
+ return self.crf.decode(emissions, mask)
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/predict.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from src.model import CRF_Tagger
3
+ from src.preprocessing import process_demo_sentence
4
+
5
+ def predict(model, loader, count_loss=True):
6
+
7
+ model.eval() # Evaluation Mode, Ignore Dropout, BatchNorm, ...
8
+ all_preds, all_true = [], []
9
+ loss = 0.0
10
+
11
+ with torch.no_grad(): # Stop track gradient
12
+ for x, y, _ in loader:
13
+ mask = (y != -1)
14
+
15
+ # Get loss
16
+ if count_loss:
17
+ loss += model(x, y, mask).item()
18
+
19
+ # Get prediction
20
+ preds = model.decode(x, mask)
21
+
22
+ # Loop for each sentence in mini-batch
23
+ for pred_seq, true_seq, m in zip(preds, y, mask):
24
+ true_labels = true_seq[m].tolist() # tensor[mask tensor boolean]
25
+ all_preds.extend(pred_seq)
26
+ all_true.extend(true_labels)
27
+
28
+ return all_preds, all_true, loss/len(loader)
29
+
30
+ def predict_demo(text):
31
+
32
+
33
+ id_tag = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}
34
+
35
+ x, tokens = process_demo_sentence(text) # 1 x seq_length x 768
36
+ NUM_TAGS = 7
37
+
38
+ model = CRF_Tagger(input_dim=x.size(2), num_tags=NUM_TAGS)
39
+ model.load_state_dict(torch.load(".\models\best_epoch_16.pt"))
40
+ model.eval()
41
+ with torch.no_grad():
42
+ preds = model.decode(x)
43
+
44
+ labels = [id_tag[lab] for lab in preds[0]] # preds[0] vì sẽ trả về nhiều batch nhưng chúng ta chỉ có 1
45
+
46
+ return tokens, labels
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/preprocessing.py ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import torch
3
+ from transformers import AutoTokenizer, AutoModel
4
+ from tqdm import tqdm
5
+ from sklearn.model_selection import train_test_split
6
+ from src.configs import configs
7
+ from pyvi import ViTokenizer
8
+
9
+ def join_tokens(tokens):
10
+ text = ' '.join(tokens)
11
+ return text
12
+
13
+ def reform_raw_text(tokens):
14
+ text = ' '.join(tokens)
15
+ return text.replace("_", " ")
16
+
17
+ def label(x, ):
18
+ id_tag = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}
19
+ return [id_tag[int(i)] for i in x]
20
+
21
+ def replace_7_8(lst):
22
+ return [0 if x in (7, 8) else x for x in lst]
23
+
24
+ # Hàm gộp các embedding vectors của token bị tách ra khi qua SentencePiece
25
+ def group_embeddings(tokens, embeddings):
26
+ word_embeddings = []
27
+ current_vecs = []
28
+
29
+ for token, emb in zip(tokens, embeddings):
30
+ if token in ["<s>", "</s>"]:
31
+ continue
32
+
33
+ if token.endswith("@@"):
34
+ current_vecs.append(emb)
35
+ else:
36
+ current_vecs.append(emb)
37
+ word_emb = torch.mean(torch.stack(current_vecs), dim=0)
38
+ word_embeddings.append(word_emb)
39
+ current_vecs = []
40
+
41
+ if current_vecs: # Trong trường hợp sót lại cuối câu
42
+ word_emb = torch.mean(torch.stack(current_vecs), dim=0)
43
+ word_embeddings.append(word_emb)
44
+
45
+ return word_embeddings
46
+
47
+
48
+ # Download the dataset form Hugging Face
49
+ def download_raw_data():
50
+ splits = {'train': 'data/train-00000-of-00001-b0417886a268b83a.parquet', 'valid': 'data/valid-00000-of-00001-846411c236133ba3.parquet'}
51
+ df_train = pd.read_parquet("hf://datasets/datnth1709/VLSP2016-NER-data/" + splits["train"])
52
+ df_valid = pd.read_parquet("hf://datasets/datnth1709/VLSP2016-NER-data/" + splits["valid"])
53
+ df = pd.concat([df_train, df_valid]).reset_index(drop=True)
54
+
55
+ return df
56
+
57
+ # Process dataframe for EDA
58
+ def preprocess_data_for_EDA(df):
59
+ # Define tag - id
60
+ tag_id = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6}
61
+ id_tag = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}
62
+
63
+ # Add columns and remove inappropriate tags
64
+ df['ner_tags'] = df['ner_tags'].apply(replace_7_8)
65
+ df['text_withseg'] = df['tokens'].apply(join_tokens)
66
+ df['text_raw'] = df['tokens'].apply(reform_raw_text)
67
+ df["ner_labels"] = df.ner_tags.apply(label)
68
+ df.columns = ['tokens', 'id_labels', 'seg_text', 'raw_text', 'labels']
69
+
70
+ return df
71
+
72
+
73
+
74
+
75
+ def load_phoBERT_model_and_tokenizer():
76
+ # Load PhoBERT tokenizer và model
77
+ tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base", use_fast=False)
78
+ model = AutoModel.from_pretrained("vinai/phobert-base")
79
+ model.eval()
80
+ return model, tokenizer
81
+
82
+
83
+ # Embedding text
84
+ def create_embeddings(df, model, tokenizer):
85
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
86
+ model.to(device)
87
+
88
+ all_embeddings = [] # list of [seq_len_i, 768] tensors
89
+ all_labels = [] # list of [seq_len_i,] tensors
90
+ remove_index = []
91
+
92
+ for i, row in tqdm(df.iterrows(), total=len(df)):
93
+
94
+ # Truy cập phần tử từng dòng
95
+ sentence = row['seg_text']
96
+ gold_labels = row["id_labels"]
97
+
98
+ # Cho sentence đi qua SentencePiece
99
+ input_ids = tokenizer.encode(sentence, return_tensors="pt").to(device)
100
+
101
+ tokens = tokenizer.convert_ids_to_tokens(input_ids[0].cpu())
102
+
103
+ # Encode tạo embeddings
104
+ with torch.no_grad():
105
+ outputs = model(input_ids)
106
+ last_hidden_state = outputs.last_hidden_state.squeeze(0).cpu()
107
+
108
+ # Gộp các embeddings đã bị tách khi đi qua SentencePiece
109
+ word_embeds = group_embeddings(tokens, last_hidden_state)
110
+
111
+ # Kiểm tra số lượng embeddings và số lượng labels, nếu conflict -> xóa dòng đó
112
+ if len(word_embeds) != len(gold_labels):
113
+ # print(f"Warning: Skip row {i} - length mismatch")
114
+ remove_index.append(i)
115
+ continue
116
+
117
+ # Thêm vào list tổng & Tới đây là data đã sẵn sàng cho training
118
+ all_embeddings.append(torch.stack(word_embeds))
119
+ all_labels.append(torch.tensor(gold_labels))
120
+
121
+ # Create Dict
122
+ processed_data = {
123
+ "embeddings": all_embeddings,
124
+ "labels": all_labels
125
+ }
126
+
127
+ return processed_data
128
+
129
+
130
+ def split_dataset(data):
131
+
132
+ # Train_Val / Test Split
133
+ X_train_val, X_test, Y_train_val, Y_test = train_test_split(data["embeddings"], data["labels"], test_size=configs["test_ratio"], random_state=42)
134
+
135
+ # Train / Val Split
136
+ val_rest_ratio = configs["val_ratio"] / (configs["val_ratio"] + configs["train_ratio"])
137
+ X_train, X_val, Y_train, Y_val = train_test_split(X_train_val, Y_train_val, test_size = val_rest_ratio, random_state=42)
138
+
139
+ return X_train, Y_train, X_val, Y_val, X_test, Y_test
140
+
141
+
142
+ # TODO: Refactor hàm process_demo_sentence, và hàm predict demo, warning nếu độ dài tokens_word không bằng độ dài sau group_embeddings
143
+
144
+ def process_demo_sentence(text):
145
+ """
146
+ Trả về tensor shape 1 x Seq_length x 768
147
+ """
148
+ segmented_text = ViTokenizer.tokenize(text)
149
+ tokens_word = segmented_text.strip().split(" ")
150
+
151
+ model, tokenizer = load_phoBERT_model_and_tokenizer()
152
+
153
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
154
+ model.to(device)
155
+
156
+ input_ids = tokenizer.encode(segmented_text, return_tensors="pt").to(device)
157
+
158
+ tokens = tokenizer.convert_ids_to_tokens(input_ids[0].cpu())
159
+
160
+ with torch.no_grad():
161
+ outputs = model(input_ids)
162
+ last_hidden_state = outputs.last_hidden_state.squeeze(0).cpu()
163
+
164
+ word_embeds = group_embeddings(tokens, last_hidden_state)
165
+
166
+ all_embeddings = torch.stack(word_embeds) # seq_length x 768
167
+
168
+ all_embeddings = all_embeddings.unsqueeze(0) # Thêm chiều batch size là 1 -> 1 x seq_length x 768
169
+
170
+ return all_embeddings, tokens_word
171
+
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/torchcrf/__init__.py ADDED
@@ -0,0 +1,340 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __version__ = '0.7.2'
2
+
3
+ from typing import List, Optional
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+
8
+
9
class CRF(nn.Module):
    """Conditional random field.

    This module implements a conditional random field [LMP01]_. The forward computation
    of this class computes the log likelihood of the given sequence of tags and
    emission score tensor. This class also has `~CRF.decode` method which finds
    the best tag sequence given an emission score tensor using `Viterbi algorithm`_.

    Args:
        num_tags: Number of tags.
        batch_first: Whether the first dimension corresponds to the size of a minibatch.

    Attributes:
        start_transitions (`~torch.nn.Parameter`): Start transition score tensor of size
            ``(num_tags,)``.
        end_transitions (`~torch.nn.Parameter`): End transition score tensor of size
            ``(num_tags,)``.
        transitions (`~torch.nn.Parameter`): Transition score tensor of size
            ``(num_tags, num_tags)``.


    .. [LMP01] Lafferty, J., McCallum, A., Pereira, F. (2001).
       "Conditional random fields: Probabilistic models for segmenting and
       labeling sequence data". *Proc. 18th International Conf. on Machine
       Learning*. Morgan Kaufmann. pp. 282–289.

    .. _Viterbi algorithm: https://en.wikipedia.org/wiki/Viterbi_algorithm
    """

    def __init__(self, num_tags: int, batch_first: bool = False) -> None:
        if num_tags <= 0:
            raise ValueError(f'invalid number of tags: {num_tags}')
        super().__init__()
        self.num_tags = num_tags
        self.batch_first = batch_first
        self.start_transitions = nn.Parameter(torch.empty(num_tags))
        self.end_transitions = nn.Parameter(torch.empty(num_tags))
        self.transitions = nn.Parameter(torch.empty(num_tags, num_tags))

        self.reset_parameters()

    def reset_parameters(self) -> None:
        """Initialize the transition parameters.

        The parameters will be initialized randomly from a uniform distribution
        between -0.1 and 0.1.
        """
        nn.init.uniform_(self.start_transitions, -0.1, 0.1)
        nn.init.uniform_(self.end_transitions, -0.1, 0.1)
        nn.init.uniform_(self.transitions, -0.1, 0.1)

    def __repr__(self) -> str:
        return f'{self.__class__.__name__}(num_tags={self.num_tags})'

    def forward(
            self,
            emissions: torch.Tensor,
            tags: torch.LongTensor,
            mask: Optional[torch.ByteTensor] = None,
            reduction: str = 'sum',
    ) -> torch.Tensor:
        """Compute the conditional log likelihood of a sequence of tags given emission scores.

        Args:
            emissions (`~torch.Tensor`): Emission score tensor of size
                ``(seq_length, batch_size, num_tags)`` if ``batch_first`` is ``False``,
                ``(batch_size, seq_length, num_tags)`` otherwise.
            tags (`~torch.LongTensor`): Sequence of tags tensor of size
                ``(seq_length, batch_size)`` if ``batch_first`` is ``False``,
                ``(batch_size, seq_length)`` otherwise.
            mask (`~torch.ByteTensor`): Mask tensor of size ``(seq_length, batch_size)``
                if ``batch_first`` is ``False``, ``(batch_size, seq_length)`` otherwise.
            reduction: Specifies the reduction to apply to the output:
                ``none|sum|mean|token_mean``. ``none``: no reduction will be applied.
                ``sum``: the output will be summed over batches. ``mean``: the output will be
                averaged over batches. ``token_mean``: the output will be averaged over tokens.

        Returns:
            `~torch.Tensor`: The log likelihood. This will have size ``(batch_size,)`` if
            reduction is ``none``, ``()`` otherwise.
        """
        self._validate(emissions, tags=tags, mask=mask)
        if reduction not in ('none', 'sum', 'mean', 'token_mean'):
            raise ValueError(f'invalid reduction: {reduction}')
        if mask is None:
            mask = torch.ones_like(tags, dtype=torch.uint8)

        if self.batch_first:
            emissions = emissions.transpose(0, 1)
            tags = tags.transpose(0, 1)
            mask = mask.transpose(0, 1)

        # shape: (batch_size,)
        numerator = self._compute_score(emissions, tags, mask)
        # shape: (batch_size,)
        denominator = self._compute_normalizer(emissions, mask)
        # shape: (batch_size,)
        llh = numerator - denominator

        if reduction == 'none':
            return llh
        if reduction == 'sum':
            return llh.sum()
        if reduction == 'mean':
            return llh.mean()
        assert reduction == 'token_mean'
        return llh.sum() / mask.type_as(emissions).sum()

    @torch.jit.export
    def decode(self, emissions: torch.Tensor,
               mask: Optional[torch.ByteTensor] = None) -> List[List[int]]:
        """Find the most likely tag sequence using Viterbi algorithm.

        Args:
            emissions (`~torch.Tensor`): Emission score tensor of size
                ``(seq_length, batch_size, num_tags)`` if ``batch_first`` is ``False``,
                ``(batch_size, seq_length, num_tags)`` otherwise.
            mask (`~torch.ByteTensor`): Mask tensor of size ``(seq_length, batch_size)``
                if ``batch_first`` is ``False``, ``(batch_size, seq_length)`` otherwise.

        Returns:
            List of list containing the best tag sequence for each batch.
        """
        self._validate(emissions, mask=mask)
        if mask is None:
            mask = emissions.new_ones(emissions.shape[:2], dtype=torch.uint8)

        if self.batch_first:
            emissions = emissions.transpose(0, 1)
            mask = mask.transpose(0, 1)

        return self._viterbi_decode(emissions, mask)

    def _validate(
            self,
            emissions: torch.Tensor,
            tags: Optional[torch.LongTensor] = None,
            mask: Optional[torch.ByteTensor] = None) -> None:
        if emissions.dim() != 3:
            raise ValueError(f'emissions must have dimension of 3, got {emissions.dim()}')
        if emissions.size(2) != self.num_tags:
            raise ValueError(
                f'expected last dimension of emissions is {self.num_tags}, '
                f'got {emissions.size(2)}')

        if tags is not None:
            if emissions.shape[:2] != tags.shape:
                raise ValueError(
                    'the first two dimensions of emissions and tags must match, '
                    f'got {(emissions.shape[0], emissions.shape[1])} and {(tags.shape[0], tags.shape[1])}'
                )

        if mask is not None:
            if emissions.shape[:2] != mask.shape:
                raise ValueError(
                    'the first two dimensions of emissions and mask must match, '
                    f'got {(emissions.shape[0], emissions.shape[1])} and {(mask.shape[0], mask.shape[1])}'
                )
            no_empty_seq = not self.batch_first and mask[0].all()
            no_empty_seq_bf = self.batch_first and mask[:, 0].all()
            if not no_empty_seq and not no_empty_seq_bf:
                raise ValueError('mask of the first timestep must all be on')

    def _compute_score(
            self, emissions: torch.Tensor, tags: torch.LongTensor,
            mask: torch.ByteTensor) -> torch.Tensor:
        # emissions: (seq_length, batch_size, num_tags)
        # tags: (seq_length, batch_size)
        # mask: (seq_length, batch_size)
        assert emissions.dim() == 3 and tags.dim() == 2
        assert emissions.shape[:2] == tags.shape
        assert emissions.size(2) == self.num_tags
        assert mask.shape == tags.shape
        assert mask[0].all()

        seq_length, batch_size = tags.shape
        mask = mask.type_as(emissions)

        # Start transition score and first emission
        # shape: (batch_size,)
        score = self.start_transitions[tags[0]]
        score += emissions[0, torch.arange(batch_size), tags[0]]

        for i in range(1, seq_length):
            # Transition score to next tag, only added if next timestep is valid (mask == 1)
            # shape: (batch_size,)
            score += self.transitions[tags[i - 1], tags[i]] * mask[i]

            # Emission score for next tag, only added if next timestep is valid (mask == 1)
            # shape: (batch_size,)
            score += emissions[i, torch.arange(batch_size), tags[i]] * mask[i]

        # End transition score
        # shape: (batch_size,)
        seq_ends = mask.long().sum(dim=0) - 1
        # shape: (batch_size,)
        last_tags = tags[seq_ends, torch.arange(batch_size)]
        # shape: (batch_size,)
        score += self.end_transitions[last_tags]

        return score

    def _compute_normalizer(
            self, emissions: torch.Tensor, mask: torch.ByteTensor) -> torch.Tensor:
        # emissions: (seq_length, batch_size, num_tags)
        # mask: (seq_length, batch_size)
        assert emissions.dim() == 3 and mask.dim() == 2
        assert emissions.shape[:2] == mask.shape
        assert emissions.size(2) == self.num_tags
        assert mask[0].all()

        seq_length = emissions.size(0)

        # Start transition score and first emission; score has size of
        # (batch_size, num_tags) where for each batch, the j-th column stores
        # the score that the first timestep has tag j
        # shape: (batch_size, num_tags)
        score = self.start_transitions + emissions[0]

        for i in range(1, seq_length):
            # Broadcast score for every possible next tag
            # shape: (batch_size, num_tags, 1)
            broadcast_score = score.unsqueeze(2)

            # Broadcast emission score for every possible current tag
            # shape: (batch_size, 1, num_tags)
            broadcast_emissions = emissions[i].unsqueeze(1)

            # Compute the score tensor of size (batch_size, num_tags, num_tags) where
            # for each sample, entry at row i and column j stores the sum of scores of all
            # possible tag sequences so far that end with transitioning from tag i to tag j
            # and emitting
            # shape: (batch_size, num_tags, num_tags)
            next_score = broadcast_score + self.transitions + broadcast_emissions

            # Sum over all possible current tags, but we're in score space, so a sum
            # becomes a log-sum-exp: for each sample, entry i stores the sum of scores of
            # all possible tag sequences so far, that end in tag i
            # shape: (batch_size, num_tags)
            next_score = torch.logsumexp(next_score, dim=1)

            # Set score to the next score if this timestep is valid (mask == 1).
            # .bool() guards against uint8 masks: torch.where requires a boolean
            # condition tensor in current PyTorch releases.
            # shape: (batch_size, num_tags)
            score = torch.where(mask[i].unsqueeze(1).bool(), next_score, score)

        # End transition score
        # shape: (batch_size, num_tags)
        score += self.end_transitions

        # Sum (log-sum-exp) over all possible tags
        # shape: (batch_size,)
        return torch.logsumexp(score, dim=1)

    def _viterbi_decode(self, emissions: torch.FloatTensor,
                        mask: torch.ByteTensor) -> List[List[int]]:
        # emissions: (seq_length, batch_size, num_tags)
        # mask: (seq_length, batch_size)
        assert emissions.dim() == 3 and mask.dim() == 2
        assert emissions.shape[:2] == mask.shape
        assert emissions.size(2) == self.num_tags
        assert mask[0].all()

        seq_length, batch_size = mask.shape

        # Start transition and first emission
        # shape: (batch_size, num_tags)
        score = self.start_transitions + emissions[0]
        history: List[torch.Tensor] = []

        # score is a tensor of size (batch_size, num_tags) where for every batch,
        # value at column j stores the score of the best tag sequence so far that ends
        # with tag j
        # history saves where the best tags candidate transitioned from; this is used
        # when we trace back the best tag sequence

        # Viterbi algorithm recursive case: we compute the score of the best tag sequence
        # for every possible next tag
        for i in range(1, seq_length):
            # Broadcast viterbi score for every possible next tag
            # shape: (batch_size, num_tags, 1)
            broadcast_score = score.unsqueeze(2)

            # Broadcast emission score for every possible current tag
            # shape: (batch_size, 1, num_tags)
            broadcast_emission = emissions[i].unsqueeze(1)

            # Compute the score tensor of size (batch_size, num_tags, num_tags) where
            # for each sample, entry at row i and column j stores the score of the best
            # tag sequence so far that ends with transitioning from tag i to tag j and emitting
            # shape: (batch_size, num_tags, num_tags)
            next_score = broadcast_score + self.transitions + broadcast_emission

            # Find the maximum score over all possible current tag
            # shape: (batch_size, num_tags)
            next_score, indices = next_score.max(dim=1)

            # Set score to the next score if this timestep is valid (mask == 1)
            # and save the index that produces the next score.
            # .bool() guards against uint8 masks: torch.where requires a boolean
            # condition tensor in current PyTorch releases.
            # shape: (batch_size, num_tags)
            score = torch.where(mask[i].unsqueeze(1).bool(), next_score, score)
            history.append(indices)

        # End transition score
        # shape: (batch_size, num_tags)
        score += self.end_transitions

        # Now, compute the best path for each sample

        # shape: (batch_size,)
        seq_ends = mask.long().sum(dim=0) - 1
        best_tags_list: List[List[int]] = []

        for idx in range(batch_size):
            # Find the tag which maximizes the score at the last timestep; this is our best tag
            # for the last timestep
            _, best_last_tag = score[idx].max(dim=0)
            best_tags: List[int] = []
            best_tags.append(best_last_tag.item())

            # We trace back where the best last tag comes from, append that to our best tag
            # sequence, and trace it back again, and so on
            # NOTE: reversed() cannot be used here because it is not supported by TorchScript,
            # see https://github.com/pytorch/pytorch/issues/31772.
            for hist in history[:seq_ends[idx]][::-1]:
                best_last_tag = hist[idx][best_tags[-1]]
                best_tags.append(best_last_tag.item())

            # Reverse the order because we start from the last timestep
            best_tags.reverse()
            best_tags_list.append(best_tags)

        return best_tags_list
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/train.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import wandb
2
+ from tqdm import tqdm
3
+ from src.evaluate import evaluate
4
+ import torch
5
+
6
def train_model(model, optimizer, configs, loaders):
    """Train ``model``, track metrics in W&B, checkpoint on best val F1, then test.

    Args:
        model: Tagger whose call ``model(x, y, mask)`` returns the training loss
            and which is scored via ``src.evaluate.evaluate``.
        optimizer: Torch optimizer over the model's parameters.
        configs: Dict with at least ``project``, ``name`` and ``epochs``.
        loaders: Dict with ``train``, ``val`` and ``test`` loaders yielding
            ``(x, y, _)`` batches where label ``-1`` marks padding.
    """
    # Login wandb
    wandb.login()

    # Init Wandb for tracking training phase
    wandb.init(
        project=configs["project"],
        name=configs["name"],
        config=configs
    )

    # Log gradient of parameter
    wandb.watch(model, log="all")

    # Save model checkpoint by best F1
    best_val_f1 = 0.0
    # Set on the first improvement; stays None if no epoch ever improves,
    # which previously caused a NameError at the final load below.
    ckpt_path = None

    # Training Loop
    for epoch in range(1, configs["epochs"] + 1):
        model.train()
        total_loss = 0.0

        # Create progress bar
        train_bar = tqdm(loaders['train'], desc=f"Train Epoch {epoch}/{configs['epochs']}")

        for batch_idx, (x, y, _) in enumerate(train_bar, start=1):
            mask = (y != -1)  # label -1 marks padded positions
            loss = model(x, y, mask)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

            train_bar.set_postfix(batch_loss=loss.item(), avg_loss=total_loss / batch_idx)

        # Evaluate model after each epoch
        avg_train_loss = total_loss / len(loaders['train'])
        train_precision, train_recall, train_f1, train_acc, _, _ = evaluate(model, loaders['train'], count_loss=False)
        val_precision, val_recall, val_f1, val_acc, avg_val_loss, _ = evaluate(model, loaders['val'], count_loss=True)

        # Log metric for train and val set
        print(f"Epoch {epoch}: train_loss={avg_train_loss:.4f}, train_f1={train_f1:.4f}, val_loss={avg_val_loss:.4f}, val_f1={val_f1:.4f}")
        wandb.log({

            "epoch": epoch,

            # Group: Training metrics
            "Train/Loss": avg_train_loss,
            "Train/Precision": train_precision,
            "Train/Recall": train_recall,
            "Train/F1": train_f1,
            "Train/Accuracy": train_acc,

            # Group: Validation metrics
            "Val/Loss": avg_val_loss,
            "Val/Precision": val_precision,
            "Val/Recall": val_recall,
            "Val/F1": val_f1,
            "Val/Accuracy": val_acc
        })

        # Save best model based on val_f1
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            ckpt_path = f"./models/best_epoch_{epoch}.pt"
            torch.save(model.state_dict(), ckpt_path)
            wandb.save(ckpt_path)
            print(f"Saved improved model to {ckpt_path}")

        print()

    # Load best model before test (skip if no checkpoint was ever written)
    if ckpt_path is not None:
        print(f"Loading best model from {ckpt_path} for final evaluation...")
        model.load_state_dict(torch.load(ckpt_path))
        print("Done \n")

    # Log metric for test set
    print("Evaluation on test set ...")
    test_precision, test_recall, test_f1, test_acc, avg_test_loss, report = evaluate(model, loaders['test'], count_loss=True, report=True)
    wandb.log({
        "Test/Loss": avg_test_loss,
        "Test/Precision": test_precision,
        "Test/Recall": test_recall,
        "Test/F1": test_f1,
        "Test/Accuracy": test_acc,
    })
    print(f"Test_loss={avg_test_loss:.4f}, Test_f1={test_f1:.4f}")
    print(report)

    # Finish W&B run
    wandb.finish()
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/app.py CHANGED
@@ -1,7 +1,10 @@
1
  import streamlit as st
2
  import pandas as pd
 
 
3
  from src.predict import predict_demo
4
  from src.front import render_html
 
5
 
6
  st.set_page_config(page_title="Vietnamese NER", layout="wide")
7
 
@@ -24,20 +27,99 @@ with tab1:
24
 
25
  # --- Tab 2: KẾT QUẢ HUẤN LUYỆN ---
26
  with tab2:
27
- st.header("📈 Kết quả huấn luyện")
28
-
29
- loss = [0.9, 0.7, 0.5, 0.35, 0.28]
30
- epoch = [1, 2, 3, 4, 5]
31
- df_loss = pd.DataFrame({"Epoch": epoch, "Loss": loss})
32
- st.line_chart(df_loss.set_index("Epoch"))
33
-
34
- st.subheader("Đánh giá mô hình")
35
- df_eval = pd.DataFrame({
36
- "Phiên bản": ["v1", "v2", "v3"],
37
- "F1-score": [0.78, 0.83, 0.86],
38
- "Accuracy": [0.81, 0.85, 0.88]
39
- })
40
- st.dataframe(df_eval)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
  # --- Tab 3: DEMO MÔ HÌNH ---
43
  with tab3:
 
1
  import streamlit as st
2
  import pandas as pd
3
+ import plotly.graph_objects as go
4
+
5
  from src.predict import predict_demo
6
  from src.front import render_html
7
+ from results.output import training_log, report_dict, report_dict_2, model_compare, data_compare
8
 
9
  st.set_page_config(page_title="Vietnamese NER", layout="wide")
10
 
 
27
 
28
  # --- Tab 2: KẾT QUẢ HUẤN LUYỆN ---
29
  with tab2:
30
+ st.set_page_config(
31
+ page_title="My NER App",
32
+ layout="wide",
33
+ initial_sidebar_state="expanded"
34
+ )
35
+
36
+ # ==== TẠO FIGURES ====
37
+
38
+ # 1️⃣ Loss
39
+ fig_loss = go.Figure()
40
+ fig_loss.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["train_loss"],
41
+ mode='lines+markers', name='Train Loss'))
42
+ fig_loss.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["val_loss"],
43
+ mode='lines+markers', name='Val Loss'))
44
+ fig_loss.update_layout(title="Loss Curve", xaxis_title="Epoch", yaxis_title="Loss")
45
+
46
+ # 2️⃣ F1-Score
47
+ fig_f1 = go.Figure()
48
+ fig_f1.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["train_f1"],
49
+ mode='lines+markers', name='Train F1'))
50
+ fig_f1.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["val_f1"],
51
+ mode='lines+markers', name='Val F1'))
52
+ fig_f1.update_layout(title="F1-Score Curve", xaxis_title="Epoch", yaxis_title="F1-Score")
53
+
54
+ # 3️⃣ Classification Report Table & Bar
55
+ labels = [k for k in report_dict.keys() if k not in ["accuracy", "macro avg", "weighted avg"]]
56
+ report_data = [[lbl,
57
+ report_dict[lbl]["precision"],
58
+ report_dict[lbl]["recall"],
59
+ report_dict[lbl]["f1-score"]]
60
+ for lbl in labels]
61
+ df_report = pd.DataFrame(report_data,
62
+ columns=["Label", "Precision", "Recall", "F1-Score"])
63
+
64
+ fig_report = go.Figure()
65
+ for col in ["Precision", "Recall", "F1-Score"]:
66
+ fig_report.add_trace(go.Bar(x=df_report["Label"], y=df_report[col], name=col))
67
+ fig_report.update_layout(barmode='group',
68
+ title="Class Report Metrics of PhoBert + CRF",
69
+ xaxis_title="Label", yaxis_title="Score",
70
+ yaxis=dict(range=[0,1.0]))
71
+
72
+ labels2 = [k for k in report_dict_2.keys() if k not in ["accuracy", "macro avg", "weighted avg"]]
73
+ report_data2 = [[lbl,
74
+ report_dict_2[lbl]["precision"],
75
+ report_dict_2[lbl]["recall"],
76
+ report_dict_2[lbl]["f1-score"]]
77
+ for lbl in labels2]
78
+ df_report2 = pd.DataFrame(report_data2,
79
+ columns=["Label", "Precision", "Recall", "F1-Score"])
80
+
81
+ fig_report2 = go.Figure()
82
+ for col in ["Precision", "Recall", "F1-Score"]:
83
+ fig_report2.add_trace(go.Bar(x=df_report2["Label"], y=df_report2[col], name=col))
84
+ fig_report2.update_layout(barmode='group',
85
+ title="Class Report Metrics of PhoBert + Softmax",
86
+ xaxis_title="Label", yaxis_title="Score",
87
+ yaxis=dict(range=[0,1.0]))
88
+
89
+ # 4️⃣ Model & Data Comparison Tables
90
+ df_model = pd.DataFrame(
91
+ [[m, v["F1"], v["Accuracy"]] for m, v in model_compare["Data"].items()],
92
+ columns=["Model", "F1-Score", "Accuracy"]
93
+ )
94
+ df_data = pd.DataFrame(
95
+ [[s, f1] for s, f1 in data_compare["Data"].items()],
96
+ columns=["Preprocessing", "F1-Score"]
97
+ )
98
+
99
+ # ==== LAYOUT RAO GỌN VỚI COLUMNS ====
100
+
101
+ # Row 1: Loss | F1
102
+ col1, col2 = st.columns(2)
103
+ with col1:
104
+ st.plotly_chart(fig_loss, use_container_width=True)
105
+ with col2:
106
+ st.plotly_chart(fig_f1, use_container_width=True)
107
+
108
+ # Row 2: Class Report Table | Bar Chart
109
+ col3, col4 = st.columns(2)
110
+ with col3:
111
+ st.plotly_chart(fig_report2, use_container_width=True)
112
+ with col4:
113
+ st.plotly_chart(fig_report, use_container_width=True)
114
+
115
+ # Row 3: Model Compare | Data Compare
116
+ col5, col6 = st.columns(2)
117
+ with col5:
118
+ st.markdown("**Model Comparison**")
119
+ st.dataframe(df_model, use_container_width=True)
120
+ with col6:
121
+ st.markdown("**Data Preprocessing Comparison**")
122
+ st.dataframe(df_data, use_container_width=True)
123
 
124
  # --- Tab 3: DEMO MÔ HÌNH ---
125
  with tab3:
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/predict.py CHANGED
@@ -36,7 +36,7 @@ def predict_demo(text):
36
  NUM_TAGS = 7
37
 
38
  model = CRF_Tagger(input_dim=x.size(2), num_tags=NUM_TAGS)
39
- model.load_state_dict(torch.load(".\models\best_epoch_16.pt"))
40
  model.eval()
41
  with torch.no_grad():
42
  preds = model.decode(x)
 
36
  NUM_TAGS = 7
37
 
38
  model = CRF_Tagger(input_dim=x.size(2), num_tags=NUM_TAGS)
39
+ model.load_state_dict(torch.load("../models/best_epoch_16.pt"))
40
  model.eval()
41
  with torch.no_grad():
42
  preds = model.decode(x)
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/st.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st
import pandas as pd
import plotly.graph_objects as go
from results.output import training_log, report_dict, report_dict_2, model_compare, data_compare

st.set_page_config(
    page_title="My NER App",
    layout="wide",
    initial_sidebar_state="expanded"
)

# ==== BUILD FIGURES ====

def _curve_figure(title, y_label, series):
    """Lines+markers plot of (name, values) series against the training epochs."""
    fig = go.Figure()
    for trace_name, values in series:
        fig.add_trace(go.Scatter(x=training_log["epoch"], y=values,
                                 mode='lines+markers', name=trace_name))
    fig.update_layout(title=title, xaxis_title="Epoch", yaxis_title=y_label)
    return fig

def _report_table_and_bars(report, title):
    """Per-label precision/recall/F1 as a DataFrame plus a grouped bar chart."""
    skip = {"accuracy", "macro avg", "weighted avg"}
    rows = [
        [label,
         report[label]["precision"],
         report[label]["recall"],
         report[label]["f1-score"]]
        for label in report if label not in skip
    ]
    frame = pd.DataFrame(rows, columns=["Label", "Precision", "Recall", "F1-Score"])
    fig = go.Figure()
    for metric in ["Precision", "Recall", "F1-Score"]:
        fig.add_trace(go.Bar(x=frame["Label"], y=frame[metric], name=metric))
    fig.update_layout(barmode='group',
                      title=title,
                      xaxis_title="Label", yaxis_title="Score",
                      yaxis=dict(range=[0, 1.0]))
    return frame, fig

# Loss and F1 training curves
fig_loss = _curve_figure("Loss Curve", "Loss",
                         [('Train Loss', training_log["train_loss"]),
                          ('Val Loss', training_log["val_loss"])])
fig_f1 = _curve_figure("F1-Score Curve", "F1-Score",
                       [('Train F1', training_log["train_f1"]),
                        ('Val F1', training_log["val_f1"])])

# Classification reports for both model variants
df_report, fig_report = _report_table_and_bars(
    report_dict, "Class Report Metrics of PhoBert + CRF")
df_report2, fig_report2 = _report_table_and_bars(
    report_dict_2, "Class Report Metrics of PhoBert + Softmax")

# Model & data-preprocessing comparison tables
df_model = pd.DataFrame(
    [[name, scores["F1"], scores["Accuracy"]] for name, scores in model_compare["Data"].items()],
    columns=["Model", "F1-Score", "Accuracy"]
)
df_data = pd.DataFrame(
    [[setting, f1] for setting, f1 in data_compare["Data"].items()],
    columns=["Preprocessing", "F1-Score"]
)

# ==== TWO-COLUMN LAYOUT ====

# Row 1: Loss | F1
col1, col2 = st.columns(2)
with col1:
    st.plotly_chart(fig_loss, use_container_width=True)
with col2:
    st.plotly_chart(fig_f1, use_container_width=True)

# Row 2: Softmax report | CRF report
col3, col4 = st.columns(2)
with col3:
    st.plotly_chart(fig_report2, use_container_width=True)
with col4:
    st.plotly_chart(fig_report, use_container_width=True)

# Row 3: Model Compare | Data Compare
col5, col6 = st.columns(2)
with col5:
    st.markdown("**Model Comparison**")
    st.dataframe(df_model, use_container_width=True)
with col6:
    st.markdown("**Data Preprocessing Comparison**")
    st.dataframe(df_data, use_container_width=True)
space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/predict.py CHANGED
@@ -1,6 +1,7 @@
1
  import torch
2
  from src.model import CRF_Tagger
3
  from src.preprocessing import process_demo_sentence
 
4
 
5
  def predict(model, loader, count_loss=True):
6
 
@@ -29,6 +30,9 @@ def predict(model, loader, count_loss=True):
29
 
30
  def predict_demo(text):
31
 
 
 
 
32
 
33
  id_tag = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}
34
 
@@ -36,7 +40,7 @@ def predict_demo(text):
36
  NUM_TAGS = 7
37
 
38
  model = CRF_Tagger(input_dim=x.size(2), num_tags=NUM_TAGS)
39
- model.load_state_dict(torch.load("../models/best_epoch_16.pt"))
40
  model.eval()
41
  with torch.no_grad():
42
  preds = model.decode(x)
 
1
  import torch
2
  from src.model import CRF_Tagger
3
  from src.preprocessing import process_demo_sentence
4
+ import os
5
 
6
  def predict(model, loader, count_loss=True):
7
 
 
30
 
31
  def predict_demo(text):
32
 
33
+ BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
34
+ model_path = os.path.join(BASE_DIR, "models", "best_epoch_16.pt")
35
+
36
 
37
  id_tag = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}
38
 
 
40
  NUM_TAGS = 7
41
 
42
  model = CRF_Tagger(input_dim=x.size(2), num_tags=NUM_TAGS)
43
+ model.load_state_dict(torch.load(model_path))
44
  model.eval()
45
  with torch.no_grad():
46
  preds = model.decode(x)
space/space/space/space/space/space/space/space/space/space/space/space/space/st.py CHANGED
@@ -1,98 +1,7 @@
1
  import streamlit as st
2
- import pandas as pd
3
- import plotly.graph_objects as go
4
- from results.output import training_log, report_dict, report_dict_2, model_compare, data_compare
5
 
6
- st.set_page_config(
7
- page_title="My NER App",
8
- layout="wide",
9
- initial_sidebar_state="expanded"
10
- )
11
 
12
- # ==== TẠO FIGURES ====
13
-
14
- # 1️⃣ Loss
15
- fig_loss = go.Figure()
16
- fig_loss.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["train_loss"],
17
- mode='lines+markers', name='Train Loss'))
18
- fig_loss.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["val_loss"],
19
- mode='lines+markers', name='Val Loss'))
20
- fig_loss.update_layout(title="Loss Curve", xaxis_title="Epoch", yaxis_title="Loss")
21
-
22
- # 2️⃣ F1-Score
23
- fig_f1 = go.Figure()
24
- fig_f1.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["train_f1"],
25
- mode='lines+markers', name='Train F1'))
26
- fig_f1.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["val_f1"],
27
- mode='lines+markers', name='Val F1'))
28
- fig_f1.update_layout(title="F1-Score Curve", xaxis_title="Epoch", yaxis_title="F1-Score")
29
-
30
- # 3️⃣ Classification Report Table & Bar
31
- labels = [k for k in report_dict.keys() if k not in ["accuracy", "macro avg", "weighted avg"]]
32
- report_data = [[lbl,
33
- report_dict[lbl]["precision"],
34
- report_dict[lbl]["recall"],
35
- report_dict[lbl]["f1-score"]]
36
- for lbl in labels]
37
- df_report = pd.DataFrame(report_data,
38
- columns=["Label", "Precision", "Recall", "F1-Score"])
39
-
40
- fig_report = go.Figure()
41
- for col in ["Precision", "Recall", "F1-Score"]:
42
- fig_report.add_trace(go.Bar(x=df_report["Label"], y=df_report[col], name=col))
43
- fig_report.update_layout(barmode='group',
44
- title="Class Report Metrics of PhoBert + CRF",
45
- xaxis_title="Label", yaxis_title="Score",
46
- yaxis=dict(range=[0,1.0]))
47
-
48
- labels2 = [k for k in report_dict_2.keys() if k not in ["accuracy", "macro avg", "weighted avg"]]
49
- report_data2 = [[lbl,
50
- report_dict_2[lbl]["precision"],
51
- report_dict_2[lbl]["recall"],
52
- report_dict_2[lbl]["f1-score"]]
53
- for lbl in labels2]
54
- df_report2 = pd.DataFrame(report_data2,
55
- columns=["Label", "Precision", "Recall", "F1-Score"])
56
-
57
- fig_report2 = go.Figure()
58
- for col in ["Precision", "Recall", "F1-Score"]:
59
- fig_report2.add_trace(go.Bar(x=df_report2["Label"], y=df_report2[col], name=col))
60
- fig_report2.update_layout(barmode='group',
61
- title="Class Report Metrics of PhoBert + Softmax",
62
- xaxis_title="Label", yaxis_title="Score",
63
- yaxis=dict(range=[0,1.0]))
64
-
65
- # 4️⃣ Model & Data Comparison Tables
66
- df_model = pd.DataFrame(
67
- [[m, v["F1"], v["Accuracy"]] for m, v in model_compare["Data"].items()],
68
- columns=["Model", "F1-Score", "Accuracy"]
69
- )
70
- df_data = pd.DataFrame(
71
- [[s, f1] for s, f1 in data_compare["Data"].items()],
72
- columns=["Preprocessing", "F1-Score"]
73
- )
74
-
75
- # ==== LAYOUT RAO GỌN VỚI COLUMNS ====
76
-
77
- # Row 1: Loss | F1
78
- col1, col2 = st.columns(2)
79
- with col1:
80
- st.plotly_chart(fig_loss, use_container_width=True)
81
- with col2:
82
- st.plotly_chart(fig_f1, use_container_width=True)
83
-
84
- # Row 2: Class Report Table | Bar Chart
85
- col3, col4 = st.columns(2)
86
- with col3:
87
- st.plotly_chart(fig_report2, use_container_width=True)
88
- with col4:
89
- st.plotly_chart(fig_report, use_container_width=True)
90
-
91
- # Row 3: Model Compare | Data Compare
92
- col5, col6 = st.columns(2)
93
- with col5:
94
- st.markdown("**Model Comparison**")
95
- st.dataframe(df_model, use_container_width=True)
96
- with col6:
97
- st.markdown("**Data Preprocessing Comparison**")
98
- st.dataframe(df_data, use_container_width=True)
 
1
  import streamlit as st
 
 
 
2
 
3
+ # Load ảnh từ file local
4
+ st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/W%26B%20Chart%206_18_2025%2C%207_23_58%20PM.png", caption="Ảnh minh hoạ", use_column_width=True)
 
 
 
5
 
6
+ # Load ảnh từ URL
7
+ st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/W%26B%20Chart%206_18_2025%2C%207_24_20%20PM.png", caption="Ảnh từ URL", use_column_width=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
space/space/space/space/space/space/space/space/space/space/space/src/app.py CHANGED
@@ -16,14 +16,27 @@ tab1, tab2, tab3 = st.tabs(["📊 Phân tích dữ liệu", "📈 Kết quả hu
16
 
17
  # --- Tab 1: PHÂN TÍCH DỮ LIỆU ---
18
  with tab1:
19
- st.header("📊 Phân tích dữ liệu")
20
-
21
- df = pd.DataFrame({
22
- "Loại thực thể": ["PER", "LOC", "ORG", "MISC"],
23
- "Số lượng": [3200, 2500, 1800, 900]
24
- })
25
-
26
- st.bar_chart(df.set_index("Loại thực thể"))
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
  # --- Tab 2: KẾT QUẢ HUẤN LUYỆN ---
29
  with tab2:
 
16
 
17
  # --- Tab 1: PHÂN TÍCH DỮ LIỆU ---
18
  with tab1:
19
+ col1, col2 = st.columns(2)
20
+
21
+ # ==== Distribution of NER Label Frequency ====
22
+ with col1:
23
+ st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ner_freq.png")
24
+
25
+ # ==== Distribution of NER Label Frequency (Add crawled data) ====
26
+ with col2:
27
+ st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ner_freq_add.png")
28
+
29
+ # ==== Distribution of the Number of Entities per Sentence (0 to 15+) ====
30
+ with col1:
31
+ st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ent_dis.png")
32
+
33
+ # ==== Distribution of Sentence Lengths ====
34
+ with col2:
35
+ st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/sent_len.png")
36
+
37
+ # ==== Distribution of Token Lengths ====
38
+ with col1:
39
+ st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/token_len.png")
40
 
41
  # --- Tab 2: KẾT QUẢ HUẤN LUYỆN ---
42
  with tab2:
space/space/space/space/space/space/space/space/space/space/space/st.py CHANGED
@@ -1,7 +1,23 @@
1
  import streamlit as st
2
 
3
- # Load ảnh từ file local
4
- st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/W%26B%20Chart%206_18_2025%2C%207_23_58%20PM.png", caption="Ảnh minh hoạ", use_column_width=True)
5
 
6
- # Load ảnh từ URL
7
- st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/W%26B%20Chart%206_18_2025%2C%207_24_20%20PM.png", caption="Ảnh từ URL", use_column_width=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
 
3
+ col1, col2 = st.columns(2)
 
4
 
5
+ # ==== Distribution of NER Label Frequency ====
6
+ with col1:
7
+ st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ner_freq.png")
8
+
9
+ # ==== Distribution of NER Label Frequency (Add crawled data) ====
10
+ with col2:
11
+ st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ner_freq_add.png")
12
+
13
+ # ==== Distribution of the Number of Entities per Sentence (0 to 15+) ====
14
+ with col1:
15
+ st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ent_dis.png")
16
+
17
+ # ==== Distribution of Sentence Lengths ====
18
+ with col2:
19
+ st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/sent_len.png")
20
+
21
+ # ==== Distribution of Token Lengths ====
22
+ with col1:
23
+ st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/token_len.png")
space/space/space/space/space/space/space/space/src/app.py CHANGED
@@ -8,52 +8,57 @@ from results.output import training_log, report_dict, report_dict_2, model_compa
8
 
9
  st.set_page_config(page_title="Vietnamese NER", layout="wide")
10
 
11
- # ===== Tiêu đề chính =====
12
- st.title("🔍 Ứng dụng nhận diện thực thể có tên (NER) cho tiếng Việt")
13
 
14
  # Tabs
15
- tab1, tab2, tab3 = st.tabs(["📊 Phân tích dữ liệu", "📈 Kết quả huấn luyện", "🧪 Demo mô hình"])
16
 
17
- # --- Tab 1: PHÂN TÍCH DỮ LIỆU ---
18
  with tab1:
19
  col1, col2 = st.columns(2)
20
 
21
  # ==== Distribution of NER Label Frequency ====
22
  with col1:
23
- st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ner_freq.png")
 
24
 
25
  # ==== Distribution of NER Label Frequency (Add crawled data) ====
26
  with col2:
27
- st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ner_freq_add.png")
 
28
 
29
  # ==== Distribution of the Number of Entities per Sentence (0 to 15+) ====
30
  with col1:
31
- st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ent_dis.png")
 
32
 
33
  # ==== Distribution of Sentence Lengths ====
34
  with col2:
35
- st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/sent_len.png")
 
36
 
37
  # ==== Distribution of Token Lengths ====
38
  with col1:
39
- st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/token_len.png")
 
40
 
41
- # --- Tab 2: KẾT QUẢ HUẤN LUYỆN ---
42
  with tab2:
43
  st.set_page_config(
44
- page_title="My NER App",
45
  layout="wide",
46
  initial_sidebar_state="expanded"
47
  )
48
 
49
- # ==== TẠO FIGURES ====
50
 
51
  # 1️⃣ Loss
52
  fig_loss = go.Figure()
53
  fig_loss.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["train_loss"],
54
- mode='lines+markers', name='Train Loss'))
55
  fig_loss.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["val_loss"],
56
- mode='lines+markers', name='Val Loss'))
57
  fig_loss.update_layout(title="Loss Curve", xaxis_title="Epoch", yaxis_title="Loss")
58
 
59
  # 2️⃣ F1-Score
@@ -61,7 +66,7 @@ with tab2:
61
  fig_f1.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["train_f1"],
62
  mode='lines+markers', name='Train F1'))
63
  fig_f1.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["val_f1"],
64
- mode='lines+markers', name='Val F1'))
65
  fig_f1.update_layout(title="F1-Score Curve", xaxis_title="Epoch", yaxis_title="F1-Score")
66
 
67
  # 3️⃣ Classification Report Table & Bar
@@ -70,34 +75,34 @@ with tab2:
70
  report_dict[lbl]["precision"],
71
  report_dict[lbl]["recall"],
72
  report_dict[lbl]["f1-score"]]
73
- for lbl in labels]
74
  df_report = pd.DataFrame(report_data,
75
- columns=["Label", "Precision", "Recall", "F1-Score"])
76
 
77
  fig_report = go.Figure()
78
  for col in ["Precision", "Recall", "F1-Score"]:
79
  fig_report.add_trace(go.Bar(x=df_report["Label"], y=df_report[col], name=col))
80
  fig_report.update_layout(barmode='group',
81
- title="Class Report Metrics of PhoBert + CRF",
82
- xaxis_title="Label", yaxis_title="Score",
83
- yaxis=dict(range=[0,1.0]))
84
 
85
  labels2 = [k for k in report_dict_2.keys() if k not in ["accuracy", "macro avg", "weighted avg"]]
86
  report_data2 = [[lbl,
87
- report_dict_2[lbl]["precision"],
88
- report_dict_2[lbl]["recall"],
89
- report_dict_2[lbl]["f1-score"]]
90
  for lbl in labels2]
91
  df_report2 = pd.DataFrame(report_data2,
92
- columns=["Label", "Precision", "Recall", "F1-Score"])
93
 
94
  fig_report2 = go.Figure()
95
  for col in ["Precision", "Recall", "F1-Score"]:
96
  fig_report2.add_trace(go.Bar(x=df_report2["Label"], y=df_report2[col], name=col))
97
  fig_report2.update_layout(barmode='group',
98
- title="Class Report Metrics of PhoBert + Softmax",
99
- xaxis_title="Label", yaxis_title="Score",
100
- yaxis=dict(range=[0,1.0]))
101
 
102
  # 4️⃣ Model & Data Comparison Tables
103
  df_model = pd.DataFrame(
@@ -109,7 +114,7 @@ with tab2:
109
  columns=["Preprocessing", "F1-Score"]
110
  )
111
 
112
- # ==== LAYOUT RAO GỌN VỚI COLUMNS ====
113
 
114
  # Row 1: Loss | F1
115
  col1, col2 = st.columns(2)
@@ -134,26 +139,26 @@ with tab2:
134
  st.markdown("**Data Preprocessing Comparison**")
135
  st.dataframe(df_data, use_container_width=True)
136
 
137
- # --- Tab 3: DEMO MÔ HÌNH ---
138
  with tab3:
139
- st.header("🧪 Vietnamese Named Entity Recognition")
140
 
141
- text = st.text_input("Nhập văn bản tiếng Việt:", "Nguyễn Văn A đang làm việc tại Hà Nội")
142
 
143
- if st.button("Phân tích"):
144
  if not text.strip():
145
- st.warning("Vui lòng nhập văn bản!")
146
  else:
147
  tokens, labels = predict_demo(text)
148
 
149
- st.subheader("Thực thể được phát hiện")
150
  entities = [(tok, lab) for tok, lab in zip(tokens, labels) if lab != "O"]
151
 
152
  if entities:
153
  for tok, lab in entities:
154
  st.markdown(f"🔹 **{tok}** — *{lab}*")
155
  else:
156
- st.info("Không phát hiện thực thể.")
157
 
158
- st.subheader("Highlight trong văn bản:")
159
  st.markdown(render_html(tokens, labels), unsafe_allow_html=True)
 
8
 
9
  st.set_page_config(page_title="Vietnamese NER", layout="wide")
10
 
11
+ # ===== Main Title =====
12
+ st.title("🔍 Vietnamese Named Entity Recognition (NER) Application")
13
 
14
  # Tabs
15
+ tab1, tab2, tab3 = st.tabs(["📊 Data Analysis", "📈 Training Results", "🧪 Model Demo"])
16
 
17
+ # --- Tab 1: DATA ANALYSIS ---
18
  with tab1:
19
  col1, col2 = st.columns(2)
20
 
21
  # ==== Distribution of NER Label Frequency ====
22
  with col1:
23
+ st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ner_freq.png",
24
+ caption="NER Label Frequency Distribution")
25
 
26
  # ==== Distribution of NER Label Frequency (Add crawled data) ====
27
  with col2:
28
+ st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ner_freq_add.png",
29
+ caption="NER Label Frequency (Extended with Crawled Data)")
30
 
31
  # ==== Distribution of the Number of Entities per Sentence (0 to 15+) ====
32
  with col1:
33
+ st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ent_dis.png",
34
+ caption="Number of Entities per Sentence")
35
 
36
  # ==== Distribution of Sentence Lengths ====
37
  with col2:
38
+ st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/sent_len.png",
39
+ caption="Sentence Length Distribution")
40
 
41
  # ==== Distribution of Token Lengths ====
42
  with col1:
43
+ st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/token_len.png",
44
+ caption="Token Length Distribution")
45
 
46
+ # --- Tab 2: TRAINING RESULTS ---
47
  with tab2:
48
  st.set_page_config(
49
+ page_title="Vietnamese NER",
50
  layout="wide",
51
  initial_sidebar_state="expanded"
52
  )
53
 
54
+ # ==== CREATE FIGURES ====
55
 
56
  # 1️⃣ Loss
57
  fig_loss = go.Figure()
58
  fig_loss.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["train_loss"],
59
+ mode='lines+markers', name='Train Loss'))
60
  fig_loss.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["val_loss"],
61
+ mode='lines+markers', name='Validation Loss'))
62
  fig_loss.update_layout(title="Loss Curve", xaxis_title="Epoch", yaxis_title="Loss")
63
 
64
  # 2️⃣ F1-Score
 
66
  fig_f1.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["train_f1"],
67
  mode='lines+markers', name='Train F1'))
68
  fig_f1.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["val_f1"],
69
+ mode='lines+markers', name='Validation F1'))
70
  fig_f1.update_layout(title="F1-Score Curve", xaxis_title="Epoch", yaxis_title="F1-Score")
71
 
72
  # 3️⃣ Classification Report Table & Bar
 
75
  report_dict[lbl]["precision"],
76
  report_dict[lbl]["recall"],
77
  report_dict[lbl]["f1-score"]]
78
+ for lbl in labels]
79
  df_report = pd.DataFrame(report_data,
80
+ columns=["Label", "Precision", "Recall", "F1-Score"])
81
 
82
  fig_report = go.Figure()
83
  for col in ["Precision", "Recall", "F1-Score"]:
84
  fig_report.add_trace(go.Bar(x=df_report["Label"], y=df_report[col], name=col))
85
  fig_report.update_layout(barmode='group',
86
+ title="Class Metrics: PhoBERT + CRF",
87
+ xaxis_title="Label", yaxis_title="Score",
88
+ yaxis=dict(range=[0, 1.0]))
89
 
90
  labels2 = [k for k in report_dict_2.keys() if k not in ["accuracy", "macro avg", "weighted avg"]]
91
  report_data2 = [[lbl,
92
+ report_dict_2[lbl]["precision"],
93
+ report_dict_2[lbl]["recall"],
94
+ report_dict_2[lbl]["f1-score"]]
95
  for lbl in labels2]
96
  df_report2 = pd.DataFrame(report_data2,
97
+ columns=["Label", "Precision", "Recall", "F1-Score"])
98
 
99
  fig_report2 = go.Figure()
100
  for col in ["Precision", "Recall", "F1-Score"]:
101
  fig_report2.add_trace(go.Bar(x=df_report2["Label"], y=df_report2[col], name=col))
102
  fig_report2.update_layout(barmode='group',
103
+ title="Class Metrics: PhoBERT + Softmax",
104
+ xaxis_title="Label", yaxis_title="Score",
105
+ yaxis=dict(range=[0, 1.0]))
106
 
107
  # 4️⃣ Model & Data Comparison Tables
108
  df_model = pd.DataFrame(
 
114
  columns=["Preprocessing", "F1-Score"]
115
  )
116
 
117
+ # ==== CLEAN LAYOUT WITH COLUMNS ====
118
 
119
  # Row 1: Loss | F1
120
  col1, col2 = st.columns(2)
 
139
  st.markdown("**Data Preprocessing Comparison**")
140
  st.dataframe(df_data, use_container_width=True)
141
 
142
+ # --- Tab 3: MODEL DEMO ---
143
  with tab3:
144
+ st.header("🧪 Vietnamese Named Entity Recognition Demo")
145
 
146
+ text = st.text_input("Enter Vietnamese text:", "Nguyễn Văn A đang làm việc tại Hà Nội")
147
 
148
+ if st.button("Analyze"):
149
  if not text.strip():
150
+ st.warning("Please enter some text!")
151
  else:
152
  tokens, labels = predict_demo(text)
153
 
154
+ st.subheader("Detected Entities")
155
  entities = [(tok, lab) for tok, lab in zip(tokens, labels) if lab != "O"]
156
 
157
  if entities:
158
  for tok, lab in entities:
159
  st.markdown(f"🔹 **{tok}** — *{lab}*")
160
  else:
161
+ st.info("No named entities detected.")
162
 
163
+ st.subheader("Highlighted Text")
164
  st.markdown(render_html(tokens, labels), unsafe_allow_html=True)
space/space/space/space/space/space/space/src/app.py CHANGED
@@ -9,7 +9,7 @@ from results.output import training_log, report_dict, report_dict_2, model_compa
9
  st.set_page_config(page_title="Vietnamese NER", layout="wide")
10
 
11
  # ===== Main Title =====
12
- st.title("🔍 Vietnamese Named Entity Recognition (NER) Application")
13
 
14
  # Tabs
15
  tab1, tab2, tab3 = st.tabs(["📊 Data Analysis", "📈 Training Results", "🧪 Model Demo"])
@@ -20,28 +20,23 @@ with tab1:
20
 
21
  # ==== Distribution of NER Label Frequency ====
22
  with col1:
23
- st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ner_freq.png",
24
- caption="NER Label Frequency Distribution")
25
 
26
  # ==== Distribution of NER Label Frequency (Add crawled data) ====
27
  with col2:
28
- st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ner_freq_add.png",
29
- caption="NER Label Frequency (Extended with Crawled Data)")
30
 
31
  # ==== Distribution of the Number of Entities per Sentence (0 to 15+) ====
32
  with col1:
33
- st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ent_dis.png",
34
- caption="Number of Entities per Sentence")
35
 
36
  # ==== Distribution of Sentence Lengths ====
37
  with col2:
38
- st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/sent_len.png",
39
- caption="Sentence Length Distribution")
40
 
41
  # ==== Distribution of Token Lengths ====
42
  with col1:
43
- st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/token_len.png",
44
- caption="Token Length Distribution")
45
 
46
  # --- Tab 2: TRAINING RESULTS ---
47
  with tab2:
@@ -141,8 +136,6 @@ with tab2:
141
 
142
  # --- Tab 3: MODEL DEMO ---
143
  with tab3:
144
- st.header("🧪 Vietnamese Named Entity Recognition Demo")
145
-
146
  text = st.text_input("Enter Vietnamese text:", "Nguyễn Văn A đang làm việc tại Hà Nội")
147
 
148
  if st.button("Analyze"):
 
9
  st.set_page_config(page_title="Vietnamese NER", layout="wide")
10
 
11
  # ===== Main Title =====
12
+ st.title("🔍 Vietnamese Named Entity Recognition Demo")
13
 
14
  # Tabs
15
  tab1, tab2, tab3 = st.tabs(["📊 Data Analysis", "📈 Training Results", "🧪 Model Demo"])
 
20
 
21
  # ==== Distribution of NER Label Frequency ====
22
  with col1:
23
+ st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ner_freq.png")
 
24
 
25
  # ==== Distribution of NER Label Frequency (Add crawled data) ====
26
  with col2:
27
+ st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ner_freq_add.png")
 
28
 
29
  # ==== Distribution of the Number of Entities per Sentence (0 to 15+) ====
30
  with col1:
31
+ st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ent_dis.png")
 
32
 
33
  # ==== Distribution of Sentence Lengths ====
34
  with col2:
35
+ st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/sent_len.png")
 
36
 
37
  # ==== Distribution of Token Lengths ====
38
  with col1:
39
+ st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/token_len.png")
 
40
 
41
  # --- Tab 2: TRAINING RESULTS ---
42
  with tab2:
 
136
 
137
  # --- Tab 3: MODEL DEMO ---
138
  with tab3:
 
 
139
  text = st.text_input("Enter Vietnamese text:", "Nguyễn Văn A đang làm việc tại Hà Nội")
140
 
141
  if st.button("Analyze"):
space/space/space/space/space/src/app.py CHANGED
@@ -1,157 +0,0 @@
1
- import streamlit as st
2
- import pandas as pd
3
- import plotly.graph_objects as go
4
-
5
- from src.predict import predict_demo
6
- from src.front import render_html
7
- from results.output import training_log, report_dict, report_dict_2, model_compare, data_compare
8
-
9
- st.set_page_config(page_title="Vietnamese NER", layout="wide")
10
-
11
- # ===== Main Title =====
12
- st.title("🔍 Vietnamese Named Entity Recognition Demo")
13
-
14
- # Tabs
15
- tab1, tab2, tab3 = st.tabs(["📊 Data Analysis", "📈 Training Results", "🧪 Model Demo"])
16
-
17
- # --- Tab 1: DATA ANALYSIS ---
18
- with tab1:
19
- col1, col2 = st.columns(2)
20
-
21
- # ==== Distribution of NER Label Frequency ====
22
- with col1:
23
- st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ner_freq.png")
24
-
25
- # ==== Distribution of NER Label Frequency (Add crawled data) ====
26
- with col2:
27
- st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ner_freq_add.png")
28
-
29
- # ==== Distribution of the Number of Entities per Sentence (0 to 15+) ====
30
- with col1:
31
- st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ent_dis.png")
32
-
33
- # ==== Distribution of Sentence Lengths ====
34
- with col2:
35
- st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/sent_len.png")
36
-
37
- # ==== Distribution of Token Lengths ====
38
- with col1:
39
- st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/token_len.png")
40
-
41
- # --- Tab 2: TRAINING RESULTS ---
42
- with tab2:
43
- st.set_page_config(
44
- page_title="Vietnamese NER",
45
- layout="wide",
46
- initial_sidebar_state="expanded"
47
- )
48
-
49
- # ==== CREATE FIGURES ====
50
-
51
- # 1️⃣ Loss
52
- fig_loss = go.Figure()
53
- fig_loss.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["train_loss"],
54
- mode='lines+markers', name='Train Loss'))
55
- fig_loss.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["val_loss"],
56
- mode='lines+markers', name='Validation Loss'))
57
- fig_loss.update_layout(title="Loss Curve", xaxis_title="Epoch", yaxis_title="Loss")
58
-
59
- # 2️⃣ F1-Score
60
- fig_f1 = go.Figure()
61
- fig_f1.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["train_f1"],
62
- mode='lines+markers', name='Train F1'))
63
- fig_f1.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["val_f1"],
64
- mode='lines+markers', name='Validation F1'))
65
- fig_f1.update_layout(title="F1-Score Curve", xaxis_title="Epoch", yaxis_title="F1-Score")
66
-
67
- # 3️⃣ Classification Report Table & Bar
68
- labels = [k for k in report_dict.keys() if k not in ["accuracy", "macro avg", "weighted avg"]]
69
- report_data = [[lbl,
70
- report_dict[lbl]["precision"],
71
- report_dict[lbl]["recall"],
72
- report_dict[lbl]["f1-score"]]
73
- for lbl in labels]
74
- df_report = pd.DataFrame(report_data,
75
- columns=["Label", "Precision", "Recall", "F1-Score"])
76
-
77
- fig_report = go.Figure()
78
- for col in ["Precision", "Recall", "F1-Score"]:
79
- fig_report.add_trace(go.Bar(x=df_report["Label"], y=df_report[col], name=col))
80
- fig_report.update_layout(barmode='group',
81
- title="Class Metrics: PhoBERT + CRF",
82
- xaxis_title="Label", yaxis_title="Score",
83
- yaxis=dict(range=[0, 1.0]))
84
-
85
- labels2 = [k for k in report_dict_2.keys() if k not in ["accuracy", "macro avg", "weighted avg"]]
86
- report_data2 = [[lbl,
87
- report_dict_2[lbl]["precision"],
88
- report_dict_2[lbl]["recall"],
89
- report_dict_2[lbl]["f1-score"]]
90
- for lbl in labels2]
91
- df_report2 = pd.DataFrame(report_data2,
92
- columns=["Label", "Precision", "Recall", "F1-Score"])
93
-
94
- fig_report2 = go.Figure()
95
- for col in ["Precision", "Recall", "F1-Score"]:
96
- fig_report2.add_trace(go.Bar(x=df_report2["Label"], y=df_report2[col], name=col))
97
- fig_report2.update_layout(barmode='group',
98
- title="Class Metrics: PhoBERT + Softmax",
99
- xaxis_title="Label", yaxis_title="Score",
100
- yaxis=dict(range=[0, 1.0]))
101
-
102
- # 4️⃣ Model & Data Comparison Tables
103
- df_model = pd.DataFrame(
104
- [[m, v["F1"], v["Accuracy"]] for m, v in model_compare["Data"].items()],
105
- columns=["Model", "F1-Score", "Accuracy"]
106
- )
107
- df_data = pd.DataFrame(
108
- [[s, f1] for s, f1 in data_compare["Data"].items()],
109
- columns=["Preprocessing", "F1-Score"]
110
- )
111
-
112
- # ==== CLEAN LAYOUT WITH COLUMNS ====
113
-
114
- # Row 1: Loss | F1
115
- col1, col2 = st.columns(2)
116
- with col1:
117
- st.plotly_chart(fig_loss, use_container_width=True)
118
- with col2:
119
- st.plotly_chart(fig_f1, use_container_width=True)
120
-
121
- # Row 2: Class Report Table | Bar Chart
122
- col3, col4 = st.columns(2)
123
- with col3:
124
- st.plotly_chart(fig_report2, use_container_width=True)
125
- with col4:
126
- st.plotly_chart(fig_report, use_container_width=True)
127
-
128
- # Row 3: Model Compare | Data Compare
129
- col5, col6 = st.columns(2)
130
- with col5:
131
- st.markdown("**Model Comparison**")
132
- st.dataframe(df_model, use_container_width=True)
133
- with col6:
134
- st.markdown("**Data Preprocessing Comparison**")
135
- st.dataframe(df_data, use_container_width=True)
136
-
137
- # --- Tab 3: MODEL DEMO ---
138
- with tab3:
139
- text = st.text_input("Enter Vietnamese text:", "Nguyễn Văn A đang làm việc tại Hà Nội")
140
-
141
- if st.button("Analyze"):
142
- if not text.strip():
143
- st.warning("Please enter some text!")
144
- else:
145
- tokens, labels = predict_demo(text)
146
-
147
- st.subheader("Detected Entities")
148
- entities = [(tok, lab) for tok, lab in zip(tokens, labels) if lab != "O"]
149
-
150
- if entities:
151
- for tok, lab in entities:
152
- st.markdown(f"🔹 **{tok}** — *{lab}*")
153
- else:
154
- st.info("No named entities detected.")
155
-
156
- st.subheader("Highlighted Text")
157
- st.markdown(render_html(tokens, labels), unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
space/space/space/space/space/src/predict.py CHANGED
@@ -1,8 +1,6 @@
1
  import torch
2
- from src.model import CRF_Tagger
3
- from src.preprocessing import process_demo_sentence
4
- import os
5
-
6
  def predict(model, loader, count_loss=True):
7
 
8
  model.eval() # Evaluation Mode, Ignore Dropout, BatchNorm, ...
@@ -30,9 +28,6 @@ def predict(model, loader, count_loss=True):
30
 
31
  def predict_demo(text):
32
 
33
- BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
34
- model_path = os.path.join(BASE_DIR, "models", "best_epoch_16.pt")
35
-
36
 
37
  id_tag = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}
38
 
@@ -40,7 +35,7 @@ def predict_demo(text):
40
  NUM_TAGS = 7
41
 
42
  model = CRF_Tagger(input_dim=x.size(2), num_tags=NUM_TAGS)
43
- model.load_state_dict(torch.load(model_path))
44
  model.eval()
45
  with torch.no_grad():
46
  preds = model.decode(x)
 
1
  import torch
2
+ from model import CRF_Tagger
3
+ from preprocessing import process_demo_sentence
 
 
4
  def predict(model, loader, count_loss=True):
5
 
6
  model.eval() # Evaluation Mode, Ignore Dropout, BatchNorm, ...
 
28
 
29
  def predict_demo(text):
30
 
 
 
 
31
 
32
  id_tag = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}
33
 
 
35
  NUM_TAGS = 7
36
 
37
  model = CRF_Tagger(input_dim=x.size(2), num_tags=NUM_TAGS)
38
+ model.load_state_dict(torch.load("models/best_epoch_16.pt"))
39
  model.eval()
40
  with torch.no_grad():
41
  preds = model.decode(x)
space/space/space/space/space/src/preprocessing.py CHANGED
@@ -3,7 +3,7 @@ import torch
3
  from transformers import AutoTokenizer, AutoModel
4
  from tqdm import tqdm
5
  from sklearn.model_selection import train_test_split
6
- from src.configs import configs
7
  from pyvi import ViTokenizer
8
 
9
  def join_tokens(tokens):
 
3
  from transformers import AutoTokenizer, AutoModel
4
  from tqdm import tqdm
5
  from sklearn.model_selection import train_test_split
6
+ from configs import configs
7
  from pyvi import ViTokenizer
8
 
9
  def join_tokens(tokens):
space/space/space/space/space/src/templates/demo.html ADDED
@@ -0,0 +1,349 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <html lang="en">
2
+ <head>
3
+ <meta charset="utf-8"/>
4
+ <meta content="width=device-width, initial-scale=1" name="viewport"/>
5
+ <title>
6
+ Model Demo
7
+ </title>
8
+ <script src="https://cdn.tailwindcss.com">
9
+ </script>
10
+ <link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.15.3/css/all.min.css" rel="stylesheet"/>
11
+ <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;600&amp;display=swap" rel="stylesheet"/>
12
+ <style>
13
+ body {
14
+ font-family: "Inter", sans-serif;
15
+ }
16
+ </style>
17
+ </head>
18
+ <body class="min-h-screen bg-gradient-to-br from-[#e07db7] via-[#e07db7]/40 to-[#f47a2f] flex items-center justify-center p-6">
19
+ <div class="max-w-6xl w-full rounded-xl flex overflow-hidden drop-shadow-lg bg-gradient-to-br from-[#f7f8fa] to-[#e9ebf0]">
20
+ <!-- Sidebar -->
21
+ <aside class="w-48 flex flex-col bg-[#F0F0F5] border-r border-gray-200 select-none">
22
+ <nav class="flex flex-col mt-6 space-y-1 px-2">
23
+ <button id="homeBtn" class="flex items-center gap-3 text-xs font-semibold text-[#FF6A00] rounded-md py-3 px-4 border border-[#FF6A00] bg-white shadow-sm hover:bg-[#ff6a0040] transition" type="button">
24
+ <i class="fas fa-home text-sm"></i>
25
+ <span data-i18n="home">HOME</span>
26
+ </button>
27
+ <button id="historyBtn" class="flex items-center gap-3 text-xs font-semibold text-gray-700 rounded-md py-3 px-4 hover:bg-[#ff6a0040] hover:text-[#FF6A00] transition" type="button">
28
+ <i class="fas fa-clock text-sm"></i>
29
+ <span data-i18n="history">DETECT HISTORY</span>
30
+ </button>
31
+ <button id="statBtn" class="flex items-center gap-3 text-xs font-semibold text-gray-700 rounded-md py-3 px-4 hover:bg-[#ff6a0040] hover:text-[#FF6A00] transition" type="button">
32
+ <i class="fas fa-chart-bar text-sm"></i>
33
+ <span data-i18n="stat">STATISTICS</span>
34
+ </button>
35
+ <button id="settingSidebarBtn" class="flex items-center gap-3 text-xs font-semibold text-gray-700 rounded-md py-3 px-4 hover:bg-[#ff6a0040] hover:text-[#FF6A00] transition" type="button">
36
+ <i class="fas fa-sliders-h text-sm"></i>
37
+ <span data-i18n="settings">SETTINGS</span>
38
+ </button>
39
+ </nav>
40
+ </aside>
41
+ <!-- Main content -->
42
+ <main class="flex-1 flex flex-col p-6">
43
+ <!-- Topbar: Tabs + Search -->
44
+ <div class="flex flex-col md:flex-row md:items-center md:justify-between gap-4 md:gap-0 mb-8">
45
+ <!-- Tabs as segmented control card -->
46
+ <div aria-label="Main navigation tabs" class="inline-flex rounded-lg bg-white shadow-sm border border-gray-300 overflow-hidden text-xs font-semibold text-gray-600" role="tablist">
47
+ <button aria-selected="false" class="px-4 py-2 flex items-center gap-1 hover:text-[#FF6A00] transition focus:outline-none" role="tab" tabindex="0" type="button">
48
+ <img alt="Data Analysis icon, a small colorful bar chart" class="w-4 h-4" height="16" src="https://storage.googleapis.com/a1aa/image/97104fd7-5bb0-41e8-f1d2-7860810595dd.jpg" width="16"/>
49
+ <span data-i18n="dataAnalysis">Data Analysis</span>
50
+ </button>
51
+ <button aria-selected="false" class="px-4 py-2 flex items-center gap-1 hover:text-[#FF6A00] transition focus:outline-none" role="tab" tabindex="-1" type="button">
52
+ <img alt="Training Results icon, a small line chart with upward trend" class="w-4 h-4" height="16" src="https://storage.googleapis.com/a1aa/image/ac9104af-fd5c-488d-8f07-2e00044563e3.jpg" width="16"/>
53
+ <span data-i18n="trainingResults">Training Results</span>
54
+ </button>
55
+ <button aria-selected="true" class="px-4 py-2 flex items-center gap-1 bg-[#FF6A00] text-white rounded-lg focus:outline-none" role="tab" tabindex="0" type="button">
56
+ <img alt="Model Demo icon, a small green pencil" class="w-4 h-4" height="16" src="https://storage.googleapis.com/a1aa/image/ee1b8981-47ae-4e2b-3171-3006c09b5080.jpg" width="16"/>
57
+ <span data-i18n="modelDemo">Model Demo</span>
58
+ </button>
59
+ </div>
60
+ <!-- Search bar on right -->
61
+ <form aria-label="Search form" class="w-full max-w-xs md:max-w-sm" onsubmit="event.preventDefault()">
62
+ <label class="sr-only" for="search">
63
+ Search
64
+ </label>
65
+ <div class="relative w-full">
66
+ <!-- Search icon or input here if needed -->
67
+ </div>
68
+ </form>
69
+ </div>
70
+
71
+ <!-- Input type tabs (ẩn khi ở trang Setting) -->
72
+ <div id="innerTabs" class="flex gap-6 w-full mb-6">
73
+ <button id="textTabBtn" class="flex-1 bg-white font-semibold text-sm py-3 rounded-xl border border-gray-300 shadow-sm hover:shadow-md transition text-center tab-btn active" type="button">
74
+ <span data-i18n="enterText">✍️ Enter text</span>
75
+ </button>
76
+ <button id="fileTabBtn" class="flex-1 bg-white font-semibold text-sm py-3 rounded-xl border border-gray-300 shadow-sm hover:shadow-md transition text-center tab-btn" type="button">
77
+ 📄
78
+ <span class="font-bold" data-i18n="uploadFile">
79
+ Upload .txt or .docx file
80
+ </span>
81
+ </button>
82
+ </div>
83
+ <!-- Tab contents -->
84
+ <div id="tabContents" class="relative w-full flex-1">
85
+ <!-- Text input area -->
86
+ <form id="textForm" aria-label="Vietnamese text input form" class="w-full bg-white rounded-2xl p-6 shadow-sm border border-gray-200 tab-content transition-all duration-300" onsubmit="event.preventDefault()">
87
+ <label class="block text-xs font-normal text-gray-600 mb-2" for="vietnamese-text" data-i18n="enterTextLabel">
88
+ Enter Vietnamese text:
89
+ </label>
90
+ <textarea class="w-full rounded-[12px] border border-gray-300 bg-gray-50 text-gray-700 text-sm p-3 mb-2 resize-none shadow-sm focus:outline-none focus:ring-2 focus:ring-[#FF6A00] transition" id="vietnamese-text" maxlength="300" placeholder="Ví dụ: Nguyễn Văn A sinh sống tại TP.HCM" rows="5">Nguyễn Văn A đang làm việc tại Hà Nội</textarea>
91
+ <div aria-live="polite" class="flex justify-between items-center mb-4 text-xs text-gray-500 select-none">
92
+ <span id="charCount">
93
+ Characters: 38 / 300
94
+ </span>
95
+ <span id="wordCount">
96
+ Words: 7
97
+ </span>
98
+ </div>
99
+ <button class="inline-flex items-center gap-2 bg-[#FF6A00] text-white text-xs font-semibold py-2 px-4 rounded-xl shadow-md hover:bg-[#e65a00] disabled:bg-[#ffb380] disabled:cursor-not-allowed transition" type="submit">
100
+ <i class="fas fa-brain"></i>
101
+ <span data-i18n="analyze">Analyze</span>
102
+ </button>
103
+ <div id="textResult" class="mt-4 text-green-700 font-semibold hidden"></div>
104
+ </form>
105
+ <!-- File upload area -->
106
+ <form id="fileForm" aria-label="File upload form" class="w-full bg-white rounded-2xl p-6 shadow-sm border border-gray-200 tab-content transition-all duration-300 absolute top-0 left-0 opacity-0 pointer-events-none" onsubmit="event.preventDefault()">
107
+ <label class="block text-xs font-normal text-gray-600 mb-2" for="file-upload" data-i18n="uploadFileLabel">
108
+ Upload .txt or .docx file:
109
+ </label>
110
+ <input id="file-upload" type="file" accept=".txt,.docx" class="mb-4 block"/>
111
+ <button class="inline-flex items-center gap-2 bg-[#FF6A00] text-white text-xs font-semibold py-2 px-4 rounded-xl shadow-md hover:bg-[#e65a00] disabled:bg-[#ffb380] disabled:cursor-not-allowed transition" type="submit" id="analyzeFileBtn" disabled>
112
+ <i class="fas fa-brain"></i>
113
+ <span data-i18n="analyze">Analyze</span>
114
+ </button>
115
+ <div id="fileResult" class="mt-4 text-green-700 font-semibold hidden"></div>
116
+ </form>
117
+ <!-- Setting area (chỉ hiện khi vào Setting) -->
118
+ <form id="settingForm" class="w-full bg-white rounded-2xl p-6 shadow-sm border border-gray-200 tab-content transition-all duration-300 absolute top-0 left-0 opacity-0 pointer-events-none" onsubmit="event.preventDefault()">
119
+ <label class="block text-xs font-normal text-gray-600 mb-2" for="language-select" data-i18n="chooseLanguage">
120
+ Chọn ngôn ngữ / Select language:
121
+ </label>
122
+ <select id="language-select" class="w-full rounded-[12px] border border-gray-300 bg-gray-50 text-gray-700 text-sm p-3 mb-4 shadow-sm focus:outline-none focus:ring-2 focus:ring-[#FF6A00] transition">
123
+ <option value="vi">Tiếng Việt</option>
124
+ <option value="en">English</option>
125
+ <option value="zh">中文</option>
126
+ <option value="ja">日本語</option>
127
+ </select>
128
+ <button class="inline-flex items-center gap-2 bg-[#FF6A00] text-white text-xs font-semibold py-2 px-4 rounded-xl shadow-md hover:bg-[#e65a00] transition" type="submit" id="confirmSettingBtn">
129
+ <i class="fas fa-check"></i>
130
+ </button>
131
+ <div id="settingResult" class="mt-4 text-green-700 font-semibold hidden"></div>
132
+ </form>
133
+ </div>
134
+ </main>
135
+ <script>
136
+ const translations = {
137
+ en: {
138
+ home: "HOME",
139
+ history: "DETECT HISTORY",
140
+ stat: "STATISTICS",
141
+ settings: "SETTINGS",
142
+ dataAnalysis: "Data Analysis",
143
+ trainingResults: "Training Results",
144
+ modelDemo: "Model Demo",
145
+ enterText: "✍️ Enter text",
146
+ uploadFile: "Upload .txt or .docx file",
147
+ enterTextLabel: "Enter Vietnamese text:",
148
+ uploadFileLabel: "Upload .txt or .docx file:",
149
+ chooseLanguage: "Select language:",
150
+ confirm: "Confirm",
151
+ analyze: "Analyze"
152
+ },
153
+ vi: {
154
+ home: "TRANG CHỦ",
155
+ history: "LỊCH SỬ PHÁT HIỆN",
156
+ stat: "THỐNG KÊ",
157
+ settings: "CÀI ĐẶT",
158
+ dataAnalysis: "Phân tích dữ liệu",
159
+ trainingResults: "Kết quả huấn luyện",
160
+ modelDemo: "Demo mô hình",
161
+ enterText: "✍️ Nhập văn bản",
162
+ uploadFile: "Tải lên file .txt hoặc .docx",
163
+ enterTextLabel: "Nhập văn bản tiếng Việt:",
164
+ uploadFileLabel: "Tải lên file .txt hoặc .docx:",
165
+ chooseLanguage: "Chọn ngôn ngữ:",
166
+ confirm: "Xác nhận",
167
+ analyze: "Phân tích"
168
+ },
169
+ zh: {
170
+ home: "主页",
171
+ history: "检测历史",
172
+ stat: "统计",
173
+ settings: "设置",
174
+ dataAnalysis: "数据分析",
175
+ trainingResults: "训练结果",
176
+ modelDemo: "模型演示",
177
+ enterText: "✍️ 输入文本",
178
+ uploadFile: "上传 .txt 或 .docx 文件",
179
+ enterTextLabel: "输入越南语文本:",
180
+ uploadFileLabel: "上传 .txt 或 .docx 文件:",
181
+ chooseLanguage: "选择语言:",
182
+ confirm: "确认",
183
+ analyze: "分析"
184
+ },
185
+ ja: {
186
+ home: "ホーム",
187
+ history: "検出履歴",
188
+ stat: "統計",
189
+ settings: "設定",
190
+ dataAnalysis: "データ分析",
191
+ trainingResults: "トレーニング結果",
192
+ modelDemo: "モデルデモ",
193
+ enterText: "✍️ テキスト入力",
194
+ uploadFile: ".txt または .docx ファイルをアップロード",
195
+ enterTextLabel: "ベトナム語のテキストを入力:",
196
+ uploadFileLabel: ".txt または .docx ファイルをアップロード:",
197
+ chooseLanguage: "言語を選択:",
198
+ confirm: "確認",
199
+ analyze: "解析"
200
+ }
201
+ };
202
+
203
+ let currentLang = 'en';
204
+
205
+ function setLanguage(lang) {
206
+ currentLang = lang;
207
+ document.querySelectorAll('[data-i18n]').forEach(el => {
208
+ const key = el.getAttribute('data-i18n');
209
+ if (translations[lang][key]) {
210
+ el.textContent = translations[lang][key];
211
+ }
212
+ });
213
+ }
214
+
215
+ // Sidebar button logic
216
+ const homeBtn = document.getElementById('homeBtn');
217
+ const historyBtn = document.getElementById('historyBtn');
218
+ const statBtn = document.getElementById('statBtn');
219
+ const settingSidebarBtn = document.getElementById('settingSidebarBtn');
220
+ const innerTabs = document.getElementById('innerTabs');
221
+ const settingForm = document.getElementById('settingForm');
222
+ const textForm = document.getElementById('textForm');
223
+ const fileForm = document.getElementById('fileForm');
224
+
225
+ function showMainTabs() {
226
+ innerTabs.style.display = '';
227
+ textForm.style.position = '';
228
+ fileForm.style.position = 'absolute';
229
+ settingForm.classList.add('opacity-0', 'pointer-events-none');
230
+ settingForm.classList.remove('opacity-100');
231
+ activateTab('text');
232
+ }
233
+ function showSettingTab() {
234
+ innerTabs.style.display = 'none';
235
+ textForm.classList.add('opacity-0', 'pointer-events-none');
236
+ fileForm.classList.add('opacity-0', 'pointer-events-none');
237
+ settingForm.style.position = '';
238
+ settingForm.classList.remove('opacity-0', 'pointer-events-none');
239
+ settingForm.classList.add('opacity-100');
240
+ }
241
+
242
+ // Sidebar events
243
+ homeBtn.addEventListener('click', showMainTabs);
244
+ historyBtn.addEventListener('click', showMainTabs);
245
+ statBtn.addEventListener('click', showMainTabs);
246
+ settingSidebarBtn.addEventListener('click', showSettingTab);
247
+
248
+ // Language change logic
249
+ document.getElementById('language-select').addEventListener('change', function() {
250
+ setLanguage(this.value);
251
+ });
252
+ document.getElementById('settingForm').addEventListener('submit', function(e) {
253
+ e.preventDefault();
254
+ setLanguage(document.getElementById('language-select').value);
255
+ document.getElementById('settingResult').textContent =
256
+ currentLang === 'vi' ? '✔️ Đã đổi ngôn ngữ!' :
257
+ currentLang === 'en' ? '✔️ Language changed!' :
258
+ currentLang === 'zh' ? '✔️ 语言已更改!' :
259
+ '✔️ 言語が変更されました!';
260
+ document.getElementById('settingResult').classList.remove('hidden');
261
+ });
262
+
263
+ // Tab switching logic
264
+ const textTabBtn = document.getElementById('textTabBtn');
265
+ const fileTabBtn = document.getElementById('fileTabBtn');
266
+
267
+ function activateTab(tab) {
268
+ if (tab === 'text') {
269
+ textTabBtn.classList.add('active');
270
+ fileTabBtn.classList.remove('active');
271
+ textForm.classList.remove('opacity-0', 'pointer-events-none');
272
+ textForm.classList.add('opacity-100');
273
+ fileForm.classList.add('opacity-0', 'pointer-events-none');
274
+ fileForm.classList.remove('opacity-100');
275
+ } else {
276
+ fileTabBtn.classList.add('active');
277
+ textTabBtn.classList.remove('active');
278
+ fileForm.classList.remove('opacity-0', 'pointer-events-none');
279
+ fileForm.classList.add('opacity-100');
280
+ textForm.classList.add('opacity-0', 'pointer-events-none');
281
+ textForm.classList.remove('opacity-100');
282
+ }
283
+ }
284
+ textTabBtn.addEventListener('click', () => activateTab('text'));
285
+ fileTabBtn.addEventListener('click', () => activateTab('file'));
286
+
287
+ // Textarea character/word count
288
+ const textarea = document.getElementById('vietnamese-text');
289
+ const charCount = document.getElementById('charCount');
290
+ const wordCount = document.getElementById('wordCount');
291
+ textarea.addEventListener('input', () => {
292
+ charCount.textContent = `Characters: ${textarea.value.length} / 300`;
293
+ wordCount.textContent = `Words: ${textarea.value.trim().split(/\s+/).filter(Boolean).length}`;
294
+ });
295
+
296
+ // Analyze text
297
+ textForm.addEventListener('submit', () => {
298
+ const result = document.getElementById('textResult');
299
+ result.textContent = 'Processing...';
300
+ result.classList.remove('hidden');
301
+
302
+ fetch('http://localhost:5000/predict', {
303
+ method: 'POST',
304
+ headers: { 'Content-Type': 'application/json' },
305
+ body: JSON.stringify({ text: textarea.value })
306
+ })
307
+ .then(response => response.json())
308
+ .then(data => {
309
+ if (data.html_result) {
310
+ result.innerHTML = data.html_result;
311
+ } else {
312
+ result.textContent = 'No result.';
313
+ }
314
+ })
315
+ .catch(err => {
316
+ result.textContent = 'Error processing request.';
317
+ });
318
+ });
319
+
320
+
321
+ // File upload logic
322
+ const fileInput = document.getElementById('file-upload');
323
+ const analyzeFileBtn = document.getElementById('analyzeFileBtn');
324
+ fileInput.addEventListener('change', () => {
325
+ analyzeFileBtn.disabled = !fileInput.files.length;
326
+ });
327
+ fileForm.addEventListener('submit', () => {
328
+ const file = fileInput.files[0];
329
+ const result = document.getElementById('fileResult');
330
+ if (file) {
331
+ result.textContent = `Đã tải lên và phân tích file: ${file.name}`;
332
+ result.classList.remove('hidden');
333
+ }
334
+ });
335
+
336
+ // Khởi tạo tab đầu tiên và ngôn ngữ mặc định
337
+ activateTab('text');
338
+ setLanguage(currentLang);
339
+ </script>
340
+ <style>
341
+ .tab-content {
342
+ transition: opacity 0.3s;
343
+ }
344
+ .tab-btn.active {
345
+ background: #FF6A00 !important;
346
+ color: #fff !important;
347
+ box-shadow: 0 2px 8px #ff6a0033;
348
+ }
349
+ </style>
space/space/space/space/space/src/train.py CHANGED
@@ -1,6 +1,6 @@
1
  import wandb
2
  from tqdm import tqdm
3
- from src.evaluate import evaluate
4
  import torch
5
 
6
  def train_model(model, optimizer, configs, loaders):
 
1
  import wandb
2
  from tqdm import tqdm
3
+ from evaluate import evaluate
4
  import torch
5
 
6
  def train_model(model, optimizer, configs, loaders):