Spaces:
Sleeping
Sleeping
GitHub Actions
commited on
Commit
·
95062a5
1
Parent(s):
2f9d738
Auto-deploy from GitHub (binary files removed)
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- README.md +1 -1
- requirements.txt +0 -0
- space/README.md +1 -1
- space/space/space/README.md +1 -1
- space/space/space/space/space/requirements.txt +0 -0
- space/space/space/space/space/space/README.md +1 -1
- space/space/space/space/space/space/space/space/space/space/README.md +156 -52
- space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/results/output.log +88 -0
- space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/results/output.py +116 -0
- space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/.gitignore +0 -2
- space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/requirements.txt +0 -0
- space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/.github/workflows/main.yml +47 -0
- space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/.gitignore +23 -0
- space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/LICENSE +201 -0
- space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/configs/config.yaml +1 -0
- space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/environment.yml +9 -0
- space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/models/best_epoch_16.pt +3 -0
- space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/notebooks/Duc_Notebook.ipynb +0 -0
- space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/notebooks/Kien_RF_lightgbm.ipynb +741 -0
- space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/notebooks/Kien_Rule_base.ipynb +0 -0
- space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/notebooks/Softmax_PhoBERT.ipynb +0 -0
- space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/requirements.txt +0 -0
- space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/run.py +73 -0
- space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/.gitattributes +35 -0
- space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/README.md +87 -0
- space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/__init__.py +4 -0
- space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/app.py +64 -0
- space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/configs.py +15 -0
- space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/data_set.py +31 -0
- space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/evaluate.py +21 -0
- space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/front.py +32 -0
- space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/model.py +16 -0
- space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/predict.py +46 -0
- space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/preprocessing.py +171 -0
- space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/torchcrf/__init__.py +340 -0
- space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/train.py +98 -0
- space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/app.py +96 -14
- space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/predict.py +1 -1
- space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/st.py +98 -0
- space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/predict.py +5 -1
- space/space/space/space/space/space/space/space/space/space/space/space/space/st.py +4 -95
- space/space/space/space/space/space/space/space/space/space/space/src/app.py +21 -8
- space/space/space/space/space/space/space/space/space/space/space/st.py +20 -4
- space/space/space/space/space/space/space/space/src/app.py +41 -36
- space/space/space/space/space/space/space/src/app.py +6 -13
- space/space/space/space/space/src/app.py +0 -157
- space/space/space/space/space/src/predict.py +3 -8
- space/space/space/space/space/src/preprocessing.py +1 -1
- space/space/space/space/space/src/templates/demo.html +349 -0
- space/space/space/space/space/src/train.py +1 -1
README.md
CHANGED
@@ -17,7 +17,7 @@ A comprehensive Vietnamese Named Entity Recognition system using state-of-the-ar
|
|
17 |
|
18 |
Try the interactive demo: **[Vietnamese NER Demo](https://huggingface.co/spaces/DucLai/Vietnamese_NER)**
|
19 |
|
20 |
-
**
|
19 |
|
20 |
+

|
21 |
|
22 |
## 🔄 Project Workflow
|
23 |
|
requirements.txt
CHANGED
Binary files a/requirements.txt and b/requirements.txt differ
|
|
space/README.md
CHANGED
@@ -17,7 +17,7 @@ A comprehensive Vietnamese Named Entity Recognition system using state-of-the-ar
|
|
17 |
|
18 |
Try the interactive demo: **[Vietnamese NER Demo](https://huggingface.co/spaces/DucLai/Vietnamese_NER)**
|
19 |
|
20 |
-
**
|
19 |
|
20 |
+

|
21 |
|
22 |
## 🔄 Project Workflow
|
23 |
|
space/space/space/README.md
CHANGED
@@ -17,7 +17,7 @@ A comprehensive Vietnamese Named Entity Recognition system using state-of-the-ar
|
|
17 |
|
18 |
Try the interactive demo: **[Vietnamese NER Demo](https://huggingface.co/spaces/DucLai/Vietnamese_NER)**
|
19 |
|
20 |
-
**
|
19 |
|
20 |
+

|
21 |
|
22 |
## 🔄 Project Workflow
|
23 |
|
space/space/space/space/space/requirements.txt
CHANGED
Binary files a/space/space/space/space/space/requirements.txt and b/space/space/space/space/space/requirements.txt differ
|
|
space/space/space/space/space/space/README.md
CHANGED
@@ -17,7 +17,7 @@ A comprehensive Vietnamese Named Entity Recognition system using state-of-the-ar
|
|
17 |
|
18 |
Try the interactive demo: **[Vietnamese NER Demo](https://huggingface.co/spaces/DucLai/Vietnamese_NER)**
|
19 |
|
20 |
-
**
|
19 |
|
20 |
+

|
21 |
|
22 |
## 🔄 Project Workflow
|
23 |
|
space/space/space/space/space/space/space/space/space/space/README.md
CHANGED
@@ -8,80 +8,184 @@ sdk_version: 1.46.1
|
|
8 |
app_file: src/app.py
|
9 |
pinned: false
|
10 |
---
|
11 |
-
# Vietnamese Named Entity Recognition
|
12 |
|
13 |
-
|
14 |
|
15 |
-
### Option 1: Using `requirements.txt`
|
16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
```bash
|
|
|
18 |
conda create --name vnner python=3.10
|
19 |
conda activate vnner
|
|
|
|
|
20 |
pip install -r requirements.txt
|
21 |
```
|
22 |
|
23 |
### Option 2: Using `environment.yml`
|
24 |
-
|
25 |
```bash
|
|
|
26 |
conda env create -f environment.yml
|
27 |
conda activate vnner
|
28 |
```
|
29 |
|
30 |
-
##
|
|
|
|
|
31 |
```bash
|
32 |
python run.py
|
33 |
```
|
34 |
-
---
|
35 |
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
my_ai_project/
|
40 |
-
│
|
41 |
-
├── data/
|
42 |
-
│ ├── raw_data.csv # Dữ liệu gốc
|
43 |
-
│ ├── processed_data_EDA.csv # Dữ liệu sau khi tiền xử lý
|
44 |
-
│ └── processed_data_full.csv # Dữ liệu sẵn sàng training
|
45 |
-
│
|
46 |
-
├── notebooks/ # Thử nghiệm và khám phá dữ liệu
|
47 |
-
│ ├── Duc_Notebook.ipynb # CRF + RandomForest
|
48 |
-
│ ├── Softmax_PhoBERT.ipynb # Softmax
|
49 |
-
│
|
50 |
-
├── src/ # Mã nguồn chính của dự án
|
51 |
-
│ ├── __init__.py
|
52 |
-
│ ├── data_loader.py # Nạp và xử lý dữ liệu
|
53 |
-
│ ├── preprocessing.py # Hàm tiền xử lý dữ liệu
|
54 |
-
│ ├── model.py # Định nghĩa kiến trúc mô hình
|
55 |
-
│ ├── train.py # Huấn luyện mô hình
|
56 |
-
│ ├── evaluate.py # Đánh giá mô hình
|
57 |
-
│ └── predict.py # Dự đoán với mô hình đã huấn luyện
|
58 |
-
│
|
59 |
-
├── models/ # Mô hình đã lưu sau khi huấn luyện
|
60 |
-
│ └── best_model.pth # File trọng số mô hình
|
61 |
-
│
|
62 |
-
├── outputs/ # Kết quả, biểu đồ, log, metrics
|
63 |
-
│ ├── logs/ # Nhật ký huấn luyện (tensorboard/logging)
|
64 |
-
│ └── figures/ # Biểu đồ trực quan hóa
|
65 |
-
│
|
66 |
-
├── configs/ # File cấu hình cho mô hình, huấn luyện
|
67 |
-
│ └── config.yaml
|
68 |
-
│
|
69 |
-
├── tests/ # Unit test cho các hàm chính
|
70 |
-
│
|
71 |
-
├── requirements.txt # Thư viện cần cài đặt
|
72 |
-
├── environment.yml # Môi trường Conda
|
73 |
-
├── README.md # Giới thiệu dự án
|
74 |
-
└── run.py # Script chính để chạy toàn bộ pipeline
|
75 |
```
|
76 |
|
77 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
78 |
|
79 |
-
##
|
80 |
|
81 |
-
|
82 |
|
83 |
-
|
84 |
-
|
85 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
86 |
|
87 |
-
|
|
|
8 |
app_file: src/app.py
|
9 |
pinned: false
|
10 |
---
|
11 |
+
# Vietnamese Named Entity Recognition (NER) 🧠
|
12 |
|
13 |
+
A comprehensive Vietnamese Named Entity Recognition system using state-of-the-art deep learning models including PhoBERT, CRF, and ensemble methods.
|
14 |
|
|
|
15 |
|
16 |
+
## 🚀 Live Demo
|
17 |
+
|
18 |
+
Try the interactive demo: **[Vietnamese NER Demo](https://huggingface.co/spaces/DucLai/Vietnamese_NER)**
|
19 |
+
|
20 |
+

|
21 |
+
|
22 |
+
## 🔄 Project Workflow
|
23 |
+
|
24 |
+

|
25 |
+
|
26 |
+
## 🎯 Overview
|
27 |
+
|
28 |
+
This project implements a robust Vietnamese Named Entity Recognition system that can identify and classify entities in Vietnamese text. The system combines multiple approaches including:
|
29 |
+
|
30 |
+
- **PhoBERT-based embeddings** for contextual understanding
|
31 |
+
- **Conditional Random Fields (CRF)** for sequence labeling
|
32 |
+
- **Random Forest** with semantic embeddings
|
33 |
+
- **Rule-based methods** for enhanced accuracy
|
34 |
+
|
35 |
+
## 📂 Project Structure
|
36 |
+
|
37 |
+
```
|
38 |
+
VIETNAMESE_NER/
|
39 |
+
│
|
40 |
+
├── .github/workflows
|
41 |
+
│ └── main.yml # Auto deploy to Hugging Space
|
42 |
+
│
|
43 |
+
├── data/ # Dataset files
|
44 |
+
│ └── raw_data.csv # Raw training data
|
45 |
+
│
|
46 |
+
├── notebooks/ # Jupyter notebooks for experimentation
|
47 |
+
│ ├── Duc_Notebook.ipynb # CRF + RandomForest experiments
|
48 |
+
│ ├── Softmax_PhoBERT.ipynb # Softmax approach
|
49 |
+
│ ├── Kien_Rule_base.ipynb # Rule-based method with RF
|
50 |
+
│ └── Kien_RF_lightgbm.ipynb # RF with semantic embeddings
|
51 |
+
│
|
52 |
+
├── src/ # Main source code
|
53 |
+
│ ├── __init__.py
|
54 |
+
│ ├── app.py # Streamlit web application
|
55 |
+
│ ├── front.py # Highlight function
|
56 |
+
│ ├── config.py # Project configuration
|
57 |
+
│ ├── data_loader.py # Data loading utilities
|
58 |
+
│ ├── preprocessing.py # Data preprocessing functions
|
59 |
+
│ ├── model.py # Model architecture definitions
|
60 |
+
│ ├── train.py # Training pipeline
|
61 |
+
│ ├── evaluate.py # Model evaluation
|
62 |
+
│ └── predict.py # Inference utilities
|
63 |
+
│
|
64 |
+
├── models/ # Saved model artifacts
|
65 |
+
│ └── best_model.pt # Best trained model weights
|
66 |
+
│
|
67 |
+
├── outputs/ # Training outputs
|
68 |
+
│ ├── output.log # Training logs (TensorBoard)
|
69 |
+
│ └── figures/ # Visualization plots
|
70 |
+
│
|
71 |
+
├── tests/ # Unit tests (planned)
|
72 |
+
│
|
73 |
+
├── requirements.txt # Python dependencies
|
74 |
+
├── environment.yml # Conda environment file
|
75 |
+
├── README.md # Project documentation
|
76 |
+
└── run.py # Main training script
|
77 |
+
```
|
78 |
+
|
79 |
+
|
80 |
+
## 🏗️ Model Architecture
|
81 |
+
|
82 |
+
The system uses a hybrid architecture combining the strengths of different approaches:
|
83 |
+
|
84 |
+

|
85 |
+
|
86 |
+
### Core Components:
|
87 |
+
- **PhoBERT-Base**: Generates contextual embeddings for Vietnamese text
|
88 |
+
- **Linear + CRF Layer**: Handles sequence labeling with context awareness
|
89 |
+
- **Softmax/Random Forest**: Provides single-label prediction capabilities
|
90 |
+
|
91 |
+
## 📊 Dataset & Performance
|
92 |
+
|
93 |
+
### Dataset: VLSP2016
|
94 |
+
The model is trained on the VLSP2016 dataset extracted from Vietnamese news articles.
|
95 |
+
|
96 |
+
#### Dataset Statistics:
|
97 |
+
<table>
|
98 |
+
<tr>
|
99 |
+
<td><img src="https://github.com/user-attachments/assets/20116929-1556-44b2-86e9-086b72320f22" alt="Entity Frequency" width="600"/></td>
|
100 |
+
<td><img src="https://github.com/user-attachments/assets/9cafb068-bbda-4ee1-9fc9-bd4edded1438" alt="Entity Distribution" width="600"/></td>
|
101 |
+
</tr>
|
102 |
+
<tr>
|
103 |
+
<td><img src="https://github.com/user-attachments/assets/db9421c0-4e9c-4654-92d0-d924932384dc" alt="Token Length Distribution" width="600"/></td>
|
104 |
+
<td><img src="https://github.com/user-attachments/assets/70871bc5-ccb4-4186-9538-ac479c771415" alt="Sentence Length Distribution" width="600"/></td>
|
105 |
+
</tr>
|
106 |
+
</table>
|
107 |
+
|
108 |
+
|
109 |
+
### Model Performance:
|
110 |
+
<table>
|
111 |
+
<tr>
|
112 |
+
<td>
|
113 |
+
<img src="https://github.com/user-attachments/assets/9fb24f3a-466c-46f1-94d2-bcb6f26abd72" alt="F1 Score" width="600"/>
|
114 |
+
</td>
|
115 |
+
<td>
|
116 |
+
<img src="https://github.com/user-attachments/assets/11b8080a-38d6-4ea2-b350-21361345fd1e" alt="Training Loss" width="600"/>
|
117 |
+
</td>
|
118 |
+
</tr>
|
119 |
+
</table>
|
120 |
+
|
121 |
+

|
122 |
+
|
123 |
+
## 🛠️ Installation & Setup
|
124 |
+
|
125 |
+
### Prerequisites
|
126 |
+
- Python 3.10+
|
127 |
+
- Conda (recommended)
|
128 |
+
|
129 |
+
### Option 1: Using `requirements.txt`
|
130 |
```bash
|
131 |
+
# Create and activate conda environment
|
132 |
conda create --name vnner python=3.10
|
133 |
conda activate vnner
|
134 |
+
|
135 |
+
# Install dependencies
|
136 |
pip install -r requirements.txt
|
137 |
```
|
138 |
|
139 |
### Option 2: Using `environment.yml`
|
|
|
140 |
```bash
|
141 |
+
# Create environment from yml file
|
142 |
conda env create -f environment.yml
|
143 |
conda activate vnner
|
144 |
```
|
145 |
|
146 |
+
## 🚀 Quick Start
|
147 |
+
|
148 |
+
### Training the Model
|
149 |
```bash
|
150 |
python run.py
|
151 |
```
|
|
|
152 |
|
153 |
+
### Running the Streamlit App
|
154 |
+
```bash
|
155 |
+
python src/app.py
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
156 |
```
|
157 |
|
158 |
+
## 🧪 Experimental Approaches
|
159 |
+
|
160 |
+
The project explores multiple methodologies:
|
161 |
+
|
162 |
+
1. **PhoBERT + CRF**: Sequential labeling with contextual embeddings
|
163 |
+
2. **PhoBERT + Softmax**: Direct classification approach
|
164 |
+
3. **Random Forest + Rule-based**: Traditional ML with linguistic rules
|
165 |
+
4. **Random Forest + Semantic Embeddings**: Enhanced feature engineering
|
166 |
|
167 |
+
## 🤝 Contributing
|
168 |
|
169 |
+
Contributions are welcome! Please feel free to submit a Pull Request.
|
170 |
|
171 |
+
## 📄 License
|
172 |
+
|
173 |
+
This project is open source. Please check the repository for license details.
|
174 |
+
|
175 |
+
## 🙏 Acknowledgments
|
176 |
+
|
177 |
+
- VLSP2016 dataset providers
|
178 |
+
- PhoBERT model creators
|
179 |
+
- Hugging Face for hosting the demo
|
180 |
+
|
181 |
+
## 📚 Additional Resources
|
182 |
+
|
183 |
+
For better understanding of the project structure and technologies used:
|
184 |
+
|
185 |
+
- [Understanding `__init__.py`](https://zetcode.com/python/init-file/)
|
186 |
+
- [Markdown Basic Syntax](https://www.markdownguide.org/basic-syntax/#escaping-characters)
|
187 |
+
- [Requirements.txt vs Environment.yml](https://www.reddit.com/r/learnpython/comments/xvlpdz/why_do_people_provide_a_requirementstxt_or/)
|
188 |
+
|
189 |
+
---
|
190 |
|
191 |
+
**Happy NER-ing! 🎯**
|
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/results/output.log
ADDED
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Train Epoch 1/20: 100%|██████████| 736/736 [00:22<00:00, 32.46it/s, avg_loss=2.69, batch_loss=0.947]
|
2 |
+
Epoch 1: train_loss=2.6912, train_f1=0.8224, val_loss=1.0848, val_f1=0.8273
|
3 |
+
Saved imporved model to ./models/best_epoch_1.pt
|
4 |
+
Train Epoch 2/20: 100%|██████████| 736/736 [00:21<00:00, 33.55it/s, avg_loss=0.806, batch_loss=0.998]
|
5 |
+
|
6 |
+
Epoch 2: train_loss=0.8061, train_f1=0.8674, val_loss=0.7191, val_f1=0.8613
|
7 |
+
Saved imporved model to ./models/best_epoch_2.pt
|
8 |
+
Train Epoch 3/20: 100%|██████████| 736/736 [00:22<00:00, 32.59it/s, avg_loss=0.584, batch_loss=0.0527]
|
9 |
+
|
10 |
+
Epoch 3: train_loss=0.5842, train_f1=0.8996, val_loss=0.5643, val_f1=0.8895
|
11 |
+
Saved imporved model to ./models/best_epoch_3.pt
|
12 |
+
Train Epoch 4/20: 100%|██████████| 736/736 [00:23<00:00, 31.34it/s, avg_loss=0.478, batch_loss=1.06]
|
13 |
+
|
14 |
+
Epoch 4: train_loss=0.4782, train_f1=0.9122, val_loss=0.4838, val_f1=0.8994
|
15 |
+
Saved imporved model to ./models/best_epoch_4.pt
|
16 |
+
Train Epoch 5/20: 100%|██████████| 736/736 [00:22<00:00, 32.59it/s, avg_loss=0.406, batch_loss=0.421]
|
17 |
+
|
18 |
+
Epoch 5: train_loss=0.4056, train_f1=0.9254, val_loss=0.4281, val_f1=0.9101
|
19 |
+
Saved imporved model to ./models/best_epoch_5.pt
|
20 |
+
Train Epoch 6/20: 100%|██████████| 736/736 [00:21<00:00, 34.15it/s, avg_loss=0.36, batch_loss=1.01]
|
21 |
+
|
22 |
+
Epoch 6: train_loss=0.3599, train_f1=0.9343, val_loss=0.3934, val_f1=0.9190
|
23 |
+
Saved imporved model to ./models/best_epoch_6.pt
|
24 |
+
Train Epoch 7/20: 100%|██████████| 736/736 [00:22<00:00, 33.08it/s, avg_loss=0.322, batch_loss=0.392]
|
25 |
+
|
26 |
+
Epoch 7: train_loss=0.3218, train_f1=0.9383, val_loss=0.3751, val_f1=0.9192
|
27 |
+
Saved imporved model to ./models/best_epoch_7.pt
|
28 |
+
Train Epoch 8/20: 100%|██████████| 736/736 [00:22<00:00, 32.66it/s, avg_loss=0.294, batch_loss=0.468]
|
29 |
+
|
30 |
+
Epoch 8: train_loss=0.2942, train_f1=0.9424, val_loss=0.3560, val_f1=0.9189
|
31 |
+
Train Epoch 9/20: 100%|██████████| 736/736 [00:23<00:00, 31.68it/s, avg_loss=0.27, batch_loss=0.681]
|
32 |
+
|
33 |
+
Epoch 9: train_loss=0.2699, train_f1=0.9429, val_loss=0.3521, val_f1=0.9177
|
34 |
+
Train Epoch 10/20: 100%|██████████| 736/736 [00:21<00:00, 33.46it/s, avg_loss=0.252, batch_loss=0.525]
|
35 |
+
|
36 |
+
Epoch 10: train_loss=0.2517, train_f1=0.9493, val_loss=0.3413, val_f1=0.9222
|
37 |
+
Saved imporved model to ./models/best_epoch_10.pt
|
38 |
+
Train Epoch 11/20: 100%|██████████| 736/736 [00:22<00:00, 32.92it/s, avg_loss=0.238, batch_loss=0.022]
|
39 |
+
|
40 |
+
Epoch 11: train_loss=0.2383, train_f1=0.9551, val_loss=0.3292, val_f1=0.9232
|
41 |
+
Saved imporved model to ./models/best_epoch_11.pt
|
42 |
+
Train Epoch 12/20: 100%|██████████| 736/736 [00:23<00:00, 31.72it/s, avg_loss=0.222, batch_loss=0.529]
|
43 |
+
|
44 |
+
Epoch 12: train_loss=0.2223, train_f1=0.9543, val_loss=0.3305, val_f1=0.9207
|
45 |
+
Train Epoch 13/20: 100%|██████████| 736/736 [00:23<00:00, 31.74it/s, avg_loss=0.213, batch_loss=0.381]
|
46 |
+
|
47 |
+
Epoch 13: train_loss=0.2127, train_f1=0.9593, val_loss=0.3244, val_f1=0.9221
|
48 |
+
Train Epoch 14/20: 100%|██████████| 736/736 [00:23<00:00, 31.69it/s, avg_loss=0.203, batch_loss=0.279]
|
49 |
+
|
50 |
+
Epoch 14: train_loss=0.2026, train_f1=0.9609, val_loss=0.3213, val_f1=0.9224
|
51 |
+
Train Epoch 15/20: 100%|██████████| 736/736 [00:23<00:00, 31.84it/s, avg_loss=0.193, batch_loss=0.0462]
|
52 |
+
|
53 |
+
Epoch 15: train_loss=0.1925, train_f1=0.9574, val_loss=0.3392, val_f1=0.9117
|
54 |
+
Train Epoch 16/20: 100%|██████████| 736/736 [00:22<00:00, 32.11it/s, avg_loss=0.186, batch_loss=0.943]
|
55 |
+
|
56 |
+
Epoch 16: train_loss=0.1863, train_f1=0.9654, val_loss=0.3169, val_f1=0.9250
|
57 |
+
Saved imporved model to ./models/best_epoch_16.pt
|
58 |
+
Train Epoch 17/20: 100%|██████████| 736/736 [00:22<00:00, 32.38it/s, avg_loss=0.18, batch_loss=0.113]
|
59 |
+
|
60 |
+
Epoch 17: train_loss=0.1795, train_f1=0.9677, val_loss=0.3187, val_f1=0.9237
|
61 |
+
Train Epoch 18/20: 100%|██████████| 736/736 [00:22<00:00, 33.30it/s, avg_loss=0.173, batch_loss=0.00558]
|
62 |
+
|
63 |
+
Epoch 18: train_loss=0.1728, train_f1=0.9692, val_loss=0.3219, val_f1=0.9173
|
64 |
+
Train Epoch 19/20: 100%|██████████| 736/736 [00:23<00:00, 31.48it/s, avg_loss=0.167, batch_loss=0.115]
|
65 |
+
|
66 |
+
Epoch 19: train_loss=0.1673, train_f1=0.9681, val_loss=0.3261, val_f1=0.9195
|
67 |
+
Train Epoch 20/20: 100%|██████████| 736/736 [00:22<00:00, 32.17it/s, avg_loss=0.164, batch_loss=0.0463]
|
68 |
+
|
69 |
+
Epoch 20: train_loss=0.1640, train_f1=0.9715, val_loss=0.3230, val_f1=0.9185
|
70 |
+
|
71 |
+
Loading best model from ./models/best_epoch_16.pt for final evaluation...
|
72 |
+
Done
|
73 |
+
|
74 |
+
Evaluation on test set ...
|
75 |
+
Test_loss=0.2967, Test_f1=0.9087
|
76 |
+
precision recall f1-score support
|
77 |
+
|
78 |
+
0 1.00 1.00 1.00 51036
|
79 |
+
1 0.99 0.98 0.99 1112
|
80 |
+
2 0.97 0.99 0.98 506
|
81 |
+
3 0.86 0.79 0.82 180
|
82 |
+
4 0.84 0.80 0.82 291
|
83 |
+
5 0.89 0.91 0.90 939
|
84 |
+
6 0.87 0.84 0.86 428
|
85 |
+
|
86 |
+
accuracy 0.99 54492
|
87 |
+
macro avg 0.92 0.90 0.91 54492
|
88 |
+
weighted avg 0.99 0.99 0.99 54492
|
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/results/output.py
ADDED
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Model Results
|
2 |
+
training_log = {
|
3 |
+
"epoch": list(range(1, 21)),
|
4 |
+
"train_loss": [
|
5 |
+
2.6912, 0.8061, 0.5842, 0.4782, 0.4056,
|
6 |
+
0.3599, 0.3218, 0.2942, 0.2699, 0.2517,
|
7 |
+
0.2383, 0.2223, 0.2127, 0.2026, 0.1925,
|
8 |
+
0.1863, 0.1795, 0.1728, 0.1673, 0.1640
|
9 |
+
],
|
10 |
+
"val_loss": [
|
11 |
+
1.0848, 0.7191, 0.5643, 0.4838, 0.4281,
|
12 |
+
0.3934, 0.3751, 0.3560, 0.3521, 0.3413,
|
13 |
+
0.3292, 0.3305, 0.3244, 0.3213, 0.3392,
|
14 |
+
0.3169, 0.3187, 0.3219, 0.3261, 0.3230
|
15 |
+
],
|
16 |
+
"train_f1": [
|
17 |
+
0.8224, 0.8674, 0.8996, 0.9122, 0.9254,
|
18 |
+
0.9343, 0.9383, 0.9424, 0.9429, 0.9493,
|
19 |
+
0.9551, 0.9543, 0.9593, 0.9609, 0.9574,
|
20 |
+
0.9654, 0.9677, 0.9692, 0.9681, 0.9715
|
21 |
+
],
|
22 |
+
"val_f1": [
|
23 |
+
0.8273, 0.8613, 0.8895, 0.8994, 0.9101,
|
24 |
+
0.9190, 0.9192, 0.9189, 0.9177, 0.9222,
|
25 |
+
0.9232, 0.9207, 0.9221, 0.9224, 0.9117,
|
26 |
+
0.9250, 0.9237, 0.9173, 0.9195, 0.9185
|
27 |
+
]
|
28 |
+
}
|
29 |
+
|
30 |
+
report_dict = {
|
31 |
+
'O': {"precision": 1.00, "recall": 1.00, "f1-score": 1.00, "support": 51036},
|
32 |
+
'B-PER': {"precision": 0.99, "recall": 0.98, "f1-score": 0.98, "support": 1112},
|
33 |
+
'I-PER': {"precision": 0.97, "recall": 0.99, "f1-score": 0.98, "support": 506},
|
34 |
+
'B-ORG': {"precision": 0.93, "recall": 0.95, "f1-score": 0.94, "support": 939},
|
35 |
+
'I-ORG': {"precision": 0.93, "recall": 0.91, "f1-score": 0.92, "support": 428},
|
36 |
+
'B-LOC': {"precision": 0.83, "recall": 0.84, "f1-score": 0.84, "support": 180},
|
37 |
+
'I-LOC': {"precision": 0.88, "recall": 0.84, "f1-score": 0.86, "support": 291},
|
38 |
+
"accuracy": 0.99,
|
39 |
+
"macro avg": {"precision": 0.93, "recall": 0.93, "f1-score": 0.93, "support": 54492},
|
40 |
+
"weighted avg": {"precision": 0.99, "recall": 0.99, "f1-score": 0.99, "support": 54492}
|
41 |
+
}
|
42 |
+
|
43 |
+
|
44 |
+
report_dict_2 = {
|
45 |
+
'O': {"precision": 1.00, "recall": 1.00, "f1-score": 1.00, "support": 68476},
|
46 |
+
'B-PER': {"precision": 0.99, "recall": 0.98, "f1-score": 0.98, "support": 1464},
|
47 |
+
'I-PER': {"precision": 0.98, "recall": 0.98, "f1-score": 0.98, "support": 686},
|
48 |
+
'B-ORG': {"precision": 0.77, "recall": 0.82, "f1-score": 0.80, "support": 257},
|
49 |
+
'I-ORG': {"precision": 0.80, "recall": 0.77, "f1-score": 0.78, "support": 430},
|
50 |
+
'B-LOC': {"precision": 0.88, "recall": 0.90, "f1-score": 0.89, "support": 1241},
|
51 |
+
'I-LOC': {"precision": 0.83, "recall": 0.82, "f1-score": 0.82, "support": 554},
|
52 |
+
"accuracy": 0.99,
|
53 |
+
"macro avg": {"precision": 0.89, "recall": 0.89, "f1-score": 0.89, "support": 73108},
|
54 |
+
"weighted avg": {"precision": 0.99, "recall": 0.99, "f1-score": 0.99, "support": 73108}
|
55 |
+
}
|
56 |
+
|
57 |
+
|
58 |
+
model_compare = {
|
59 |
+
"Header": ["Model", "F1", "Accuracy"],
|
60 |
+
"Data": {
|
61 |
+
"PhoBERT + CRF": {"F1": 0.93, "Accuracy": 0.99},
|
62 |
+
"CRF": {"F1": 0.91, "Accuracy": 0.99},
|
63 |
+
"Softmax": {"F1": 0.89, "Accuracy": 0.99},
|
64 |
+
"Random Forest": {"F1": 0.78, "Accuracy": 0.98}
|
65 |
+
}
|
66 |
+
}
|
67 |
+
|
68 |
+
data_compare = {
|
69 |
+
"Header": ["Data Preprocessing Strategy", "F1"],
|
70 |
+
"Data": {
|
71 |
+
"Raw": 0.93,
|
72 |
+
"Crawl for Balance": 0.91,
|
73 |
+
"Remove Sentences with Only 'O' Tags": 0.91
|
74 |
+
}
|
75 |
+
}
|
76 |
+
|
77 |
+
|
78 |
+
|
79 |
+
# EDA
|
80 |
+
data_aug_count_sorted = {
|
81 |
+
'B-PER': 474,
|
82 |
+
'I-PER': 121,
|
83 |
+
'B-LOC': 874,
|
84 |
+
'I-LOC': 289,
|
85 |
+
'B-ORG': 1110,
|
86 |
+
'I-ORG': 761
|
87 |
+
}
|
88 |
+
|
89 |
+
raw_data_count_sorted = {
|
90 |
+
'B-PER': 7479,
|
91 |
+
'I-PER': 3522,
|
92 |
+
'B-LOC': 6244,
|
93 |
+
'I-LOC': 2783,
|
94 |
+
'B-ORG': 1212,
|
95 |
+
'I-ORG': 2055,
|
96 |
+
'B-NAT': 282,
|
97 |
+
'I-NAT': 279
|
98 |
+
}
|
99 |
+
|
100 |
+
raw_data_count_withoutNAT_sorted = {
|
101 |
+
'B-PER': 7479,
|
102 |
+
'I-PER': 3522,
|
103 |
+
'B-LOC': 6244,
|
104 |
+
'I-LOC': 2783,
|
105 |
+
'B-ORG': 1212,
|
106 |
+
'I-ORG': 2055
|
107 |
+
}
|
108 |
+
|
109 |
+
combined_count_sorted = {
|
110 |
+
'B-PER': 7953,
|
111 |
+
'I-PER': 3643,
|
112 |
+
'B-LOC': 7118,
|
113 |
+
'I-LOC': 3072,
|
114 |
+
'B-ORG': 2322,
|
115 |
+
'I-ORG': 2816
|
116 |
+
}
|
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/.gitignore
CHANGED
@@ -10,8 +10,6 @@ __pycache__/
|
|
10 |
|
11 |
# Dataset and results folders
|
12 |
data/
|
13 |
-
results/
|
14 |
-
outputs/
|
15 |
logs/
|
16 |
|
17 |
# Large files
|
|
|
10 |
|
11 |
# Dataset and results folders
|
12 |
data/
|
|
|
|
|
13 |
logs/
|
14 |
|
15 |
# Large files
|
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/requirements.txt
CHANGED
Binary files a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/requirements.txt and b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/requirements.txt differ
|
|
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/.github/workflows/main.yml
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: Deploy to Hugging Face Space
|
2 |
+
|
3 |
+
on:
|
4 |
+
push:
|
5 |
+
branches:
|
6 |
+
- main # hoặc branch bạn dùng
|
7 |
+
|
8 |
+
jobs:
|
9 |
+
deploy:
|
10 |
+
runs-on: ubuntu-latest
|
11 |
+
|
12 |
+
steps:
|
13 |
+
- name: Checkout repo
|
14 |
+
uses: actions/checkout@v3
|
15 |
+
|
16 |
+
- name: Set up Git
|
17 |
+
run: |
|
18 |
+
git config --global user.email "[email protected]"
|
19 |
+
git config --global user.name "GitHub Actions"
|
20 |
+
|
21 |
+
- name: Push to Hugging Face Spaces
|
22 |
+
env:
|
23 |
+
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
24 |
+
run: |
|
25 |
+
git clone https://huggingface.co/spaces/DucLai/Vietnamese_NER space
|
26 |
+
|
27 |
+
# Đồng bộ code vào repo Space (không copy .git)
|
28 |
+
rsync -av --exclude '.git' ./ space/
|
29 |
+
|
30 |
+
# Xoá file binary ra khỏi Git index trước khi commit
|
31 |
+
cd space
|
32 |
+
find . -type f \( \
|
33 |
+
-iname "*.png" -o \
|
34 |
+
-iname "*.jpg" -o \
|
35 |
+
-iname "*.jpeg" -o \
|
36 |
+
-iname "*.mp4" -o \
|
37 |
+
-iname "*.zip" -o \
|
38 |
+
-iname "*.pth" -o \
|
39 |
+
-iname "*.h5" -o \
|
40 |
+
-iname "*.tar.gz" -o \
|
41 |
+
-iname "*.wav" \
|
42 |
+
\) -exec git rm --cached {} \; || true
|
43 |
+
|
44 |
+
# Commit và push
|
45 |
+
git add .
|
46 |
+
git commit -m "Auto-deploy from GitHub (binary files removed)" || echo "No changes to commit"
|
47 |
+
git push https://DucLai:${HF_TOKEN}@huggingface.co/spaces/DucLai/Vietnamese_NER HEAD
|
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/.gitignore
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Python cache
|
2 |
+
__pycache__/
|
3 |
+
*.pyc
|
4 |
+
*.pyo
|
5 |
+
|
6 |
+
# Hugging Face binary/model outputs
|
7 |
+
*.pth
|
8 |
+
*.h5
|
9 |
+
*.ckpt
|
10 |
+
|
11 |
+
# Dataset and results folders
|
12 |
+
data/
|
13 |
+
results/
|
14 |
+
outputs/
|
15 |
+
logs/
|
16 |
+
|
17 |
+
# Large files
|
18 |
+
*.zip
|
19 |
+
*.tar.gz
|
20 |
+
*.mp4
|
21 |
+
*.png
|
22 |
+
*.jpg
|
23 |
+
*.jpeg
|
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/LICENSE
ADDED
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Apache License
|
2 |
+
Version 2.0, January 2004
|
3 |
+
http://www.apache.org/licenses/
|
4 |
+
|
5 |
+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
6 |
+
|
7 |
+
1. Definitions.
|
8 |
+
|
9 |
+
"License" shall mean the terms and conditions for use, reproduction,
|
10 |
+
and distribution as defined by Sections 1 through 9 of this document.
|
11 |
+
|
12 |
+
"Licensor" shall mean the copyright owner or entity authorized by
|
13 |
+
the copyright owner that is granting the License.
|
14 |
+
|
15 |
+
"Legal Entity" shall mean the union of the acting entity and all
|
16 |
+
other entities that control, are controlled by, or are under common
|
17 |
+
control with that entity. For the purposes of this definition,
|
18 |
+
"control" means (i) the power, direct or indirect, to cause the
|
19 |
+
direction or management of such entity, whether by contract or
|
20 |
+
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
21 |
+
outstanding shares, or (iii) beneficial ownership of such entity.
|
22 |
+
|
23 |
+
"You" (or "Your") shall mean an individual or Legal Entity
|
24 |
+
exercising permissions granted by this License.
|
25 |
+
|
26 |
+
"Source" form shall mean the preferred form for making modifications,
|
27 |
+
including but not limited to software source code, documentation
|
28 |
+
source, and configuration files.
|
29 |
+
|
30 |
+
"Object" form shall mean any form resulting from mechanical
|
31 |
+
transformation or translation of a Source form, including but
|
32 |
+
not limited to compiled object code, generated documentation,
|
33 |
+
and conversions to other media types.
|
34 |
+
|
35 |
+
"Work" shall mean the work of authorship, whether in Source or
|
36 |
+
Object form, made available under the License, as indicated by a
|
37 |
+
copyright notice that is included in or attached to the work
|
38 |
+
(an example is provided in the Appendix below).
|
39 |
+
|
40 |
+
"Derivative Works" shall mean any work, whether in Source or Object
|
41 |
+
form, that is based on (or derived from) the Work and for which the
|
42 |
+
editorial revisions, annotations, elaborations, or other modifications
|
43 |
+
represent, as a whole, an original work of authorship. For the purposes
|
44 |
+
of this License, Derivative Works shall not include works that remain
|
45 |
+
separable from, or merely link (or bind by name) to the interfaces of,
|
46 |
+
the Work and Derivative Works thereof.
|
47 |
+
|
48 |
+
"Contribution" shall mean any work of authorship, including
|
49 |
+
the original version of the Work and any modifications or additions
|
50 |
+
to that Work or Derivative Works thereof, that is intentionally
|
51 |
+
submitted to Licensor for inclusion in the Work by the copyright owner
|
52 |
+
or by an individual or Legal Entity authorized to submit on behalf of
|
53 |
+
the copyright owner. For the purposes of this definition, "submitted"
|
54 |
+
means any form of electronic, verbal, or written communication sent
|
55 |
+
to the Licensor or its representatives, including but not limited to
|
56 |
+
communication on electronic mailing lists, source code control systems,
|
57 |
+
and issue tracking systems that are managed by, or on behalf of, the
|
58 |
+
Licensor for the purpose of discussing and improving the Work, but
|
59 |
+
excluding communication that is conspicuously marked or otherwise
|
60 |
+
designated in writing by the copyright owner as "Not a Contribution."
|
61 |
+
|
62 |
+
"Contributor" shall mean Licensor and any individual or Legal Entity
|
63 |
+
on behalf of whom a Contribution has been received by Licensor and
|
64 |
+
subsequently incorporated within the Work.
|
65 |
+
|
66 |
+
2. Grant of Copyright License. Subject to the terms and conditions of
|
67 |
+
this License, each Contributor hereby grants to You a perpetual,
|
68 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
69 |
+
copyright license to reproduce, prepare Derivative Works of,
|
70 |
+
publicly display, publicly perform, sublicense, and distribute the
|
71 |
+
Work and such Derivative Works in Source or Object form.
|
72 |
+
|
73 |
+
3. Grant of Patent License. Subject to the terms and conditions of
|
74 |
+
this License, each Contributor hereby grants to You a perpetual,
|
75 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
76 |
+
(except as stated in this section) patent license to make, have made,
|
77 |
+
use, offer to sell, sell, import, and otherwise transfer the Work,
|
78 |
+
where such license applies only to those patent claims licensable
|
79 |
+
by such Contributor that are necessarily infringed by their
|
80 |
+
Contribution(s) alone or by combination of their Contribution(s)
|
81 |
+
with the Work to which such Contribution(s) was submitted. If You
|
82 |
+
institute patent litigation against any entity (including a
|
83 |
+
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
84 |
+
or a Contribution incorporated within the Work constitutes direct
|
85 |
+
or contributory patent infringement, then any patent licenses
|
86 |
+
granted to You under this License for that Work shall terminate
|
87 |
+
as of the date such litigation is filed.
|
88 |
+
|
89 |
+
4. Redistribution. You may reproduce and distribute copies of the
|
90 |
+
Work or Derivative Works thereof in any medium, with or without
|
91 |
+
modifications, and in Source or Object form, provided that You
|
92 |
+
meet the following conditions:
|
93 |
+
|
94 |
+
(a) You must give any other recipients of the Work or
|
95 |
+
Derivative Works a copy of this License; and
|
96 |
+
|
97 |
+
(b) You must cause any modified files to carry prominent notices
|
98 |
+
stating that You changed the files; and
|
99 |
+
|
100 |
+
(c) You must retain, in the Source form of any Derivative Works
|
101 |
+
that You distribute, all copyright, patent, trademark, and
|
102 |
+
attribution notices from the Source form of the Work,
|
103 |
+
excluding those notices that do not pertain to any part of
|
104 |
+
the Derivative Works; and
|
105 |
+
|
106 |
+
(d) If the Work includes a "NOTICE" text file as part of its
|
107 |
+
distribution, then any Derivative Works that You distribute must
|
108 |
+
include a readable copy of the attribution notices contained
|
109 |
+
within such NOTICE file, excluding those notices that do not
|
110 |
+
pertain to any part of the Derivative Works, in at least one
|
111 |
+
of the following places: within a NOTICE text file distributed
|
112 |
+
as part of the Derivative Works; within the Source form or
|
113 |
+
documentation, if provided along with the Derivative Works; or,
|
114 |
+
within a display generated by the Derivative Works, if and
|
115 |
+
wherever such third-party notices normally appear. The contents
|
116 |
+
of the NOTICE file are for informational purposes only and
|
117 |
+
do not modify the License. You may add Your own attribution
|
118 |
+
notices within Derivative Works that You distribute, alongside
|
119 |
+
or as an addendum to the NOTICE text from the Work, provided
|
120 |
+
that such additional attribution notices cannot be construed
|
121 |
+
as modifying the License.
|
122 |
+
|
123 |
+
You may add Your own copyright statement to Your modifications and
|
124 |
+
may provide additional or different license terms and conditions
|
125 |
+
for use, reproduction, or distribution of Your modifications, or
|
126 |
+
for any such Derivative Works as a whole, provided Your use,
|
127 |
+
reproduction, and distribution of the Work otherwise complies with
|
128 |
+
the conditions stated in this License.
|
129 |
+
|
130 |
+
5. Submission of Contributions. Unless You explicitly state otherwise,
|
131 |
+
any Contribution intentionally submitted for inclusion in the Work
|
132 |
+
by You to the Licensor shall be under the terms and conditions of
|
133 |
+
this License, without any additional terms or conditions.
|
134 |
+
Notwithstanding the above, nothing herein shall supersede or modify
|
135 |
+
the terms of any separate license agreement you may have executed
|
136 |
+
with Licensor regarding such Contributions.
|
137 |
+
|
138 |
+
6. Trademarks. This License does not grant permission to use the trade
|
139 |
+
names, trademarks, service marks, or product names of the Licensor,
|
140 |
+
except as required for reasonable and customary use in describing the
|
141 |
+
origin of the Work and reproducing the content of the NOTICE file.
|
142 |
+
|
143 |
+
7. Disclaimer of Warranty. Unless required by applicable law or
|
144 |
+
agreed to in writing, Licensor provides the Work (and each
|
145 |
+
Contributor provides its Contributions) on an "AS IS" BASIS,
|
146 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
147 |
+
implied, including, without limitation, any warranties or conditions
|
148 |
+
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
149 |
+
PARTICULAR PURPOSE. You are solely responsible for determining the
|
150 |
+
appropriateness of using or redistributing the Work and assume any
|
151 |
+
risks associated with Your exercise of permissions under this License.
|
152 |
+
|
153 |
+
8. Limitation of Liability. In no event and under no legal theory,
|
154 |
+
whether in tort (including negligence), contract, or otherwise,
|
155 |
+
unless required by applicable law (such as deliberate and grossly
|
156 |
+
negligent acts) or agreed to in writing, shall any Contributor be
|
157 |
+
liable to You for damages, including any direct, indirect, special,
|
158 |
+
incidental, or consequential damages of any character arising as a
|
159 |
+
result of this License or out of the use or inability to use the
|
160 |
+
Work (including but not limited to damages for loss of goodwill,
|
161 |
+
work stoppage, computer failure or malfunction, or any and all
|
162 |
+
other commercial damages or losses), even if such Contributor
|
163 |
+
has been advised of the possibility of such damages.
|
164 |
+
|
165 |
+
9. Accepting Warranty or Additional Liability. While redistributing
|
166 |
+
the Work or Derivative Works thereof, You may choose to offer,
|
167 |
+
and charge a fee for, acceptance of support, warranty, indemnity,
|
168 |
+
or other liability obligations and/or rights consistent with this
|
169 |
+
License. However, in accepting such obligations, You may act only
|
170 |
+
on Your own behalf and on Your sole responsibility, not on behalf
|
171 |
+
of any other Contributor, and only if You agree to indemnify,
|
172 |
+
defend, and hold each Contributor harmless for any liability
|
173 |
+
incurred by, or claims asserted against, such Contributor by reason
|
174 |
+
of your accepting any such warranty or additional liability.
|
175 |
+
|
176 |
+
END OF TERMS AND CONDITIONS
|
177 |
+
|
178 |
+
APPENDIX: How to apply the Apache License to your work.
|
179 |
+
|
180 |
+
To apply the Apache License to your work, attach the following
|
181 |
+
boilerplate notice, with the fields enclosed by brackets "[]"
|
182 |
+
replaced with your own identifying information. (Don't include
|
183 |
+
the brackets!) The text should be enclosed in the appropriate
|
184 |
+
comment syntax for the file format. We also recommend that a
|
185 |
+
file or class name and description of purpose be included on the
|
186 |
+
same "printed page" as the copyright notice for easier
|
187 |
+
identification within third-party archives.
|
188 |
+
|
189 |
+
Copyright [yyyy] [name of copyright owner]
|
190 |
+
|
191 |
+
Licensed under the Apache License, Version 2.0 (the "License");
|
192 |
+
you may not use this file except in compliance with the License.
|
193 |
+
You may obtain a copy of the License at
|
194 |
+
|
195 |
+
http://www.apache.org/licenses/LICENSE-2.0
|
196 |
+
|
197 |
+
Unless required by applicable law or agreed to in writing, software
|
198 |
+
distributed under the License is distributed on an "AS IS" BASIS,
|
199 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
200 |
+
See the License for the specific language governing permissions and
|
201 |
+
limitations under the License.
|
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/configs/config.yaml
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
ECHO is on.
|
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/environment.yml
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: vnner
|
2 |
+
channels:
|
3 |
+
- defaults
|
4 |
+
- conda-forge
|
5 |
+
dependencies:
|
6 |
+
- python=3.10
|
7 |
+
- pip
|
8 |
+
- pip:
|
9 |
+
- -r requirements.txt
|
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/models/best_epoch_16.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:622cac3a55eec6a245f70c2ec7591d8fbfa8c18e13db7555915405fb57b145a0
|
3 |
+
size 24130
|
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/notebooks/Duc_Notebook.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/notebooks/Kien_RF_lightgbm.ipynb
ADDED
@@ -0,0 +1,741 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"id": "10ec017cb658e125",
|
6 |
+
"metadata": {
|
7 |
+
"ExecuteTime": {
|
8 |
+
"end_time": "2025-06-11T00:21:33.244538Z",
|
9 |
+
"start_time": "2025-06-11T00:21:05.317283Z"
|
10 |
+
}
|
11 |
+
},
|
12 |
+
"source": [
|
13 |
+
"import pandas as pd\n",
|
14 |
+
"\n",
|
15 |
+
"splits = {'train': 'data/train-00000-of-00001-b0417886a268b83a.parquet', 'valid': 'data/valid-00000-of-00001-846411c236133ba3.parquet'}\n",
|
16 |
+
"df_train = pd.read_parquet(\"hf://datasets/datnth1709/VLSP2016-NER-data/\" + splits[\"train\"])\n",
|
17 |
+
"df_valid = pd.read_parquet(\"hf://datasets/datnth1709/VLSP2016-NER-data/\" + splits[\"valid\"])\n",
|
18 |
+
"df = pd.concat([df_train, df_valid]).reset_index(drop=True)"
|
19 |
+
],
|
20 |
+
"outputs": [],
|
21 |
+
"execution_count": 1
|
22 |
+
},
|
23 |
+
{
|
24 |
+
"cell_type": "code",
|
25 |
+
"id": "c533c55a2ad7b16e",
|
26 |
+
"metadata": {
|
27 |
+
"ExecuteTime": {
|
28 |
+
"end_time": "2025-06-11T00:21:33.499341Z",
|
29 |
+
"start_time": "2025-06-11T00:21:33.262933Z"
|
30 |
+
}
|
31 |
+
},
|
32 |
+
"source": [
|
33 |
+
"# Tạo thêm các cột khác\n",
|
34 |
+
"def join_tokens(tokens):\n",
|
35 |
+
" text = ' '.join(tokens)\n",
|
36 |
+
" return text\n",
|
37 |
+
"\n",
|
38 |
+
"def reform_raw_text(tokens):\n",
|
39 |
+
" text = ' '.join(tokens)\n",
|
40 |
+
" return text.replace(\"_\", \" \")\n",
|
41 |
+
"\n",
|
42 |
+
"def label(x):\n",
|
43 |
+
" return [id_tag[int(i)] for i in x]\n",
|
44 |
+
"\n",
|
45 |
+
"def replace_7_8(lst):\n",
|
46 |
+
" return [0 if x in (7, 8) else x for x in lst]\n",
|
47 |
+
"\n",
|
48 |
+
"\n",
|
49 |
+
"tag_id = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6}\n",
|
50 |
+
"id_tag = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}\n",
|
51 |
+
"\n",
|
52 |
+
"\n",
|
53 |
+
"df['ner_tags'] = df['ner_tags'].apply(replace_7_8)\n",
|
54 |
+
"df['text_withseg'] = df['tokens'].apply(join_tokens)\n",
|
55 |
+
"df['text_raw'] = df['tokens'].apply(reform_raw_text)\n",
|
56 |
+
"df[\"ner_labels\"] = df.ner_tags.apply(label)\n",
|
57 |
+
"df.columns = ['tokens', 'id', 'seg_text', 'raw_text', 'labels']\n",
|
58 |
+
"df\n"
|
59 |
+
],
|
60 |
+
"outputs": [
|
61 |
+
{
|
62 |
+
"data": {
|
63 |
+
"text/plain": [
|
64 |
+
" tokens \\\n",
|
65 |
+
"0 [Không_khí, thật, náo_nhiệt, .] \n",
|
66 |
+
"1 [Chị, Lãnh, và, Xăng, ra, đi, ,, mình, đứng, n... \n",
|
67 |
+
"2 [Suy_tính, mãi, ,, khóc, mãi, rồi, Phúc, lấy, ... \n",
|
68 |
+
"3 [Hoà, bảo, hồi, mới, qua, đâu, có, biết, nấu_n... \n",
|
69 |
+
"4 [Nhật_ký, của, thuyền_viên, .] \n",
|
70 |
+
"... ... \n",
|
71 |
+
"16853 [Nghe, thấy, đã, ghê_ghê, nhưng, Nhiêu, chưa, ... \n",
|
72 |
+
"16854 [Nhưng, mọi, chuyện, không, dừng, ở, đó, .] \n",
|
73 |
+
"16855 [Hoà, bảo, thời_gian, đầu, mặc_cảm, lắm, ,, ở,... \n",
|
74 |
+
"16856 [Biết_bao, người, đã, tình_nguyện, hiến_dâng, ... \n",
|
75 |
+
"16857 [Trên, đây, mới, là, “, thành_tích, ”, tiêu, t... \n",
|
76 |
+
"\n",
|
77 |
+
" id \\\n",
|
78 |
+
"0 [0, 0, 0, 0] \n",
|
79 |
+
"1 [0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n",
|
80 |
+
"2 [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ... \n",
|
81 |
+
"3 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, ... \n",
|
82 |
+
"4 [0, 0, 0, 0] \n",
|
83 |
+
"... ... \n",
|
84 |
+
"16853 [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, ... \n",
|
85 |
+
"16854 [0, 0, 0, 0, 0, 0, 0, 0] \n",
|
86 |
+
"16855 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n",
|
87 |
+
"16856 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] \n",
|
88 |
+
"16857 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n",
|
89 |
+
"\n",
|
90 |
+
" seg_text \\\n",
|
91 |
+
"0 Không_khí thật náo_nhiệt . \n",
|
92 |
+
"1 Chị Lãnh và Xăng ra đi , mình đứng nhìn hai ch... \n",
|
93 |
+
"2 Suy_tính mãi , khóc mãi rồi Phúc lấy ra tờ giấ... \n",
|
94 |
+
"3 Hoà bảo hồi mới qua đâu có biết nấu_nướng gì ,... \n",
|
95 |
+
"4 Nhật_ký của thuyền_viên . \n",
|
96 |
+
"... ... \n",
|
97 |
+
"16853 Nghe thấy đã ghê_ghê nhưng Nhiêu chưa được tườ... \n",
|
98 |
+
"16854 Nhưng mọi chuyện không dừng ở đó . \n",
|
99 |
+
"16855 Hoà bảo thời_gian đầu mặc_cảm lắm , ở trong nh... \n",
|
100 |
+
"16856 Biết_bao người đã tình_nguyện hiến_dâng cả cuộ... \n",
|
101 |
+
"16857 Trên đây mới là “ thành_tích ” tiêu tiền của m... \n",
|
102 |
+
"\n",
|
103 |
+
" raw_text \\\n",
|
104 |
+
"0 Không khí thật náo nhiệt . \n",
|
105 |
+
"1 Chị Lãnh và Xăng ra đi , mình đứng nhìn hai ch... \n",
|
106 |
+
"2 Suy tính mãi , khóc mãi rồi Phúc lấy ra tờ giấ... \n",
|
107 |
+
"3 Hoà bảo hồi mới qua đâu có biết nấu nướng gì ,... \n",
|
108 |
+
"4 Nhật ký của thuyền viên . \n",
|
109 |
+
"... ... \n",
|
110 |
+
"16853 Nghe thấy đã ghê ghê nhưng Nhiêu chưa được tườ... \n",
|
111 |
+
"16854 Nhưng mọi chuyện không dừng ở đó . \n",
|
112 |
+
"16855 Hoà bảo thời gian đầu mặc cảm lắm , ở trong nh... \n",
|
113 |
+
"16856 Biết bao người đã tình nguyện hiến dâng cả cuộ... \n",
|
114 |
+
"16857 Trên đây mới là “ thành tích ” tiêu tiền của m... \n",
|
115 |
+
"\n",
|
116 |
+
" labels \n",
|
117 |
+
"0 [O, O, O, O] \n",
|
118 |
+
"1 [O, B-PER, O, B-PER, O, O, O, O, O, O, O, O, O... \n",
|
119 |
+
"2 [O, O, O, O, O, O, B-PER, O, O, O, O, O, O, O,... \n",
|
120 |
+
"3 [B-PER, O, O, O, O, O, O, O, O, O, O, O, O, B-... \n",
|
121 |
+
"4 [O, O, O, O] \n",
|
122 |
+
"... ... \n",
|
123 |
+
"16853 [O, O, O, O, O, B-PER, O, O, O, O, O, O, O, O,... \n",
|
124 |
+
"16854 [O, O, O, O, O, O, O, O] \n",
|
125 |
+
"16855 [B-PER, O, O, O, O, O, O, O, O, O, O, O, O, O,... \n",
|
126 |
+
"16856 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O] \n",
|
127 |
+
"16857 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ... \n",
|
128 |
+
"\n",
|
129 |
+
"[16858 rows x 5 columns]"
|
130 |
+
],
|
131 |
+
"text/html": [
|
132 |
+
"<div>\n",
|
133 |
+
"<style scoped>\n",
|
134 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
135 |
+
" vertical-align: middle;\n",
|
136 |
+
" }\n",
|
137 |
+
"\n",
|
138 |
+
" .dataframe tbody tr th {\n",
|
139 |
+
" vertical-align: top;\n",
|
140 |
+
" }\n",
|
141 |
+
"\n",
|
142 |
+
" .dataframe thead th {\n",
|
143 |
+
" text-align: right;\n",
|
144 |
+
" }\n",
|
145 |
+
"</style>\n",
|
146 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
147 |
+
" <thead>\n",
|
148 |
+
" <tr style=\"text-align: right;\">\n",
|
149 |
+
" <th></th>\n",
|
150 |
+
" <th>tokens</th>\n",
|
151 |
+
" <th>id</th>\n",
|
152 |
+
" <th>seg_text</th>\n",
|
153 |
+
" <th>raw_text</th>\n",
|
154 |
+
" <th>labels</th>\n",
|
155 |
+
" </tr>\n",
|
156 |
+
" </thead>\n",
|
157 |
+
" <tbody>\n",
|
158 |
+
" <tr>\n",
|
159 |
+
" <th>0</th>\n",
|
160 |
+
" <td>[Không_khí, thật, náo_nhiệt, .]</td>\n",
|
161 |
+
" <td>[0, 0, 0, 0]</td>\n",
|
162 |
+
" <td>Không_khí thật náo_nhiệt .</td>\n",
|
163 |
+
" <td>Không khí thật náo nhiệt .</td>\n",
|
164 |
+
" <td>[O, O, O, O]</td>\n",
|
165 |
+
" </tr>\n",
|
166 |
+
" <tr>\n",
|
167 |
+
" <th>1</th>\n",
|
168 |
+
" <td>[Chị, Lãnh, và, Xăng, ra, đi, ,, mình, đứng, n...</td>\n",
|
169 |
+
" <td>[0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n",
|
170 |
+
" <td>Chị Lãnh và Xăng ra đi , mình đứng nhìn hai ch...</td>\n",
|
171 |
+
" <td>Chị Lãnh và Xăng ra đi , mình đứng nhìn hai ch...</td>\n",
|
172 |
+
" <td>[O, B-PER, O, B-PER, O, O, O, O, O, O, O, O, O...</td>\n",
|
173 |
+
" </tr>\n",
|
174 |
+
" <tr>\n",
|
175 |
+
" <th>2</th>\n",
|
176 |
+
" <td>[Suy_tính, mãi, ,, khóc, mãi, rồi, Phúc, lấy, ...</td>\n",
|
177 |
+
" <td>[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n",
|
178 |
+
" <td>Suy_tính mãi , khóc mãi rồi Phúc lấy ra tờ giấ...</td>\n",
|
179 |
+
" <td>Suy tính mãi , khóc mãi rồi Phúc lấy ra tờ giấ...</td>\n",
|
180 |
+
" <td>[O, O, O, O, O, O, B-PER, O, O, O, O, O, O, O,...</td>\n",
|
181 |
+
" </tr>\n",
|
182 |
+
" <tr>\n",
|
183 |
+
" <th>3</th>\n",
|
184 |
+
" <td>[Hoà, bảo, hồi, mới, qua, đâu, có, biết, nấu_n...</td>\n",
|
185 |
+
" <td>[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, ...</td>\n",
|
186 |
+
" <td>Hoà bảo hồi mới qua đâu có biết nấu_nướng gì ,...</td>\n",
|
187 |
+
" <td>Hoà bảo hồi mới qua đâu có biết nấu nướng gì ,...</td>\n",
|
188 |
+
" <td>[B-PER, O, O, O, O, O, O, O, O, O, O, O, O, B-...</td>\n",
|
189 |
+
" </tr>\n",
|
190 |
+
" <tr>\n",
|
191 |
+
" <th>4</th>\n",
|
192 |
+
" <td>[Nhật_ký, của, thuyền_viên, .]</td>\n",
|
193 |
+
" <td>[0, 0, 0, 0]</td>\n",
|
194 |
+
" <td>Nhật_ký của thuyền_viên .</td>\n",
|
195 |
+
" <td>Nhật ký của thuyền viên .</td>\n",
|
196 |
+
" <td>[O, O, O, O]</td>\n",
|
197 |
+
" </tr>\n",
|
198 |
+
" <tr>\n",
|
199 |
+
" <th>...</th>\n",
|
200 |
+
" <td>...</td>\n",
|
201 |
+
" <td>...</td>\n",
|
202 |
+
" <td>...</td>\n",
|
203 |
+
" <td>...</td>\n",
|
204 |
+
" <td>...</td>\n",
|
205 |
+
" </tr>\n",
|
206 |
+
" <tr>\n",
|
207 |
+
" <th>16853</th>\n",
|
208 |
+
" <td>[Nghe, thấy, đã, ghê_ghê, nhưng, Nhiêu, chưa, ...</td>\n",
|
209 |
+
" <td>[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...</td>\n",
|
210 |
+
" <td>Nghe thấy đã ghê_ghê nhưng Nhiêu chưa được tườ...</td>\n",
|
211 |
+
" <td>Nghe thấy đã ghê ghê nhưng Nhiêu chưa được tườ...</td>\n",
|
212 |
+
" <td>[O, O, O, O, O, B-PER, O, O, O, O, O, O, O, O,...</td>\n",
|
213 |
+
" </tr>\n",
|
214 |
+
" <tr>\n",
|
215 |
+
" <th>16854</th>\n",
|
216 |
+
" <td>[Nhưng, mọi, chuyện, không, dừng, ở, đó, .]</td>\n",
|
217 |
+
" <td>[0, 0, 0, 0, 0, 0, 0, 0]</td>\n",
|
218 |
+
" <td>Nhưng mọi chuyện không dừng ở đó .</td>\n",
|
219 |
+
" <td>Nhưng mọi chuyện không dừng ở đó .</td>\n",
|
220 |
+
" <td>[O, O, O, O, O, O, O, O]</td>\n",
|
221 |
+
" </tr>\n",
|
222 |
+
" <tr>\n",
|
223 |
+
" <th>16855</th>\n",
|
224 |
+
" <td>[Hoà, bảo, thời_gian, đầu, mặc_cảm, lắm, ,, ở,...</td>\n",
|
225 |
+
" <td>[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n",
|
226 |
+
" <td>Hoà bảo thời_gian đầu mặc_cảm lắm , ở trong nh...</td>\n",
|
227 |
+
" <td>Hoà bảo thời gian đầu mặc cảm lắm , ở trong nh...</td>\n",
|
228 |
+
" <td>[B-PER, O, O, O, O, O, O, O, O, O, O, O, O, O,...</td>\n",
|
229 |
+
" </tr>\n",
|
230 |
+
" <tr>\n",
|
231 |
+
" <th>16856</th>\n",
|
232 |
+
" <td>[Biết_bao, người, đã, tình_nguyện, hiến_dâng, ...</td>\n",
|
233 |
+
" <td>[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]</td>\n",
|
234 |
+
" <td>Biết_bao người đã tình_nguyện hiến_dâng cả cuộ...</td>\n",
|
235 |
+
" <td>Biết bao người đã tình nguyện hiến dâng cả cuộ...</td>\n",
|
236 |
+
" <td>[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]</td>\n",
|
237 |
+
" </tr>\n",
|
238 |
+
" <tr>\n",
|
239 |
+
" <th>16857</th>\n",
|
240 |
+
" <td>[Trên, đây, mới, là, “, thành_tích, ”, tiêu, t...</td>\n",
|
241 |
+
" <td>[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n",
|
242 |
+
" <td>Trên đây mới là “ thành_tích ” tiêu tiền của m...</td>\n",
|
243 |
+
" <td>Trên đây mới là “ thành tích ” tiêu tiền của m...</td>\n",
|
244 |
+
" <td>[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...</td>\n",
|
245 |
+
" </tr>\n",
|
246 |
+
" </tbody>\n",
|
247 |
+
"</table>\n",
|
248 |
+
"<p>16858 rows × 5 columns</p>\n",
|
249 |
+
"</div>"
|
250 |
+
]
|
251 |
+
},
|
252 |
+
"execution_count": 2,
|
253 |
+
"metadata": {},
|
254 |
+
"output_type": "execute_result"
|
255 |
+
}
|
256 |
+
],
|
257 |
+
"execution_count": 2
|
258 |
+
},
|
259 |
+
{
|
260 |
+
"cell_type": "code",
|
261 |
+
"id": "14d9b9fae58b7173",
|
262 |
+
"metadata": {
|
263 |
+
"ExecuteTime": {
|
264 |
+
"end_time": "2025-06-11T00:21:59.373985Z",
|
265 |
+
"start_time": "2025-06-11T00:21:34.524025Z"
|
266 |
+
}
|
267 |
+
},
|
268 |
+
"source": [
|
269 |
+
"import torch\n",
|
270 |
+
"from transformers import AutoTokenizer, AutoModel\n",
|
271 |
+
"from tqdm import tqdm\n",
|
272 |
+
"\n",
|
273 |
+
"# Load PhoBERT tokenizer và model\n",
|
274 |
+
"tokenizer = AutoTokenizer.from_pretrained(\"vinai/phobert-base\", use_fast=False)\n",
|
275 |
+
"model = AutoModel.from_pretrained(\"vinai/phobert-base\")\n",
|
276 |
+
"model.eval()"
|
277 |
+
],
|
278 |
+
"outputs": [
|
279 |
+
{
|
280 |
+
"name": "stdout",
|
281 |
+
"output_type": "stream",
|
282 |
+
"text": [
|
283 |
+
"cuda\n"
|
284 |
+
]
|
285 |
+
},
|
286 |
+
{
|
287 |
+
"data": {
|
288 |
+
"text/plain": [
|
289 |
+
"RobertaModel(\n",
|
290 |
+
" (embeddings): RobertaEmbeddings(\n",
|
291 |
+
" (word_embeddings): Embedding(64001, 768, padding_idx=1)\n",
|
292 |
+
" (position_embeddings): Embedding(258, 768, padding_idx=1)\n",
|
293 |
+
" (token_type_embeddings): Embedding(1, 768)\n",
|
294 |
+
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
|
295 |
+
" (dropout): Dropout(p=0.1, inplace=False)\n",
|
296 |
+
" )\n",
|
297 |
+
" (encoder): RobertaEncoder(\n",
|
298 |
+
" (layer): ModuleList(\n",
|
299 |
+
" (0-11): 12 x RobertaLayer(\n",
|
300 |
+
" (attention): RobertaAttention(\n",
|
301 |
+
" (self): RobertaSdpaSelfAttention(\n",
|
302 |
+
" (query): Linear(in_features=768, out_features=768, bias=True)\n",
|
303 |
+
" (key): Linear(in_features=768, out_features=768, bias=True)\n",
|
304 |
+
" (value): Linear(in_features=768, out_features=768, bias=True)\n",
|
305 |
+
" (dropout): Dropout(p=0.1, inplace=False)\n",
|
306 |
+
" )\n",
|
307 |
+
" (output): RobertaSelfOutput(\n",
|
308 |
+
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
|
309 |
+
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
|
310 |
+
" (dropout): Dropout(p=0.1, inplace=False)\n",
|
311 |
+
" )\n",
|
312 |
+
" )\n",
|
313 |
+
" (intermediate): RobertaIntermediate(\n",
|
314 |
+
" (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
|
315 |
+
" (intermediate_act_fn): GELUActivation()\n",
|
316 |
+
" )\n",
|
317 |
+
" (output): RobertaOutput(\n",
|
318 |
+
" (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
|
319 |
+
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
|
320 |
+
" (dropout): Dropout(p=0.1, inplace=False)\n",
|
321 |
+
" )\n",
|
322 |
+
" )\n",
|
323 |
+
" )\n",
|
324 |
+
" )\n",
|
325 |
+
" (pooler): RobertaPooler(\n",
|
326 |
+
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
|
327 |
+
" (activation): Tanh()\n",
|
328 |
+
" )\n",
|
329 |
+
")"
|
330 |
+
]
|
331 |
+
},
|
332 |
+
"execution_count": 3,
|
333 |
+
"metadata": {},
|
334 |
+
"output_type": "execute_result"
|
335 |
+
}
|
336 |
+
],
|
337 |
+
"execution_count": 3
|
338 |
+
},
|
339 |
+
{
|
340 |
+
"cell_type": "code",
|
341 |
+
"id": "a47ec382649c3036",
|
342 |
+
"metadata": {
|
343 |
+
"ExecuteTime": {
|
344 |
+
"end_time": "2025-06-11T00:23:23.888583Z",
|
345 |
+
"start_time": "2025-06-11T00:23:23.885204Z"
|
346 |
+
}
|
347 |
+
},
|
348 |
+
"source": [
|
349 |
+
"# Hàm gộp các embedding vectors của token bị tách ra khi qua SentencePiece\n",
|
350 |
+
"def group_embeddings(tokens, embeddings):\n",
|
351 |
+
" word_embeddings = []\n",
|
352 |
+
" current_vecs = []\n",
|
353 |
+
"\n",
|
354 |
+
" for token, emb in zip(tokens, embeddings):\n",
|
355 |
+
" if token in [\"<s>\", \"</s>\"]:\n",
|
356 |
+
" continue\n",
|
357 |
+
"\n",
|
358 |
+
" if token.endswith(\"@@\"):\n",
|
359 |
+
" current_vecs.append(emb)\n",
|
360 |
+
" else:\n",
|
361 |
+
" current_vecs.append(emb)\n",
|
362 |
+
" word_emb = torch.mean(torch.stack(current_vecs), dim=0)\n",
|
363 |
+
" word_embeddings.append(word_emb)\n",
|
364 |
+
" current_vecs = []\n",
|
365 |
+
"\n",
|
366 |
+
" if current_vecs: # Trong trường hợp sót lại cuối câu\n",
|
367 |
+
" word_emb = torch.mean(torch.stack(current_vecs), dim=0)\n",
|
368 |
+
" word_embeddings.append(word_emb)\n",
|
369 |
+
"\n",
|
370 |
+
" return word_embeddings"
|
371 |
+
],
|
372 |
+
"outputs": [],
|
373 |
+
"execution_count": 4
|
374 |
+
},
|
375 |
+
{
|
376 |
+
"cell_type": "code",
|
377 |
+
"id": "f8c0ad89ae81b0c",
|
378 |
+
"metadata": {
|
379 |
+
"ExecuteTime": {
|
380 |
+
"end_time": "2025-06-11T00:25:52.567135Z",
|
381 |
+
"start_time": "2025-06-11T00:23:56.155322Z"
|
382 |
+
}
|
383 |
+
},
|
384 |
+
"source": [
|
385 |
+
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
|
386 |
+
"model.to(device)\n",
|
387 |
+
"\n",
|
388 |
+
"all_embeddings = [] # list of [seq_len_i, 768] tensors\n",
|
389 |
+
"all_labels = [] # list of [seq_len_i,] tensors\n",
|
390 |
+
"len_em = []\n",
|
391 |
+
"\n",
|
392 |
+
"# count = 0\n",
|
393 |
+
"\n",
|
394 |
+
"for i, row in df.iterrows():\n",
|
395 |
+
"\n",
|
396 |
+
" # count += 1\n",
|
397 |
+
" # if count == 500:\n",
|
398 |
+
" # break\n",
|
399 |
+
"\n",
|
400 |
+
" # Truy cập phần tử từng dòng\n",
|
401 |
+
" sentence = row['seg_text']\n",
|
402 |
+
" gold_labels = row[\"id\"]\n",
|
403 |
+
"\n",
|
404 |
+
" # Cho sentence đi qua SentencePiece\n",
|
405 |
+
" input_ids = tokenizer.encode(sentence, return_tensors=\"pt\").to(device)\n",
|
406 |
+
"\n",
|
407 |
+
" tokens = tokenizer.convert_ids_to_tokens(input_ids[0].to(device))\n",
|
408 |
+
"\n",
|
409 |
+
" # Encode tạo embeddings\n",
|
410 |
+
" with torch.no_grad():\n",
|
411 |
+
" outputs = model(input_ids)\n",
|
412 |
+
" last_hidden_state = outputs.last_hidden_state.squeeze(0)\n",
|
413 |
+
"\n",
|
414 |
+
" # Gộp các embeddings đã bị tách khi đi qua SentencePiece\n",
|
415 |
+
" word_embeds = group_embeddings(tokens, last_hidden_state)\n",
|
416 |
+
"\n",
|
417 |
+
" # Kiểm tra số lượng embeddings và số lượng labels\n",
|
418 |
+
" if len(word_embeds) != len(gold_labels):\n",
|
419 |
+
" continue\n",
|
420 |
+
"\n",
|
421 |
+
" # Thêm vào list tổng / Tới đây là data đã sẵn sàng cho training\n",
|
422 |
+
" all_embeddings.append(torch.stack(word_embeds))\n",
|
423 |
+
" all_labels.append(torch.tensor(gold_labels))"
|
424 |
+
],
|
425 |
+
"outputs": [],
|
426 |
+
"execution_count": 6
|
427 |
+
},
|
428 |
+
{
|
429 |
+
"metadata": {
|
430 |
+
"ExecuteTime": {
|
431 |
+
"end_time": "2025-06-11T00:35:23.255306Z",
|
432 |
+
"start_time": "2025-06-11T00:35:23.252026Z"
|
433 |
+
}
|
434 |
+
},
|
435 |
+
"cell_type": "code",
|
436 |
+
"source": "# We skip 43 data since they aren't convertable",
|
437 |
+
"id": "c3e406ad994802be",
|
438 |
+
"outputs": [
|
439 |
+
{
|
440 |
+
"name": "stdout",
|
441 |
+
"output_type": "stream",
|
442 |
+
"text": [
|
443 |
+
"-43\n"
|
444 |
+
]
|
445 |
+
}
|
446 |
+
],
|
447 |
+
"execution_count": 15
|
448 |
+
},
|
449 |
+
{
|
450 |
+
"cell_type": "code",
|
451 |
+
"id": "cadc3a861025b3b9",
|
452 |
+
"metadata": {
|
453 |
+
"ExecuteTime": {
|
454 |
+
"end_time": "2025-06-11T00:36:18.857012Z",
|
455 |
+
"start_time": "2025-06-11T00:36:08.257408Z"
|
456 |
+
}
|
457 |
+
},
|
458 |
+
"source": [
|
459 |
+
"import numpy as np\n",
|
460 |
+
"from sklearn.model_selection import train_test_split\n",
|
461 |
+
"\n",
|
462 |
+
"X_flat = []\n",
|
463 |
+
"y_flat = []\n",
|
464 |
+
"\n",
|
465 |
+
"for emb_seq, label_seq in zip(all_embeddings, all_labels):\n",
|
466 |
+
" for emb, label in zip(emb_seq, label_seq):\n",
|
467 |
+
" X_flat.append(emb.cpu().numpy()) # emb: [768]\n",
|
468 |
+
" y_flat.append(label.item()) # label: int\n",
|
469 |
+
"\n",
|
470 |
+
"X_flat = np.array(X_flat) # [N, 768]\n",
|
471 |
+
"y_flat = np.array(y_flat) # [N]\n"
|
472 |
+
],
|
473 |
+
"outputs": [],
|
474 |
+
"execution_count": 16
|
475 |
+
},
|
476 |
+
{
|
477 |
+
"cell_type": "code",
|
478 |
+
"id": "52a0fe72a50d4f73",
|
479 |
+
"metadata": {
|
480 |
+
"ExecuteTime": {
|
481 |
+
"end_time": "2025-06-11T00:39:58.211159Z",
|
482 |
+
"start_time": "2025-06-11T00:39:58.208074Z"
|
483 |
+
}
|
484 |
+
},
|
485 |
+
"source": [
|
486 |
+
"print(X_flat[0].shape)\n",
|
487 |
+
"print(y_flat.shape)"
|
488 |
+
],
|
489 |
+
"outputs": [
|
490 |
+
{
|
491 |
+
"name": "stdout",
|
492 |
+
"output_type": "stream",
|
493 |
+
"text": [
|
494 |
+
"(768,)\n",
|
495 |
+
"(368172,)\n"
|
496 |
+
]
|
497 |
+
}
|
498 |
+
],
|
499 |
+
"execution_count": 19
|
500 |
+
},
|
501 |
+
{
|
502 |
+
"cell_type": "code",
|
503 |
+
"id": "d6275df555f0c4c3",
|
504 |
+
"metadata": {
|
505 |
+
"ExecuteTime": {
|
506 |
+
"end_time": "2025-06-11T00:42:00.129778Z",
|
507 |
+
"start_time": "2025-06-11T00:42:00.096986Z"
|
508 |
+
}
|
509 |
+
},
|
510 |
+
"source": [
|
511 |
+
"# Kiểm tra độ lệch data\n",
|
512 |
+
"unique_values, counts = np.unique(y_flat, return_counts=True)\n",
|
513 |
+
"\n",
|
514 |
+
"# In ra từng giá trị và số lần xuất hiện\n",
|
515 |
+
"for val, count in zip(unique_values, counts):\n",
|
516 |
+
" print(f\"Label {val}: {count} times\")\n"
|
517 |
+
],
|
518 |
+
"outputs": [
|
519 |
+
{
|
520 |
+
"name": "stdout",
|
521 |
+
"output_type": "stream",
|
522 |
+
"text": [
|
523 |
+
"Label 0: 344986 times\n",
|
524 |
+
"Label 1: 7450 times\n",
|
525 |
+
"Label 2: 3504 times\n",
|
526 |
+
"Label 3: 1204 times\n",
|
527 |
+
"Label 4: 2050 times\n",
|
528 |
+
"Label 5: 6211 times\n",
|
529 |
+
"Label 6: 2767 times\n"
|
530 |
+
]
|
531 |
+
}
|
532 |
+
],
|
533 |
+
"execution_count": 24
|
534 |
+
},
|
535 |
+
{
|
536 |
+
"cell_type": "code",
|
537 |
+
"id": "664020977ba9a1e2",
|
538 |
+
"metadata": {
|
539 |
+
"ExecuteTime": {
|
540 |
+
"end_time": "2025-06-11T00:42:03.350616Z",
|
541 |
+
"start_time": "2025-06-11T00:42:02.915680Z"
|
542 |
+
}
|
543 |
+
},
|
544 |
+
"source": [
|
545 |
+
"X_train, X_test, y_train, y_test = train_test_split(\n",
|
546 |
+
" X_flat, y_flat, test_size=0.2, random_state=42, stratify=y_flat)\n"
|
547 |
+
],
|
548 |
+
"outputs": [],
|
549 |
+
"execution_count": 25
|
550 |
+
},
|
551 |
+
{
|
552 |
+
"cell_type": "code",
|
553 |
+
"id": "d4acda9c7cae3214",
|
554 |
+
"metadata": {
|
555 |
+
"ExecuteTime": {
|
556 |
+
"end_time": "2025-06-11T00:42:25.235471Z",
|
557 |
+
"start_time": "2025-06-11T00:42:16.769480Z"
|
558 |
+
}
|
559 |
+
},
|
560 |
+
"source": [
|
561 |
+
"import lightgbm as lgb\n",
|
562 |
+
"from sklearn.metrics import accuracy_score, f1_score, classification_report\n",
|
563 |
+
"\n",
|
564 |
+
"\n",
|
565 |
+
"# Tạo Dataset cho LightGBM\n",
|
566 |
+
"train_data = lgb.Dataset(X_train, label=y_train)\n",
|
567 |
+
"test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)\n",
|
568 |
+
"\n",
|
569 |
+
"# Cấu hình tham số LightGBM (Random Forest mode)\n",
|
570 |
+
"params = {\n",
|
571 |
+
" \"objective\": \"multiclass\", # nếu multiclass classification\n",
|
572 |
+
" \"num_class\": len(np.unique(y_train)),\n",
|
573 |
+
" \"metric\": \"multi_logloss\",\n",
|
574 |
+
" \"boosting_type\": \"rf\", # random forest mode trong LightGBM\n",
|
575 |
+
" \"num_leaves\": 31,\n",
|
576 |
+
" \"bagging_freq\": 1,\n",
|
577 |
+
" \"bagging_fraction\": 0.8,\n",
|
578 |
+
" \"feature_fraction\": 0.8,\n",
|
579 |
+
" \"bagging_seed\": 42,\n",
|
580 |
+
" \"verbose\": -1,\n",
|
581 |
+
" \"seed\": 42,\n",
|
582 |
+
" \"is_unbalance\": True\n",
|
583 |
+
"}\n",
|
584 |
+
"\n",
|
585 |
+
"\n",
|
586 |
+
"\n",
|
587 |
+
"# Train model, tích hợp wandb callback để log metrics\n",
|
588 |
+
"model = lgb.train(\n",
|
589 |
+
" params,\n",
|
590 |
+
" train_data,\n",
|
591 |
+
" num_boost_round=2,\n",
|
592 |
+
" valid_sets=[train_data, test_data],\n",
|
593 |
+
" valid_names=[\"train\", \"test\"]\n",
|
594 |
+
")\n",
|
595 |
+
"\n",
|
596 |
+
"# Dự đoán trên test set\n",
|
597 |
+
"y_pred_prob = model.predict(X_test)\n",
|
598 |
+
"y_pred = np.argmax(y_pred_prob, axis=1)\n",
|
599 |
+
"\n",
|
600 |
+
"# Ánh xạ số về nhãn tên entity\n",
|
601 |
+
"label_map = {\n",
|
602 |
+
" 0: 'O',\n",
|
603 |
+
" 1: 'B-PER',\n",
|
604 |
+
" 2: 'I-PER',\n",
|
605 |
+
" 3: 'B-ORG',\n",
|
606 |
+
" 4: 'I-ORG',\n",
|
607 |
+
" 5: 'B-LOC',\n",
|
608 |
+
" 6: 'I-LOC'\n",
|
609 |
+
"}\n",
|
610 |
+
"\n",
|
611 |
+
"# Chuyển y_test và y_pred sang nhãn gốc\n",
|
612 |
+
"y_test_labels = [label_map[i] for i in y_test]\n",
|
613 |
+
"y_pred_labels = [label_map[i] for i in y_pred]\n",
|
614 |
+
"\n",
|
615 |
+
"# In classification report với nhãn thật\n",
|
616 |
+
"print(\"\\nClassification Report (theo label gốc):\")\n",
|
617 |
+
"print(classification_report(y_test_labels, y_pred_labels, digits=4))\n",
|
618 |
+
"\n",
|
619 |
+
"\n"
|
620 |
+
],
|
621 |
+
"outputs": [
|
622 |
+
{
|
623 |
+
"name": "stdout",
|
624 |
+
"output_type": "stream",
|
625 |
+
"text": [
|
626 |
+
"\n",
|
627 |
+
"Classification Report (theo label gốc):\n",
|
628 |
+
" precision recall f1-score support\n",
|
629 |
+
"\n",
|
630 |
+
" B-LOC 0.3679 0.5000 0.4239 1242\n",
|
631 |
+
" B-ORG 0.2639 0.3942 0.3161 241\n",
|
632 |
+
" B-PER 0.4395 0.7490 0.5540 1490\n",
|
633 |
+
" I-LOC 0.2321 0.4448 0.3050 553\n",
|
634 |
+
" I-ORG 0.1532 0.2878 0.2000 410\n",
|
635 |
+
" I-PER 0.4304 0.5863 0.4964 701\n",
|
636 |
+
" O 0.9869 0.9478 0.9669 68998\n",
|
637 |
+
"\n",
|
638 |
+
" accuracy 0.9235 73635\n",
|
639 |
+
" macro avg 0.4106 0.5586 0.4660 73635\n",
|
640 |
+
"weighted avg 0.9474 0.9235 0.9336 73635\n",
|
641 |
+
"\n"
|
642 |
+
]
|
643 |
+
}
|
644 |
+
],
|
645 |
+
"execution_count": 26
|
646 |
+
},
|
647 |
+
{
|
648 |
+
"metadata": {
|
649 |
+
"ExecuteTime": {
|
650 |
+
"end_time": "2025-06-11T00:45:00.649942Z",
|
651 |
+
"start_time": "2025-06-11T00:45:00.646595Z"
|
652 |
+
}
|
653 |
+
},
|
654 |
+
"cell_type": "code",
|
655 |
+
"source": "print(model.feature_importance().shape)",
|
656 |
+
"id": "b1cf76bc3e58bc93",
|
657 |
+
"outputs": [
|
658 |
+
{
|
659 |
+
"name": "stdout",
|
660 |
+
"output_type": "stream",
|
661 |
+
"text": [
|
662 |
+
"(768,)\n"
|
663 |
+
]
|
664 |
+
}
|
665 |
+
],
|
666 |
+
"execution_count": 35
|
667 |
+
},
|
668 |
+
{
|
669 |
+
"metadata": {
|
670 |
+
"ExecuteTime": {
|
671 |
+
"end_time": "2025-06-11T00:52:36.844604Z",
|
672 |
+
"start_time": "2025-06-11T00:52:36.827018Z"
|
673 |
+
}
|
674 |
+
},
|
675 |
+
"cell_type": "code",
|
676 |
+
"source": [
|
677 |
+
"correct = 0\n",
|
678 |
+
"for i in range(73635):\n",
|
679 |
+
" if y_pred[i] == y_test[i]:\n",
|
680 |
+
" correct += 1\n",
|
681 |
+
"correct"
|
682 |
+
],
|
683 |
+
"id": "39d391e67a51211c",
|
684 |
+
"outputs": [
|
685 |
+
{
|
686 |
+
"data": {
|
687 |
+
"text/plain": [
|
688 |
+
"68001"
|
689 |
+
]
|
690 |
+
},
|
691 |
+
"execution_count": 58,
|
692 |
+
"metadata": {},
|
693 |
+
"output_type": "execute_result"
|
694 |
+
}
|
695 |
+
],
|
696 |
+
"execution_count": 58
|
697 |
+
},
|
698 |
+
{
|
699 |
+
"metadata": {
|
700 |
+
"ExecuteTime": {
|
701 |
+
"end_time": "2025-06-11T00:57:45.109129Z",
|
702 |
+
"start_time": "2025-06-11T00:57:45.105078Z"
|
703 |
+
}
|
704 |
+
},
|
705 |
+
"cell_type": "code",
|
706 |
+
"source": "print(y_test.shape)",
|
707 |
+
"id": "1a0ba8f0410c5589",
|
708 |
+
"outputs": [
|
709 |
+
{
|
710 |
+
"name": "stdout",
|
711 |
+
"output_type": "stream",
|
712 |
+
"text": [
|
713 |
+
"(73635,)\n"
|
714 |
+
]
|
715 |
+
}
|
716 |
+
],
|
717 |
+
"execution_count": 61
|
718 |
+
}
|
719 |
+
],
|
720 |
+
"metadata": {
|
721 |
+
"kernelspec": {
|
722 |
+
"display_name": "Python 3",
|
723 |
+
"language": "python",
|
724 |
+
"name": "python3"
|
725 |
+
},
|
726 |
+
"language_info": {
|
727 |
+
"codemirror_mode": {
|
728 |
+
"name": "ipython",
|
729 |
+
"version": 2
|
730 |
+
},
|
731 |
+
"file_extension": ".py",
|
732 |
+
"mimetype": "text/x-python",
|
733 |
+
"name": "python",
|
734 |
+
"nbconvert_exporter": "python",
|
735 |
+
"pygments_lexer": "ipython2",
|
736 |
+
"version": "2.7.6"
|
737 |
+
}
|
738 |
+
},
|
739 |
+
"nbformat": 4,
|
740 |
+
"nbformat_minor": 5
|
741 |
+
}
|
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/notebooks/Kien_Rule_base.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/notebooks/Softmax_PhoBERT.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/requirements.txt
ADDED
Binary file (2.43 kB). View file
|
|
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/run.py
ADDED
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from src.preprocessing import download_raw_data, preprocess_data_for_EDA, load_phoBERT_model_and_tokenizer, create_embeddings, split_dataset
|
2 |
+
from src.data_set import NerDataset, collate_fn
|
3 |
+
from src.configs import configs
|
4 |
+
from src.model import CRF_Tagger
|
5 |
+
from src.train import train_model
|
6 |
+
|
7 |
+
import torch
|
8 |
+
from torch.utils.data import DataLoader
|
9 |
+
|
10 |
+
import warnings
|
11 |
+
warnings.filterwarnings("ignore")
|
12 |
+
|
13 |
+
|
14 |
+
def main():
    """Run the full Vietnamese NER pipeline.

    Steps: download VLSP2016, preprocess for EDA, embed sentences with
    PhoBERT, split into train/val/test, build DataLoaders, and train the
    linear+CRF tagger. Artifacts are written under data/.
    """

    # Download VLSP2016 from Hugging Face
    print("Download raw data ...")
    df = download_raw_data()

    # Save raw data.
    # BUG FIX: the old messages used backslashes in non-raw strings, so
    # "data\raw_data.csv" printed a literal carriage return (\r). Forward
    # slashes are portable (Windows accepts them too) and escape-safe.
    df.to_csv("data/raw_data.csv", index=False)
    print("Save at data/raw_data.csv \n")

    # Process data for EDA
    print("Process data for EDA ...")
    df = preprocess_data_for_EDA(df)
    df.to_csv("data/processed_data_EDA.csv", index=False)
    print("Save at data/processed_data_EDA.csv \n")

    # Init PhoBERT tokenizer + encoder and embed the data.
    # Named `phobert` (not `model`) so the encoder is not clobbered by the
    # CRF tagger created below.
    print("Embedding data ...")
    phobert, tokenizer = load_phoBERT_model_and_tokenizer()
    processed_data = create_embeddings(df, phobert, tokenizer)
    torch.save(processed_data, "data/processed_data_full.pt")
    print("Save at data/processed_data_full.pt \n")

    # Split data into train/valid/test
    print("Train/Valid/Test Split ...")
    X_train, Y_train, X_val, Y_val, X_test, Y_test = split_dataset(processed_data)
    print("Done \n")

    # Data augmentation for the training set — not implemented yet.

    # Init DataLoaders; only the training split is shuffled.
    print("Init DataLoader ...")
    datasets = {
        'train': NerDataset(X_train, Y_train),
        'val': NerDataset(X_val, Y_val),
        'test': NerDataset(X_test, Y_test)
    }

    loaders = {
        split: DataLoader(dataset, batch_size=configs["batch_size"],
                          shuffle=(split == 'train'), collate_fn=collate_fn)
        for split, dataset in datasets.items()
    }
    print("Done \n")

    # Init sequence-labelling model (7 tags: O + B/I for PER, ORG, LOC)
    print("Init Model ...")
    NUM_TAGS = 7
    tagger = CRF_Tagger(input_dim=X_train[0].size(1), num_tags=NUM_TAGS)
    optimizer = torch.optim.Adam(tagger.parameters(), lr=configs["learning_rate"])
    print("Done \n")

    # Training
    print("Start training ...")
    train_model(tagger, optimizer, configs, loaders)

if __name__ == "__main__":
    main()
|
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/.gitattributes
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/README.md
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: Vietnamese NER Demo
|
3 |
+
emoji: 🧠
|
4 |
+
colorFrom: indigo
|
5 |
+
colorTo: yellow
|
6 |
+
sdk: streamlit
|
7 |
+
sdk_version: 1.46.1
|
8 |
+
app_file: src/app.py
|
9 |
+
pinned: false
|
10 |
+
---
|
11 |
+
# Vietnamese Named Entity Recognition
|
12 |
+
|
13 |
+
## 🛠️ Set Up Your Environment With Conda
|
14 |
+
|
15 |
+
### Option 1: Using `requirements.txt`
|
16 |
+
|
17 |
+
```bash
|
18 |
+
conda create --name vnner python=3.10
|
19 |
+
conda activate vnner
|
20 |
+
pip install -r requirements.txt
|
21 |
+
```
|
22 |
+
|
23 |
+
### Option 2: Using `environment.yml`
|
24 |
+
|
25 |
+
```bash
|
26 |
+
conda env create -f environment.yml
|
27 |
+
conda activate vnner
|
28 |
+
```
|
29 |
+
|
30 |
+
## Run
|
31 |
+
```bash
|
32 |
+
python run.py
|
33 |
+
```
|
34 |
+
---
|
35 |
+
|
36 |
+
## 📂 Project Structure
|
37 |
+
|
38 |
+
```
|
39 |
+
my_ai_project/
|
40 |
+
│
|
41 |
+
├── data/
|
42 |
+
│ ├── raw_data.csv # Dữ liệu gốc
|
43 |
+
│ ├── processed_data_EDA.csv # Dữ liệu sau khi tiền xử lý
|
44 |
+
│ └── processed_data_full.pt # Dữ liệu sẵn sàng training
|
45 |
+
│
|
46 |
+
├── notebooks/ # Thử nghiệm và khám phá dữ liệu
|
47 |
+
│ ├── Duc_Notebook.ipynb # CRF + RandomForest
|
48 |
+
│ ├── Softmax_PhoBERT.ipynb # Softmax
|
49 |
+
│
|
50 |
+
├── src/ # Mã nguồn chính của dự án
|
51 |
+
│ ├── __init__.py
|
52 |
+
│ ├── data_loader.py # Nạp và xử lý dữ liệu
|
53 |
+
│ ├── preprocessing.py # Hàm tiền xử lý dữ liệu
|
54 |
+
│ ├── model.py # Định nghĩa kiến trúc mô hình
|
55 |
+
│ ├── train.py # Huấn luyện mô hình
|
56 |
+
│ ├── evaluate.py # Đánh giá mô hình
|
57 |
+
│ └── predict.py # Dự đoán với mô hình đã huấn luyện
|
58 |
+
│
|
59 |
+
├── models/ # Mô hình đã lưu sau khi huấn luyện
|
60 |
+
│ └── best_model.pth # File trọng số mô hình
|
61 |
+
│
|
62 |
+
├── outputs/ # Kết quả, biểu đồ, log, metrics
|
63 |
+
│ ├── logs/ # Nhật ký huấn luyện (tensorboard/logging)
|
64 |
+
│ └── figures/ # Biểu đồ trực quan hóa
|
65 |
+
│
|
66 |
+
├── configs/ # File cấu hình cho mô hình, huấn luyện
|
67 |
+
│ └── config.yaml
|
68 |
+
│
|
69 |
+
├── tests/ # Unit test cho các hàm chính
|
70 |
+
│
|
71 |
+
├── requirements.txt # Thư viện cần cài đặt
|
72 |
+
├── environment.yml # Môi trường Conda
|
73 |
+
├── README.md # Giới thiệu dự án
|
74 |
+
└── run.py # Script chính để chạy toàn bộ pipeline
|
75 |
+
```
|
76 |
+
|
77 |
+
---
|
78 |
+
|
79 |
+
## 📚 Additional Resources (Optional)
|
80 |
+
|
81 |
+
If you have any questions about the project structure, consider reading these helpful articles first:
|
82 |
+
|
83 |
+
* [Understanding `__init__.py`](https://zetcode.com/python/init-file/)
|
84 |
+
* [Markdown Basic Syntax](https://www.markdownguide.org/basic-syntax/#escaping-characters)
|
85 |
+
* [Difference Between `requirements.txt` and `environment.yml`](https://www.reddit.com/r/learnpython/comments/xvlpdz/why_do_people_provide_a_requirementstxt_or/)
|
86 |
+
|
87 |
+
These resources could be useful for you!
|
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/__init__.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Marks the directory as a Python package."""
|
2 |
+
__version__ = "1.0.0"
|
3 |
+
__author__ = "Duc Lai"
|
4 |
+
PACKAGE_NAME = "src"
|
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/app.py
ADDED
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Streamlit front-end for the Vietnamese NER demo (three tabs)."""
import streamlit as st
import pandas as pd
from src.predict import predict_demo
from src.front import render_html

st.set_page_config(page_title="Vietnamese NER", layout="wide")

# ===== Main title =====
st.title("🔍 Ứng dụng nhận diện thực thể có tên (NER) cho tiếng Việt")

# Tabs: data analysis / training results / live model demo
tab1, tab2, tab3 = st.tabs(["📊 Phân tích dữ liệu", "📈 Kết quả huấn luyện", "🧪 Demo mô hình"])

# --- Tab 1: DATA ANALYSIS ---
with tab1:
    st.header("📊 Phân tích dữ liệu")

    # NOTE(review): entity counts are hard-coded for illustration — confirm
    # whether they should be computed from the actual dataset.
    df = pd.DataFrame({
        "Loại thực thể": ["PER", "LOC", "ORG", "MISC"],
        "Số lượng": [3200, 2500, 1800, 900]
    })

    st.bar_chart(df.set_index("Loại thực thể"))

# --- Tab 2: TRAINING RESULTS ---
with tab2:
    st.header("📈 Kết quả huấn luyện")

    # NOTE(review): static example loss curve — not read from training logs.
    loss = [0.9, 0.7, 0.5, 0.35, 0.28]
    epoch = [1, 2, 3, 4, 5]
    df_loss = pd.DataFrame({"Epoch": epoch, "Loss": loss})
    st.line_chart(df_loss.set_index("Epoch"))

    st.subheader("Đánh giá mô hình")
    df_eval = pd.DataFrame({
        "Phiên bản": ["v1", "v2", "v3"],
        "F1-score": [0.78, 0.83, 0.86],
        "Accuracy": [0.81, 0.85, 0.88]
    })
    st.dataframe(df_eval)

# --- Tab 3: MODEL DEMO ---
with tab3:
    st.header("🧪 Vietnamese Named Entity Recognition")

    text = st.text_input("Nhập văn bản tiếng Việt:", "Nguyễn Văn A đang làm việc tại Hà Nội")

    if st.button("Phân tích"):
        if not text.strip():
            st.warning("Vui lòng nhập văn bản!")
        else:
            # Run the trained tagger; returns parallel (tokens, IOB labels).
            tokens, labels = predict_demo(text)

            st.subheader("Thực thể được phát hiện")
            # Keep only tokens carrying an entity tag (drop 'O').
            entities = [(tok, lab) for tok, lab in zip(tokens, labels) if lab != "O"]

            if entities:
                for tok, lab in entities:
                    st.markdown(f"🔹 **{tok}** — *{lab}*")
            else:
                st.info("Không phát hiện thực thể.")

            st.subheader("Highlight trong văn bản:")
            # render_html wraps entity spans in coloured <span> tags.
            st.markdown(render_html(tokens, labels), unsafe_allow_html=True)
|
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/configs.py
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Central experiment configuration shared by run.py and the src modules.
configs = {
    # Experiment identity (e.g. for experiment-tracking dashboards)
    "project": "NER",
    "name": "CRF_VLSP2016_Ultra",
    "model": "Linear/CRF",

    # Hyperparameters
    "optim": "Adam",          # optimizer name (informational)
    "learning_rate": 1e-3,
    "batch_size": 16,
    "epochs": 20,
    "train_ratio": 0.7,       # train/val/test split ratios — should sum to 1.0
    "val_ratio": 0.15,
    "test_ratio": 0.15
}
|
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/data_set.py
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from torch.utils.data import Dataset
|
2 |
+
import torch
|
3 |
+
|
4 |
+
class NerDataset(Dataset):
    """Torch Dataset pairing per-sentence embeddings with tag sequences.

    Item i is the pair (embeddings[i], labels[i]), where embeddings[i] is a
    [seq_len_i, dim] tensor and labels[i] a [seq_len_i] tensor.
    """

    def __init__(self, embeddings, labels):
        super().__init__()
        # Parallel sequences: element i of each belongs to sentence i.
        self.embeddings = embeddings
        self.labels = labels

    def __len__(self):
        # One sample per sentence.
        return len(self.embeddings)

    def __getitem__(self, idx):
        sample = (self.embeddings[idx], self.labels[idx])
        return sample
|
15 |
+
|
16 |
+
def collate_fn(batch):  # Batch_size x Seq_length x 768
    """Collate variable-length (embedding, label) pairs into padded batches.

    Args:
        batch: list of (embedding [seq_len, dim] float tensor,
                        labels [seq_len] long tensor) pairs.

    Returns:
        (padded_embs [B, max_len, dim] — zero-padded,
         padded_labels [B, max_len] — padded with -1 (the mask sentinel),
         lengths — list of the original sequence lengths).
    """
    # Idiom fix: use pad_sequence instead of hand-rolled torch.cat padding;
    # it is the standard, C-accelerated way to batch ragged sequences.
    from torch.nn.utils.rnn import pad_sequence

    embeddings, labels = zip(*batch)
    lengths = [e.size(0) for e in embeddings]

    # Embeddings padded with 0.0; labels with -1 so padding is maskable.
    padded_embs = pad_sequence(embeddings, batch_first=True)
    padded_labels = pad_sequence(labels, batch_first=True, padding_value=-1)

    return padded_embs, padded_labels, lengths
|
30 |
+
|
31 |
+
|
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/evaluate.py
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from src.predict import predict
|
2 |
+
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, classification_report
|
3 |
+
|
4 |
+
def evaluate(model, loader, count_loss=True, report=False):
    """Evaluate a tagger over a DataLoader.

    Args:
        model: CRF tagger exposing forward (loss) and decode.
        loader: DataLoader yielding (x, y, lengths) batches.
        count_loss: if True, also accumulate the mean loss across batches.
        report: if True, additionally build a sklearn classification report.

    Returns:
        (precision, recall, f1, acc, loss, class_report) — precision/recall/f1
        are macro-averaged; class_report is None unless report=True.
    """

    # Model prediction (inference)
    all_preds, all_true, loss = predict(model, loader, count_loss)
    class_report = None

    # Macro-averaged metrics over all (non-padding) tokens
    precision, recall, f1, _ = precision_recall_fscore_support(all_true, all_preds, average='macro', zero_division=0)
    acc = accuracy_score(all_true, all_preds)

    # Optional per-class breakdown
    if report:
        class_report = classification_report(all_true, all_preds)

    return precision, recall, f1, acc, loss, class_report

def evaluate_ignore_O(model, loader):
    # TODO: evaluation that excludes the dominant 'O' tag — not implemented.
    pass
|
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/front.py
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
def render_html(tokens, labels):
    """Render tokens as HTML, highlighting IOB-tagged entity spans.

    PER, ORG and LOC spans each get a distinct background colour; unknown
    entity types fall back to light gray, and 'O' tokens stay plain.
    """
    palette = {
        "PER": "lightcoral",
        "ORG": "lightblue",
        "LOC": "lightgreen",
    }

    parts = []
    open_entity = None  # entity type of the <span> currently being emitted

    for word, tag in zip(tokens, labels):
        if tag.startswith("B-"):
            # A new entity begins: close any span still in progress.
            if open_entity:
                parts.append("</span> ")
            open_entity = tag[2:]
            bg = palette.get(open_entity, "lightgray")
            parts.append(
                f"<span style='background-color:{bg};padding:2px;border-radius:4px;' "
                f"title='{open_entity}'>{word}"
            )
        elif tag.startswith("I-") and open_entity:
            # Continuation token: append inside the open span.
            parts.append(f" {word}")
        else:
            # Plain token (or stray I- with no open span): close and emit.
            if open_entity:
                parts.append("</span> ")
                open_entity = None
            parts.append(f"{word} ")

    if open_entity:
        parts.append("</span>")

    markup = "".join(parts).strip()
    return f"<div style='font-family:monospace;font-size:16px'>{markup}</div>"
|
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/model.py
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from torchcrf import CRF
|
2 |
+
import torch.nn as nn
|
3 |
+
|
4 |
+
class CRF_Tagger(nn.Module):
    """Linear-chain CRF tagger over pre-computed token embeddings.

    A single linear layer maps each token embedding to per-tag emission
    scores; the CRF layer models tag-transition structure on top.
    """

    def __init__(self, input_dim, num_tags):
        super().__init__()
        self.embed2tag = nn.Linear(input_dim, num_tags)
        self.crf = CRF(num_tags, batch_first=True)

    def forward(self, x, labels, mask):
        # x: [batch, seq_len, input_dim]; labels/mask: [batch, seq_len].
        emissions = self.embed2tag(x)
        # CRF() returns the log-likelihood; negate so forward() yields a loss.
        return -self.crf(emissions, labels, mask=mask, reduction="mean")

    def decode(self, x, mask=None):
        # Viterbi decoding: returns one best tag-id sequence per batch item.
        emissions = self.embed2tag(x)
        return self.crf.decode(emissions, mask)
|
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/predict.py
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from src.model import CRF_Tagger
|
3 |
+
from src.preprocessing import process_demo_sentence
|
4 |
+
|
5 |
+
def predict(model, loader, count_loss=True):
    """Run inference over a DataLoader.

    Args:
        model: CRF tagger exposing forward (loss) and decode.
        loader: DataLoader yielding (x, y, lengths); padded label slots are -1.
        count_loss: if True, also accumulate the mean per-batch loss.

    Returns:
        (all_preds, all_true, mean_loss) — flat lists of predicted / gold
        tag ids over all non-padding tokens, plus the average batch loss
        (0.0 if count_loss is False).
    """

    model.eval()  # Evaluation mode: disables Dropout, BatchNorm updates, ...
    all_preds, all_true = [], []
    loss = 0.0

    with torch.no_grad():  # Stop tracking gradients during inference
        for x, y, _ in loader:
            # True for real tokens, False for -1 padding slots
            mask = (y != -1)

            # Get loss
            if count_loss:
                loss += model(x, y, mask).item()

            # Get prediction (one tag-id sequence per sentence)
            preds = model.decode(x, mask)

            # Loop over each sentence in the mini-batch
            for pred_seq, true_seq, m in zip(preds, y, mask):
                true_labels = true_seq[m].tolist()  # boolean-mask indexing drops padding
                all_preds.extend(pred_seq)
                all_true.extend(true_labels)

    return all_preds, all_true, loss/len(loader)
|
29 |
+
|
30 |
+
def predict_demo(text):
    """Tag a raw Vietnamese sentence with the trained CRF model.

    Args:
        text: raw input sentence.

    Returns:
        (tokens, labels) — parallel lists of word tokens and IOB tag names.
    """

    id_tag = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}

    x, tokens = process_demo_sentence(text)  # x: 1 x seq_length x 768
    NUM_TAGS = 7

    model = CRF_Tagger(input_dim=x.size(2), num_tags=NUM_TAGS)
    # BUG FIX: the old literal ".\models\best_epoch_16.pt" contained the
    # escape "\b" (backspace), so the checkpoint path was corrupt. Use a
    # portable forward-slash path and load onto CPU so inference also works
    # on CPU-only hosts (e.g. the Streamlit Space).
    model.load_state_dict(torch.load("models/best_epoch_16.pt", map_location="cpu"))
    model.eval()
    with torch.no_grad():
        preds = model.decode(x)

    # decode() returns one tag sequence per batch item; we only have one.
    labels = [id_tag[lab] for lab in preds[0]]

    return tokens, labels
|
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/preprocessing.py
ADDED
@@ -0,0 +1,171 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import torch
|
3 |
+
from transformers import AutoTokenizer, AutoModel
|
4 |
+
from tqdm import tqdm
|
5 |
+
from sklearn.model_selection import train_test_split
|
6 |
+
from src.configs import configs
|
7 |
+
from pyvi import ViTokenizer
|
8 |
+
|
9 |
+
def join_tokens(tokens):
    """Join word-segmented tokens into a single space-separated string."""
    return " ".join(tokens)
|
12 |
+
|
13 |
+
def reform_raw_text(tokens):
    """Rebuild raw text: join tokens, then turn segmentation underscores back into spaces."""
    joined = " ".join(tokens)
    return joined.replace("_", " ")
|
16 |
+
|
17 |
+
def label(x):
    """Map a sequence of numeric tag ids to their BIO tag strings.

    Fix: the original signature was ``def label(x, ):`` with a stray trailing
    comma; callers pass one positional argument, so this stays compatible.

    Args:
        x: iterable of tag ids (ints or numeric strings) in the range 0-6.

    Returns:
        List of tag strings, e.g. ``[0, 1] -> ['O', 'B-PER']``.
    """
    id_tag = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}
    return [id_tag[int(i)] for i in x]
|
20 |
+
|
21 |
+
def replace_7_8(lst):
    """Collapse tag ids 7 and 8 (classes unused by this project) to 0 ('O')."""
    remap = {7: 0, 8: 0}
    return [remap.get(item, item) for item in lst]
|
23 |
+
|
24 |
+
# Hàm gộp các embedding vectors của token bị tách ra khi qua SentencePiece
|
25 |
+
def group_embeddings(tokens, embeddings):
    """Merge subword embeddings back into word-level embeddings.

    PhoBERT's BPE marks non-final subword pieces with a trailing "@@"; all
    pieces belonging to one word are averaged into a single vector. The
    special "<s>"/"</s>" tokens are dropped.

    Args:
        tokens: subword token strings aligned with ``embeddings``.
        embeddings: per-subword embedding tensors.

    Returns:
        List of word-level embedding tensors (mean of their subword pieces).
    """
    merged = []
    pending = []

    for tok, vec in zip(tokens, embeddings):
        if tok in ("<s>", "</s>"):
            continue

        pending.append(vec)
        # A token without the "@@" continuation marker ends the current word.
        if not tok.endswith("@@"):
            merged.append(torch.mean(torch.stack(pending), dim=0))
            pending = []

    if pending:  # flush pieces left over at end of sentence
        merged.append(torch.mean(torch.stack(pending), dim=0))

    return merged
|
46 |
+
|
47 |
+
|
48 |
+
# Download the dataset form Hugging Face
|
49 |
+
def download_raw_data():
    """Download the VLSP2016-NER train/valid parquet splits from Hugging Face.

    Returns:
        A single dataframe with both splits concatenated and the index reset.
    """
    base = "hf://datasets/datnth1709/VLSP2016-NER-data/"
    splits = {'train': 'data/train-00000-of-00001-b0417886a268b83a.parquet', 'valid': 'data/valid-00000-of-00001-846411c236133ba3.parquet'}
    frames = [pd.read_parquet(base + splits[name]) for name in ("train", "valid")]
    return pd.concat(frames).reset_index(drop=True)
|
56 |
+
|
57 |
+
# Process dataframe for EDA
|
58 |
+
def preprocess_data_for_EDA(df):
    """Prepare the raw VLSP dataframe for exploratory analysis.

    Collapses the unused tag ids 7/8 to 'O', adds segmented-text, raw-text and
    string-label columns, then renames all columns.

    Args:
        df: dataframe with ``tokens`` and ``ner_tags`` columns.

    Returns:
        The same dataframe (mutated in place) with columns
        ``['tokens', 'id_labels', 'seg_text', 'raw_text', 'labels']``.
    """
    # NOTE(review): the original built tag_id/id_tag lookup dicts here but
    # never used them; removed as dead code (label() owns the id->tag mapping).

    # Add derived columns and normalise inappropriate tags.
    df['ner_tags'] = df['ner_tags'].apply(replace_7_8)
    df['text_withseg'] = df['tokens'].apply(join_tokens)
    df['text_raw'] = df['tokens'].apply(reform_raw_text)
    df["ner_labels"] = df.ner_tags.apply(label)
    df.columns = ['tokens', 'id_labels', 'seg_text', 'raw_text', 'labels']

    return df
|
71 |
+
|
72 |
+
|
73 |
+
|
74 |
+
|
75 |
+
def load_phoBERT_model_and_tokenizer():
    """Load the pretrained PhoBERT-base encoder and its (slow) tokenizer.

    Returns:
        Tuple ``(model, tokenizer)`` with the model switched to eval mode.
    """
    phobert_tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base", use_fast=False)
    phobert = AutoModel.from_pretrained("vinai/phobert-base")
    phobert.eval()
    return phobert, phobert_tokenizer
|
81 |
+
|
82 |
+
|
83 |
+
# Embedding text
|
84 |
+
def create_embeddings(df, model, tokenizer):
    """Encode every sentence of ``df`` into word-level PhoBERT embeddings.

    Rows whose number of merged word embeddings does not match the number of
    gold labels are skipped.

    Args:
        df: dataframe with ``seg_text`` (segmented sentence) and ``id_labels``.
        model: PhoBERT encoder (moved to GPU when available).
        tokenizer: matching PhoBERT tokenizer.

    Returns:
        Dict with keys ``"embeddings"`` (list of [seq_len_i, 768] tensors) and
        ``"labels"`` (list of [seq_len_i] tensors), ready for training.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    embeddings_out = []   # list of [seq_len_i, 768] tensors
    labels_out = []       # list of [seq_len_i,] tensors
    skipped_rows = []

    for idx, row in tqdm(df.iterrows(), total=len(df)):
        # Per-row fields
        sentence = row['seg_text']
        gold_labels = row["id_labels"]

        # Run the sentence through the SentencePiece/BPE tokenizer.
        input_ids = tokenizer.encode(sentence, return_tensors="pt").to(device)
        subword_tokens = tokenizer.convert_ids_to_tokens(input_ids[0].cpu())

        # Encode to contextual embeddings.
        with torch.no_grad():
            last_hidden = model(input_ids).last_hidden_state.squeeze(0).cpu()

        # Merge subword embeddings split by SentencePiece back to word level.
        word_embeds = group_embeddings(subword_tokens, last_hidden)

        # Length mismatch between embeddings and labels -> drop this row.
        if len(word_embeds) != len(gold_labels):
            skipped_rows.append(idx)
            continue

        embeddings_out.append(torch.stack(word_embeds))
        labels_out.append(torch.tensor(gold_labels))

    return {
        "embeddings": embeddings_out,
        "labels": labels_out
    }
|
128 |
+
|
129 |
+
|
130 |
+
def split_dataset(data):
    """Split embeddings/labels into train/val/test using the ``configs`` ratios.

    Args:
        data: dict with ``"embeddings"`` and ``"labels"`` lists.

    Returns:
        ``(X_train, Y_train, X_val, Y_val, X_test, Y_test)``.
    """
    # First carve off the test set.
    X_rest, X_test, Y_rest, Y_test = train_test_split(
        data["embeddings"], data["labels"],
        test_size=configs["test_ratio"], random_state=42,
    )

    # Then split the remainder into train and validation.
    val_share = configs["val_ratio"] / (configs["val_ratio"] + configs["train_ratio"])
    X_train, X_val, Y_train, Y_val = train_test_split(
        X_rest, Y_rest, test_size=val_share, random_state=42,
    )

    return X_train, Y_train, X_val, Y_val, X_test, Y_test
|
140 |
+
|
141 |
+
|
142 |
+
# TODO: Refactor hàm process_demo_sentence, và hàm predict demo, warning nếu độ dài tokens_word không bằng độ dài sau group_embeddings
|
143 |
+
|
144 |
+
def process_demo_sentence(text):
    """Embed a single raw sentence for the demo.

    Args:
        text: raw Vietnamese sentence (unsegmented).

    Returns:
        Tuple ``(embeddings, tokens)`` where ``embeddings`` has shape
        1 x seq_length x 768 and ``tokens`` are the word-segmented tokens.
    """
    segmented_text = ViTokenizer.tokenize(text)
    tokens_word = segmented_text.strip().split(" ")

    model, tokenizer = load_phoBERT_model_and_tokenizer()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    input_ids = tokenizer.encode(segmented_text, return_tensors="pt").to(device)
    subword_tokens = tokenizer.convert_ids_to_tokens(input_ids[0].cpu())

    with torch.no_grad():
        hidden = model(input_ids).last_hidden_state.squeeze(0).cpu()

    # Merge subword embeddings back to word level.
    word_embeds = group_embeddings(subword_tokens, hidden)

    # Stack to (seq_length, 768) then add a leading batch dimension of 1.
    batch = torch.stack(word_embeds).unsqueeze(0)

    # NOTE(review): if len(tokens_word) != len(word_embeds), labels will be
    # misaligned downstream — a warning here would be worthwhile (see the
    # existing TODO in this module).
    return batch, tokens_word
|
171 |
+
|
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/torchcrf/__init__.py
ADDED
@@ -0,0 +1,340 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
__version__ = '0.7.2'
|
2 |
+
|
3 |
+
from typing import List, Optional
|
4 |
+
|
5 |
+
import torch
|
6 |
+
import torch.nn as nn
|
7 |
+
|
8 |
+
|
9 |
+
class CRF(nn.Module):
    """Conditional random field.

    This module implements a conditional random field [LMP01]_. The forward computation
    of this class computes the log likelihood of the given sequence of tags and
    emission score tensor. This class also has `~CRF.decode` method which finds
    the best tag sequence given an emission score tensor using `Viterbi algorithm`_.

    Args:
        num_tags: Number of tags.
        batch_first: Whether the first dimension corresponds to the size of a minibatch.

    Attributes:
        start_transitions (`~torch.nn.Parameter`): Start transition score tensor of size
            ``(num_tags,)``.
        end_transitions (`~torch.nn.Parameter`): End transition score tensor of size
            ``(num_tags,)``.
        transitions (`~torch.nn.Parameter`): Transition score tensor of size
            ``(num_tags, num_tags)``.


    .. [LMP01] Lafferty, J., McCallum, A., Pereira, F. (2001).
        "Conditional random fields: Probabilistic models for segmenting and
        labeling sequence data". *Proc. 18th International Conf. on Machine
        Learning*. Morgan Kaufmann. pp. 282–289.

    .. _Viterbi algorithm: https://en.wikipedia.org/wiki/Viterbi_algorithm
    """

    def __init__(self, num_tags: int, batch_first: bool = False) -> None:
        if num_tags <= 0:
            raise ValueError(f'invalid number of tags: {num_tags}')
        super().__init__()
        self.num_tags = num_tags
        self.batch_first = batch_first
        # Learned scores for starting/ending a sequence in each tag, plus the
        # (from_tag, to_tag) transition matrix.
        self.start_transitions = nn.Parameter(torch.empty(num_tags))
        self.end_transitions = nn.Parameter(torch.empty(num_tags))
        self.transitions = nn.Parameter(torch.empty(num_tags, num_tags))

        self.reset_parameters()

    def reset_parameters(self) -> None:
        """Initialize the transition parameters.

        The parameters will be initialized randomly from a uniform distribution
        between -0.1 and 0.1.
        """
        nn.init.uniform_(self.start_transitions, -0.1, 0.1)
        nn.init.uniform_(self.end_transitions, -0.1, 0.1)
        nn.init.uniform_(self.transitions, -0.1, 0.1)

    def __repr__(self) -> str:
        return f'{self.__class__.__name__}(num_tags={self.num_tags})'

    def forward(
            self,
            emissions: torch.Tensor,
            tags: torch.LongTensor,
            mask: Optional[torch.ByteTensor] = None,
            reduction: str = 'sum',
    ) -> torch.Tensor:
        """Compute the conditional log likelihood of a sequence of tags given emission scores.

        Args:
            emissions (`~torch.Tensor`): Emission score tensor of size
                ``(seq_length, batch_size, num_tags)`` if ``batch_first`` is ``False``,
                ``(batch_size, seq_length, num_tags)`` otherwise.
            tags (`~torch.LongTensor`): Sequence of tags tensor of size
                ``(seq_length, batch_size)`` if ``batch_first`` is ``False``,
                ``(batch_size, seq_length)`` otherwise.
            mask (`~torch.ByteTensor`): Mask tensor of size ``(seq_length, batch_size)``
                if ``batch_first`` is ``False``, ``(batch_size, seq_length)`` otherwise.
            reduction: Specifies the reduction to apply to the output:
                ``none|sum|mean|token_mean``. ``none``: no reduction will be applied.
                ``sum``: the output will be summed over batches. ``mean``: the output will be
                averaged over batches. ``token_mean``: the output will be averaged over tokens.

        Returns:
            `~torch.Tensor`: The log likelihood. This will have size ``(batch_size,)`` if
            reduction is ``none``, ``()`` otherwise.
        """
        self._validate(emissions, tags=tags, mask=mask)
        if reduction not in ('none', 'sum', 'mean', 'token_mean'):
            raise ValueError(f'invalid reduction: {reduction}')
        if mask is None:
            # NOTE(review): this default mask is uint8; recent PyTorch expects a
            # bool condition in torch.where (used in _compute_normalizer). Callers
            # in this project pass a bool (y != -1) mask — confirm before relying
            # on the uint8 default.
            mask = torch.ones_like(tags, dtype=torch.uint8)

        if self.batch_first:
            emissions = emissions.transpose(0, 1)
            tags = tags.transpose(0, 1)
            mask = mask.transpose(0, 1)

        # shape: (batch_size,)
        numerator = self._compute_score(emissions, tags, mask)
        # shape: (batch_size,)
        denominator = self._compute_normalizer(emissions, mask)
        # shape: (batch_size,)
        llh = numerator - denominator

        if reduction == 'none':
            return llh
        if reduction == 'sum':
            return llh.sum()
        if reduction == 'mean':
            return llh.mean()
        assert reduction == 'token_mean'
        return llh.sum() / mask.type_as(emissions).sum()

    @torch.jit.export
    def decode(self, emissions: torch.Tensor,
               mask: Optional[torch.ByteTensor] = None) -> List[List[int]]:
        """Find the most likely tag sequence using Viterbi algorithm.

        Args:
            emissions (`~torch.Tensor`): Emission score tensor of size
                ``(seq_length, batch_size, num_tags)`` if ``batch_first`` is ``False``,
                ``(batch_size, seq_length, num_tags)`` otherwise.
            mask (`~torch.ByteTensor`): Mask tensor of size ``(seq_length, batch_size)``
                if ``batch_first`` is ``False``, ``(batch_size, seq_length)`` otherwise.

        Returns:
            List of list containing the best tag sequence for each batch.
        """
        self._validate(emissions, mask=mask)
        if mask is None:
            mask = emissions.new_ones(emissions.shape[:2], dtype=torch.uint8)

        if self.batch_first:
            emissions = emissions.transpose(0, 1)
            mask = mask.transpose(0, 1)

        return self._viterbi_decode(emissions, mask)

    def _validate(
            self,
            emissions: torch.Tensor,
            tags: Optional[torch.LongTensor] = None,
            mask: Optional[torch.ByteTensor] = None) -> None:
        # Sanity-check tensor ranks and shapes before the heavy computation.
        if emissions.dim() != 3:
            raise ValueError(f'emissions must have dimension of 3, got {emissions.dim()}')
        if emissions.size(2) != self.num_tags:
            raise ValueError(
                f'expected last dimension of emissions is {self.num_tags}, '
                f'got {emissions.size(2)}')

        if tags is not None:
            if emissions.shape[:2] != tags.shape:
                raise ValueError(
                    'the first two dimensions of emissions and tags must match, '
                    f'got {(emissions.shape[0], emissions.shape[1])} and {(tags.shape[0], tags.shape[1])}'
                )

        if mask is not None:
            if emissions.shape[:2] != mask.shape:
                raise ValueError(
                    'the first two dimensions of emissions and mask must match, '
                    f'got {(emissions.shape[0], emissions.shape[1])} and {(mask.shape[0], mask.shape[1])}'
                )
            # The algorithms below assume every sequence has at least one valid
            # (unmasked) first timestep.
            no_empty_seq = not self.batch_first and mask[0].all()
            no_empty_seq_bf = self.batch_first and mask[:, 0].all()
            if not no_empty_seq and not no_empty_seq_bf:
                raise ValueError('mask of the first timestep must all be on')

    def _compute_score(
            self, emissions: torch.Tensor, tags: torch.LongTensor,
            mask: torch.ByteTensor) -> torch.Tensor:
        # Score of the GIVEN tag path (numerator of the log likelihood).
        # emissions: (seq_length, batch_size, num_tags)
        # tags: (seq_length, batch_size)
        # mask: (seq_length, batch_size)
        assert emissions.dim() == 3 and tags.dim() == 2
        assert emissions.shape[:2] == tags.shape
        assert emissions.size(2) == self.num_tags
        assert mask.shape == tags.shape
        assert mask[0].all()

        seq_length, batch_size = tags.shape
        mask = mask.type_as(emissions)

        # Start transition score and first emission
        # shape: (batch_size,)
        score = self.start_transitions[tags[0]]
        score += emissions[0, torch.arange(batch_size), tags[0]]

        for i in range(1, seq_length):
            # Transition score to next tag, only added if next timestep is valid (mask == 1)
            # shape: (batch_size,)
            score += self.transitions[tags[i - 1], tags[i]] * mask[i]

            # Emission score for next tag, only added if next timestep is valid (mask == 1)
            # shape: (batch_size,)
            score += emissions[i, torch.arange(batch_size), tags[i]] * mask[i]

        # End transition score
        # shape: (batch_size,)
        seq_ends = mask.long().sum(dim=0) - 1
        # shape: (batch_size,)
        last_tags = tags[seq_ends, torch.arange(batch_size)]
        # shape: (batch_size,)
        score += self.end_transitions[last_tags]

        return score

    def _compute_normalizer(
            self, emissions: torch.Tensor, mask: torch.ByteTensor) -> torch.Tensor:
        # Log partition function over ALL tag paths (denominator of the log
        # likelihood), computed with the forward algorithm in log space.
        # emissions: (seq_length, batch_size, num_tags)
        # mask: (seq_length, batch_size)
        assert emissions.dim() == 3 and mask.dim() == 2
        assert emissions.shape[:2] == mask.shape
        assert emissions.size(2) == self.num_tags
        assert mask[0].all()

        seq_length = emissions.size(0)

        # Start transition score and first emission; score has size of
        # (batch_size, num_tags) where for each batch, the j-th column stores
        # the score that the first timestep has tag j
        # shape: (batch_size, num_tags)
        score = self.start_transitions + emissions[0]

        for i in range(1, seq_length):
            # Broadcast score for every possible next tag
            # shape: (batch_size, num_tags, 1)
            broadcast_score = score.unsqueeze(2)

            # Broadcast emission score for every possible current tag
            # shape: (batch_size, 1, num_tags)
            broadcast_emissions = emissions[i].unsqueeze(1)

            # Compute the score tensor of size (batch_size, num_tags, num_tags) where
            # for each sample, entry at row i and column j stores the sum of scores of all
            # possible tag sequences so far that end with transitioning from tag i to tag j
            # and emitting
            # shape: (batch_size, num_tags, num_tags)
            next_score = broadcast_score + self.transitions + broadcast_emissions

            # Sum over all possible current tags, but we're in score space, so a sum
            # becomes a log-sum-exp: for each sample, entry i stores the sum of scores of
            # all possible tag sequences so far, that end in tag i
            # shape: (batch_size, num_tags)
            next_score = torch.logsumexp(next_score, dim=1)

            # Set score to the next score if this timestep is valid (mask == 1)
            # shape: (batch_size, num_tags)
            score = torch.where(mask[i].unsqueeze(1), next_score, score)

        # End transition score
        # shape: (batch_size, num_tags)
        score += self.end_transitions

        # Sum (log-sum-exp) over all possible tags
        # shape: (batch_size,)
        return torch.logsumexp(score, dim=1)

    def _viterbi_decode(self, emissions: torch.FloatTensor,
                        mask: torch.ByteTensor) -> List[List[int]]:
        # Max-product (Viterbi) counterpart of _compute_normalizer: tracks the
        # single best-scoring path and backpointers instead of summing paths.
        # emissions: (seq_length, batch_size, num_tags)
        # mask: (seq_length, batch_size)
        assert emissions.dim() == 3 and mask.dim() == 2
        assert emissions.shape[:2] == mask.shape
        assert emissions.size(2) == self.num_tags
        assert mask[0].all()

        seq_length, batch_size = mask.shape

        # Start transition and first emission
        # shape: (batch_size, num_tags)
        score = self.start_transitions + emissions[0]
        history: List[torch.Tensor] = []

        # score is a tensor of size (batch_size, num_tags) where for every batch,
        # value at column j stores the score of the best tag sequence so far that ends
        # with tag j
        # history saves where the best tags candidate transitioned from; this is used
        # when we trace back the best tag sequence

        # Viterbi algorithm recursive case: we compute the score of the best tag sequence
        # for every possible next tag
        for i in range(1, seq_length):
            # Broadcast viterbi score for every possible next tag
            # shape: (batch_size, num_tags, 1)
            broadcast_score = score.unsqueeze(2)

            # Broadcast emission score for every possible current tag
            # shape: (batch_size, 1, num_tags)
            broadcast_emission = emissions[i].unsqueeze(1)

            # Compute the score tensor of size (batch_size, num_tags, num_tags) where
            # for each sample, entry at row i and column j stores the score of the best
            # tag sequence so far that ends with transitioning from tag i to tag j and emitting
            # shape: (batch_size, num_tags, num_tags)
            next_score = broadcast_score + self.transitions + broadcast_emission

            # Find the maximum score over all possible current tag
            # shape: (batch_size, num_tags)
            next_score, indices = next_score.max(dim=1)

            # Set score to the next score if this timestep is valid (mask == 1)
            # and save the index that produces the next score
            # shape: (batch_size, num_tags)
            score = torch.where(mask[i].unsqueeze(1), next_score, score)
            history.append(indices)

        # End transition score
        # shape: (batch_size, num_tags)
        score += self.end_transitions

        # Now, compute the best path for each sample

        # shape: (batch_size,)
        seq_ends = mask.long().sum(dim=0) - 1
        best_tags_list: List[List[int]] = []

        for idx in range(batch_size):
            # Find the tag which maximizes the score at the last timestep; this is our best tag
            # for the last timestep
            _, best_last_tag = score[idx].max(dim=0)
            best_tags: List[int] = []
            best_tags.append(best_last_tag.item())

            # We trace back where the best last tag comes from, append that to our best tag
            # sequence, and trace it back again, and so on
            # NOTE: reversed() cannot be used here because it is not supported by TorchScript,
            # see https://github.com/pytorch/pytorch/issues/31772.
            for hist in history[:seq_ends[idx]][::-1]:
                best_last_tag = hist[idx][best_tags[-1]]
                best_tags.append(best_last_tag.item())

            # Reverse the order because we start from the last timestep
            best_tags.reverse()
            best_tags_list.append(best_tags)

        return best_tags_list
|
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/train.py
ADDED
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import wandb
|
2 |
+
from tqdm import tqdm
|
3 |
+
from src.evaluate import evaluate
|
4 |
+
import torch
|
5 |
+
|
6 |
+
def train_model(model, optimizer, configs, loaders):
    """Train the tagger with W&B tracking, checkpointing the best val-F1 model.

    Args:
        model: CRF tagger; ``model(x, y, mask)`` returns the batch loss.
        optimizer: torch optimizer over ``model``'s parameters.
        configs: dict with at least ``project``, ``name`` and ``epochs``.
        loaders: dict with ``'train'``, ``'val'`` and ``'test'`` loaders
            yielding ``(x, y, _)`` batches where -1 marks padding in ``y``.

    Side effects:
        Logs metrics to Weights & Biases, saves checkpoints under ``./models/``
        and prints a classification report for the test set.
    """
    # Login and initialise a W&B run for this training session.
    wandb.login()
    wandb.init(
        project=configs["project"],
        name=configs["name"],
        config=configs
    )

    # Log gradients/weights of all parameters.
    wandb.watch(model, log="all")

    # Track the best checkpoint by validation F1.
    best_val_f1 = 0.0
    ckpt_path = None  # fix: previously unbound (NameError) if no epoch ever improved

    # Training loop
    for epoch in range(1, configs["epochs"] + 1):
        model.train()
        total_loss = 0.0

        # Progress bar over training batches
        train_bar = tqdm(loaders['train'], desc=f"Train Epoch {epoch}/{configs['epochs']}")

        for batch_idx, (x, y, _) in enumerate(train_bar, start=1):
            mask = (y != -1)  # -1 marks padded positions
            loss = model(x, y, mask)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

            train_bar.set_postfix(batch_loss=loss.item(), avg_loss=total_loss / batch_idx)

        # Evaluate model after each epoch
        avg_train_loss = total_loss / len(loaders['train'])
        train_precision, train_recall, train_f1, train_acc, _, _ = evaluate(model, loaders['train'], count_loss=False)
        val_precision, val_recall, val_f1, val_acc, avg_val_loss, _ = evaluate(model, loaders['val'], count_loss=True)

        # Log metrics for train and validation sets
        print(f"Epoch {epoch}: train_loss={avg_train_loss:.4f}, train_f1={train_f1:.4f}, val_loss={avg_val_loss:.4f}, val_f1={val_f1:.4f}")
        wandb.log({
            "epoch": epoch,

            # Group: Training metrics
            "Train/Loss": avg_train_loss,
            "Train/Precision": train_precision,
            "Train/Recall": train_recall,
            "Train/F1": train_f1,
            "Train/Accuracy": train_acc,

            # Group: Validation metrics
            "Val/Loss": avg_val_loss,
            "Val/Precision": val_precision,
            "Val/Recall": val_recall,
            "Val/F1": val_f1,
            "Val/Accuracy": val_acc
        })

        # Save best model based on val_f1
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            ckpt_path = f"./models/best_epoch_{epoch}.pt"
            torch.save(model.state_dict(), ckpt_path)
            wandb.save(ckpt_path)
            print(f"Saved improved model to {ckpt_path}")  # fix: typo "imporved"

        print()

    # Load the best checkpoint (if any epoch improved) before testing.
    if ckpt_path is not None:
        print(f"Loading best model from {ckpt_path} for final evaluation...")
        model.load_state_dict(torch.load(ckpt_path))
        print("Done \n")
    else:
        print("No improved checkpoint was saved; evaluating the final model state.")

    # Log metrics for the test set
    print("Evaluation on test set ...")
    test_precision, test_recall, test_f1, test_acc, avg_test_loss, report = evaluate(model, loaders['test'], count_loss=True, report=True)
    wandb.log({
        "Test/Loss": avg_test_loss,
        "Test/Precision": test_precision,
        "Test/Recall": test_recall,
        "Test/F1": test_f1,
        "Test/Accuracy": test_acc,
    })
    print(f"Test_loss={avg_test_loss:.4f}, Test_f1={test_f1:.4f}")
    print(report)

    # Finish W&B run
    wandb.finish()
|
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/app.py
CHANGED
@@ -1,7 +1,10 @@
|
|
1 |
import streamlit as st
|
2 |
import pandas as pd
|
|
|
|
|
3 |
from src.predict import predict_demo
|
4 |
from src.front import render_html
|
|
|
5 |
|
6 |
st.set_page_config(page_title="Vietnamese NER", layout="wide")
|
7 |
|
@@ -24,20 +27,99 @@ with tab1:
|
|
24 |
|
25 |
# --- Tab 2: KẾT QUẢ HUẤN LUYỆN ---
|
26 |
with tab2:
|
27 |
-
st.
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
|
42 |
# --- Tab 3: DEMO MÔ HÌNH ---
|
43 |
with tab3:
|
|
|
1 |
import streamlit as st
|
2 |
import pandas as pd
|
3 |
+
import plotly.graph_objects as go
|
4 |
+
|
5 |
from src.predict import predict_demo
|
6 |
from src.front import render_html
|
7 |
+
from results.output import training_log, report_dict, report_dict_2, model_compare, data_compare
|
8 |
|
9 |
st.set_page_config(page_title="Vietnamese NER", layout="wide")
|
10 |
|
|
|
27 |
|
28 |
# --- Tab 2: KẾT QUẢ HUẤN LUYỆN ---
|
29 |
with tab2:
    # NOTE(review): the duplicate st.set_page_config(...) call that used to live
    # here was removed — Streamlit allows set_page_config() at most once per run
    # (it is already called near the top of this file), and a second call raises
    # StreamlitAPIException.

    # ==== BUILD FIGURES ====

    # 1) Loss curves (train vs validation per epoch)
    fig_loss = go.Figure()
    fig_loss.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["train_loss"],
                                  mode='lines+markers', name='Train Loss'))
    fig_loss.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["val_loss"],
                                  mode='lines+markers', name='Val Loss'))
    fig_loss.update_layout(title="Loss Curve", xaxis_title="Epoch", yaxis_title="Loss")

    # 2) F1-score curves
    fig_f1 = go.Figure()
    fig_f1.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["train_f1"],
                                mode='lines+markers', name='Train F1'))
    fig_f1.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["val_f1"],
                                mode='lines+markers', name='Val F1'))
    fig_f1.update_layout(title="F1-Score Curve", xaxis_title="Epoch", yaxis_title="F1-Score")

    # 3) Classification report: per-label table + grouped bar charts
    labels = [k for k in report_dict.keys() if k not in ["accuracy", "macro avg", "weighted avg"]]
    report_data = [[lbl,
                    report_dict[lbl]["precision"],
                    report_dict[lbl]["recall"],
                    report_dict[lbl]["f1-score"]]
                   for lbl in labels]
    df_report = pd.DataFrame(report_data,
                             columns=["Label", "Precision", "Recall", "F1-Score"])

    fig_report = go.Figure()
    for col in ["Precision", "Recall", "F1-Score"]:
        fig_report.add_trace(go.Bar(x=df_report["Label"], y=df_report[col], name=col))
    fig_report.update_layout(barmode='group',
                             title="Class Report Metrics of PhoBert + CRF",
                             xaxis_title="Label", yaxis_title="Score",
                             yaxis=dict(range=[0, 1.0]))

    labels2 = [k for k in report_dict_2.keys() if k not in ["accuracy", "macro avg", "weighted avg"]]
    report_data2 = [[lbl,
                     report_dict_2[lbl]["precision"],
                     report_dict_2[lbl]["recall"],
                     report_dict_2[lbl]["f1-score"]]
                    for lbl in labels2]
    df_report2 = pd.DataFrame(report_data2,
                              columns=["Label", "Precision", "Recall", "F1-Score"])

    fig_report2 = go.Figure()
    for col in ["Precision", "Recall", "F1-Score"]:
        fig_report2.add_trace(go.Bar(x=df_report2["Label"], y=df_report2[col], name=col))
    fig_report2.update_layout(barmode='group',
                              title="Class Report Metrics of PhoBert + Softmax",
                              xaxis_title="Label", yaxis_title="Score",
                              yaxis=dict(range=[0, 1.0]))

    # 4) Model & data-preprocessing comparison tables
    df_model = pd.DataFrame(
        [[m, v["F1"], v["Accuracy"]] for m, v in model_compare["Data"].items()],
        columns=["Model", "F1-Score", "Accuracy"]
    )
    df_data = pd.DataFrame(
        [[s, f1] for s, f1 in data_compare["Data"].items()],
        columns=["Preprocessing", "F1-Score"]
    )

    # ==== COMPACT LAYOUT WITH COLUMNS ====

    # Row 1: Loss | F1
    col1, col2 = st.columns(2)
    with col1:
        st.plotly_chart(fig_loss, use_container_width=True)
    with col2:
        st.plotly_chart(fig_f1, use_container_width=True)

    # Row 2: Softmax report | CRF report
    col3, col4 = st.columns(2)
    with col3:
        st.plotly_chart(fig_report2, use_container_width=True)
    with col4:
        st.plotly_chart(fig_report, use_container_width=True)

    # Row 3: Model comparison | Data-preprocessing comparison
    col5, col6 = st.columns(2)
    with col5:
        st.markdown("**Model Comparison**")
        st.dataframe(df_model, use_container_width=True)
    with col6:
        st.markdown("**Data Preprocessing Comparison**")
        st.dataframe(df_data, use_container_width=True)
|
123 |
|
124 |
# --- Tab 3: DEMO MÔ HÌNH ---
|
125 |
with tab3:
|
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/predict.py
CHANGED
@@ -36,7 +36,7 @@ def predict_demo(text):
|
|
36 |
NUM_TAGS = 7
|
37 |
|
38 |
model = CRF_Tagger(input_dim=x.size(2), num_tags=NUM_TAGS)
|
39 |
-
model.load_state_dict(torch.load("
|
40 |
model.eval()
|
41 |
with torch.no_grad():
|
42 |
preds = model.decode(x)
|
|
|
36 |
NUM_TAGS = 7
|
37 |
|
38 |
model = CRF_Tagger(input_dim=x.size(2), num_tags=NUM_TAGS)
|
39 |
+
model.load_state_dict(torch.load("../models/best_epoch_16.pt"))
|
40 |
model.eval()
|
41 |
with torch.no_grad():
|
42 |
preds = model.decode(x)
|
space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/st.py
ADDED
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
import plotly.graph_objects as go
|
4 |
+
from results.output import training_log, report_dict, report_dict_2, model_compare, data_compare
|
5 |
+
|
6 |
+
st.set_page_config(
|
7 |
+
page_title="My NER App",
|
8 |
+
layout="wide",
|
9 |
+
initial_sidebar_state="expanded"
|
10 |
+
)
|
11 |
+
|
12 |
+
# ==== TẠO FIGURES ====
|
13 |
+
|
14 |
+
# 1️⃣ Loss
|
15 |
+
fig_loss = go.Figure()
|
16 |
+
fig_loss.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["train_loss"],
|
17 |
+
mode='lines+markers', name='Train Loss'))
|
18 |
+
fig_loss.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["val_loss"],
|
19 |
+
mode='lines+markers', name='Val Loss'))
|
20 |
+
fig_loss.update_layout(title="Loss Curve", xaxis_title="Epoch", yaxis_title="Loss")
|
21 |
+
|
22 |
+
# 2️⃣ F1-Score
|
23 |
+
fig_f1 = go.Figure()
|
24 |
+
fig_f1.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["train_f1"],
|
25 |
+
mode='lines+markers', name='Train F1'))
|
26 |
+
fig_f1.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["val_f1"],
|
27 |
+
mode='lines+markers', name='Val F1'))
|
28 |
+
fig_f1.update_layout(title="F1-Score Curve", xaxis_title="Epoch", yaxis_title="F1-Score")
|
29 |
+
|
30 |
+
# 3️⃣ Classification Report Table & Bar
|
31 |
+
labels = [k for k in report_dict.keys() if k not in ["accuracy", "macro avg", "weighted avg"]]
|
32 |
+
report_data = [[lbl,
|
33 |
+
report_dict[lbl]["precision"],
|
34 |
+
report_dict[lbl]["recall"],
|
35 |
+
report_dict[lbl]["f1-score"]]
|
36 |
+
for lbl in labels]
|
37 |
+
df_report = pd.DataFrame(report_data,
|
38 |
+
columns=["Label", "Precision", "Recall", "F1-Score"])
|
39 |
+
|
40 |
+
fig_report = go.Figure()
|
41 |
+
for col in ["Precision", "Recall", "F1-Score"]:
|
42 |
+
fig_report.add_trace(go.Bar(x=df_report["Label"], y=df_report[col], name=col))
|
43 |
+
fig_report.update_layout(barmode='group',
|
44 |
+
title="Class Report Metrics of PhoBert + CRF",
|
45 |
+
xaxis_title="Label", yaxis_title="Score",
|
46 |
+
yaxis=dict(range=[0,1.0]))
|
47 |
+
|
48 |
+
labels2 = [k for k in report_dict_2.keys() if k not in ["accuracy", "macro avg", "weighted avg"]]
|
49 |
+
report_data2 = [[lbl,
|
50 |
+
report_dict_2[lbl]["precision"],
|
51 |
+
report_dict_2[lbl]["recall"],
|
52 |
+
report_dict_2[lbl]["f1-score"]]
|
53 |
+
for lbl in labels2]
|
54 |
+
df_report2 = pd.DataFrame(report_data2,
|
55 |
+
columns=["Label", "Precision", "Recall", "F1-Score"])
|
56 |
+
|
57 |
+
fig_report2 = go.Figure()
|
58 |
+
for col in ["Precision", "Recall", "F1-Score"]:
|
59 |
+
fig_report2.add_trace(go.Bar(x=df_report2["Label"], y=df_report2[col], name=col))
|
60 |
+
fig_report2.update_layout(barmode='group',
|
61 |
+
title="Class Report Metrics of PhoBert + Softmax",
|
62 |
+
xaxis_title="Label", yaxis_title="Score",
|
63 |
+
yaxis=dict(range=[0,1.0]))
|
64 |
+
|
65 |
+
# 4️⃣ Model & Data Comparison Tables
|
66 |
+
df_model = pd.DataFrame(
|
67 |
+
[[m, v["F1"], v["Accuracy"]] for m, v in model_compare["Data"].items()],
|
68 |
+
columns=["Model", "F1-Score", "Accuracy"]
|
69 |
+
)
|
70 |
+
df_data = pd.DataFrame(
|
71 |
+
[[s, f1] for s, f1 in data_compare["Data"].items()],
|
72 |
+
columns=["Preprocessing", "F1-Score"]
|
73 |
+
)
|
74 |
+
|
75 |
+
# ==== LAYOUT RAO GỌN VỚI COLUMNS ====
|
76 |
+
|
77 |
+
# Row 1: Loss | F1
|
78 |
+
col1, col2 = st.columns(2)
|
79 |
+
with col1:
|
80 |
+
st.plotly_chart(fig_loss, use_container_width=True)
|
81 |
+
with col2:
|
82 |
+
st.plotly_chart(fig_f1, use_container_width=True)
|
83 |
+
|
84 |
+
# Row 2: Class Report Table | Bar Chart
|
85 |
+
col3, col4 = st.columns(2)
|
86 |
+
with col3:
|
87 |
+
st.plotly_chart(fig_report2, use_container_width=True)
|
88 |
+
with col4:
|
89 |
+
st.plotly_chart(fig_report, use_container_width=True)
|
90 |
+
|
91 |
+
# Row 3: Model Compare | Data Compare
|
92 |
+
col5, col6 = st.columns(2)
|
93 |
+
with col5:
|
94 |
+
st.markdown("**Model Comparison**")
|
95 |
+
st.dataframe(df_model, use_container_width=True)
|
96 |
+
with col6:
|
97 |
+
st.markdown("**Data Preprocessing Comparison**")
|
98 |
+
st.dataframe(df_data, use_container_width=True)
|
space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/predict.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
import torch
|
2 |
from src.model import CRF_Tagger
|
3 |
from src.preprocessing import process_demo_sentence
|
|
|
4 |
|
5 |
def predict(model, loader, count_loss=True):
|
6 |
|
@@ -29,6 +30,9 @@ def predict(model, loader, count_loss=True):
|
|
29 |
|
30 |
def predict_demo(text):
|
31 |
|
|
|
|
|
|
|
32 |
|
33 |
id_tag = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}
|
34 |
|
@@ -36,7 +40,7 @@ def predict_demo(text):
|
|
36 |
NUM_TAGS = 7
|
37 |
|
38 |
model = CRF_Tagger(input_dim=x.size(2), num_tags=NUM_TAGS)
|
39 |
-
model.load_state_dict(torch.load(
|
40 |
model.eval()
|
41 |
with torch.no_grad():
|
42 |
preds = model.decode(x)
|
|
|
1 |
import torch
|
2 |
from src.model import CRF_Tagger
|
3 |
from src.preprocessing import process_demo_sentence
|
4 |
+
import os
|
5 |
|
6 |
def predict(model, loader, count_loss=True):
|
7 |
|
|
|
30 |
|
31 |
def predict_demo(text):
|
32 |
|
33 |
+
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
34 |
+
model_path = os.path.join(BASE_DIR, "models", "best_epoch_16.pt")
|
35 |
+
|
36 |
|
37 |
id_tag = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}
|
38 |
|
|
|
40 |
NUM_TAGS = 7
|
41 |
|
42 |
model = CRF_Tagger(input_dim=x.size(2), num_tags=NUM_TAGS)
|
43 |
+
model.load_state_dict(torch.load(model_path))
|
44 |
model.eval()
|
45 |
with torch.no_grad():
|
46 |
preds = model.decode(x)
|
space/space/space/space/space/space/space/space/space/space/space/space/space/st.py
CHANGED
@@ -1,98 +1,7 @@
|
|
1 |
import streamlit as st
|
2 |
-
import pandas as pd
|
3 |
-
import plotly.graph_objects as go
|
4 |
-
from results.output import training_log, report_dict, report_dict_2, model_compare, data_compare
|
5 |
|
6 |
-
|
7 |
-
|
8 |
-
layout="wide",
|
9 |
-
initial_sidebar_state="expanded"
|
10 |
-
)
|
11 |
|
12 |
-
#
|
13 |
-
|
14 |
-
# 1️⃣ Loss
|
15 |
-
fig_loss = go.Figure()
|
16 |
-
fig_loss.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["train_loss"],
|
17 |
-
mode='lines+markers', name='Train Loss'))
|
18 |
-
fig_loss.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["val_loss"],
|
19 |
-
mode='lines+markers', name='Val Loss'))
|
20 |
-
fig_loss.update_layout(title="Loss Curve", xaxis_title="Epoch", yaxis_title="Loss")
|
21 |
-
|
22 |
-
# 2️⃣ F1-Score
|
23 |
-
fig_f1 = go.Figure()
|
24 |
-
fig_f1.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["train_f1"],
|
25 |
-
mode='lines+markers', name='Train F1'))
|
26 |
-
fig_f1.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["val_f1"],
|
27 |
-
mode='lines+markers', name='Val F1'))
|
28 |
-
fig_f1.update_layout(title="F1-Score Curve", xaxis_title="Epoch", yaxis_title="F1-Score")
|
29 |
-
|
30 |
-
# 3️⃣ Classification Report Table & Bar
|
31 |
-
labels = [k for k in report_dict.keys() if k not in ["accuracy", "macro avg", "weighted avg"]]
|
32 |
-
report_data = [[lbl,
|
33 |
-
report_dict[lbl]["precision"],
|
34 |
-
report_dict[lbl]["recall"],
|
35 |
-
report_dict[lbl]["f1-score"]]
|
36 |
-
for lbl in labels]
|
37 |
-
df_report = pd.DataFrame(report_data,
|
38 |
-
columns=["Label", "Precision", "Recall", "F1-Score"])
|
39 |
-
|
40 |
-
fig_report = go.Figure()
|
41 |
-
for col in ["Precision", "Recall", "F1-Score"]:
|
42 |
-
fig_report.add_trace(go.Bar(x=df_report["Label"], y=df_report[col], name=col))
|
43 |
-
fig_report.update_layout(barmode='group',
|
44 |
-
title="Class Report Metrics of PhoBert + CRF",
|
45 |
-
xaxis_title="Label", yaxis_title="Score",
|
46 |
-
yaxis=dict(range=[0,1.0]))
|
47 |
-
|
48 |
-
labels2 = [k for k in report_dict_2.keys() if k not in ["accuracy", "macro avg", "weighted avg"]]
|
49 |
-
report_data2 = [[lbl,
|
50 |
-
report_dict_2[lbl]["precision"],
|
51 |
-
report_dict_2[lbl]["recall"],
|
52 |
-
report_dict_2[lbl]["f1-score"]]
|
53 |
-
for lbl in labels2]
|
54 |
-
df_report2 = pd.DataFrame(report_data2,
|
55 |
-
columns=["Label", "Precision", "Recall", "F1-Score"])
|
56 |
-
|
57 |
-
fig_report2 = go.Figure()
|
58 |
-
for col in ["Precision", "Recall", "F1-Score"]:
|
59 |
-
fig_report2.add_trace(go.Bar(x=df_report2["Label"], y=df_report2[col], name=col))
|
60 |
-
fig_report2.update_layout(barmode='group',
|
61 |
-
title="Class Report Metrics of PhoBert + Softmax",
|
62 |
-
xaxis_title="Label", yaxis_title="Score",
|
63 |
-
yaxis=dict(range=[0,1.0]))
|
64 |
-
|
65 |
-
# 4️⃣ Model & Data Comparison Tables
|
66 |
-
df_model = pd.DataFrame(
|
67 |
-
[[m, v["F1"], v["Accuracy"]] for m, v in model_compare["Data"].items()],
|
68 |
-
columns=["Model", "F1-Score", "Accuracy"]
|
69 |
-
)
|
70 |
-
df_data = pd.DataFrame(
|
71 |
-
[[s, f1] for s, f1 in data_compare["Data"].items()],
|
72 |
-
columns=["Preprocessing", "F1-Score"]
|
73 |
-
)
|
74 |
-
|
75 |
-
# ==== LAYOUT RAO GỌN VỚI COLUMNS ====
|
76 |
-
|
77 |
-
# Row 1: Loss | F1
|
78 |
-
col1, col2 = st.columns(2)
|
79 |
-
with col1:
|
80 |
-
st.plotly_chart(fig_loss, use_container_width=True)
|
81 |
-
with col2:
|
82 |
-
st.plotly_chart(fig_f1, use_container_width=True)
|
83 |
-
|
84 |
-
# Row 2: Class Report Table | Bar Chart
|
85 |
-
col3, col4 = st.columns(2)
|
86 |
-
with col3:
|
87 |
-
st.plotly_chart(fig_report2, use_container_width=True)
|
88 |
-
with col4:
|
89 |
-
st.plotly_chart(fig_report, use_container_width=True)
|
90 |
-
|
91 |
-
# Row 3: Model Compare | Data Compare
|
92 |
-
col5, col6 = st.columns(2)
|
93 |
-
with col5:
|
94 |
-
st.markdown("**Model Comparison**")
|
95 |
-
st.dataframe(df_model, use_container_width=True)
|
96 |
-
with col6:
|
97 |
-
st.markdown("**Data Preprocessing Comparison**")
|
98 |
-
st.dataframe(df_data, use_container_width=True)
|
|
|
1 |
import streamlit as st
|
|
|
|
|
|
|
2 |
|
3 |
+
# Load ảnh từ file local
|
4 |
+
st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/W%26B%20Chart%206_18_2025%2C%207_23_58%20PM.png", caption="Ảnh minh hoạ", use_column_width=True)
|
|
|
|
|
|
|
5 |
|
6 |
+
# Load ảnh từ URL
|
7 |
+
st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/W%26B%20Chart%206_18_2025%2C%207_24_20%20PM.png", caption="Ảnh từ URL", use_column_width=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
space/space/space/space/space/space/space/space/space/space/space/src/app.py
CHANGED
@@ -16,14 +16,27 @@ tab1, tab2, tab3 = st.tabs(["📊 Phân tích dữ liệu", "📈 Kết quả hu
|
|
16 |
|
17 |
# --- Tab 1: PHÂN TÍCH DỮ LIỆU ---
|
18 |
with tab1:
|
19 |
-
st.
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
"
|
24 |
-
|
25 |
-
|
26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
|
28 |
# --- Tab 2: KẾT QUẢ HUẤN LUYỆN ---
|
29 |
with tab2:
|
|
|
16 |
|
17 |
# --- Tab 1: PHÂN TÍCH DỮ LIỆU ---
|
18 |
with tab1:
|
19 |
+
col1, col2 = st.columns(2)
|
20 |
+
|
21 |
+
# ==== Distribution of NER Label Frequency ====
|
22 |
+
with col1:
|
23 |
+
st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ner_freq.png")
|
24 |
+
|
25 |
+
# ==== Distribution of NER Label Frequency (Add crawled data) ====
|
26 |
+
with col2:
|
27 |
+
st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ner_freq_add.png")
|
28 |
+
|
29 |
+
# ==== Distribution of the Number of Entities per Sentence (0 to 15+) ====
|
30 |
+
with col1:
|
31 |
+
st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ent_dis.png")
|
32 |
+
|
33 |
+
# ==== Distribution of Sentence Lengths ====
|
34 |
+
with col2:
|
35 |
+
st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/sent_len.png")
|
36 |
+
|
37 |
+
# ==== Distribution of Token Lengths ====
|
38 |
+
with col1:
|
39 |
+
st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/token_len.png")
|
40 |
|
41 |
# --- Tab 2: KẾT QUẢ HUẤN LUYỆN ---
|
42 |
with tab2:
|
space/space/space/space/space/space/space/space/space/space/space/st.py
CHANGED
@@ -1,7 +1,23 @@
|
|
1 |
import streamlit as st
|
2 |
|
3 |
-
|
4 |
-
st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/W%26B%20Chart%206_18_2025%2C%207_23_58%20PM.png", caption="Ảnh minh hoạ", use_column_width=True)
|
5 |
|
6 |
-
#
|
7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import streamlit as st
|
2 |
|
3 |
+
col1, col2 = st.columns(2)
|
|
|
4 |
|
5 |
+
# ==== Distribution of NER Label Frequency ====
|
6 |
+
with col1:
|
7 |
+
st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ner_freq.png")
|
8 |
+
|
9 |
+
# ==== Distribution of NER Label Frequency (Add crawled data) ====
|
10 |
+
with col2:
|
11 |
+
st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ner_freq_add.png")
|
12 |
+
|
13 |
+
# ==== Distribution of the Number of Entities per Sentence (0 to 15+) ====
|
14 |
+
with col1:
|
15 |
+
st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ent_dis.png")
|
16 |
+
|
17 |
+
# ==== Distribution of Sentence Lengths ====
|
18 |
+
with col2:
|
19 |
+
st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/sent_len.png")
|
20 |
+
|
21 |
+
# ==== Distribution of Token Lengths ====
|
22 |
+
with col1:
|
23 |
+
st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/token_len.png")
|
space/space/space/space/space/space/space/space/src/app.py
CHANGED
@@ -8,52 +8,57 @@ from results.output import training_log, report_dict, report_dict_2, model_compa
|
|
8 |
|
9 |
st.set_page_config(page_title="Vietnamese NER", layout="wide")
|
10 |
|
11 |
-
# =====
|
12 |
-
st.title("🔍
|
13 |
|
14 |
# Tabs
|
15 |
-
tab1, tab2, tab3 = st.tabs(["📊
|
16 |
|
17 |
-
# --- Tab 1:
|
18 |
with tab1:
|
19 |
col1, col2 = st.columns(2)
|
20 |
|
21 |
# ==== Distribution of NER Label Frequency ====
|
22 |
with col1:
|
23 |
-
st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ner_freq.png"
|
|
|
24 |
|
25 |
# ==== Distribution of NER Label Frequency (Add crawled data) ====
|
26 |
with col2:
|
27 |
-
st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ner_freq_add.png"
|
|
|
28 |
|
29 |
# ==== Distribution of the Number of Entities per Sentence (0 to 15+) ====
|
30 |
with col1:
|
31 |
-
st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ent_dis.png"
|
|
|
32 |
|
33 |
# ==== Distribution of Sentence Lengths ====
|
34 |
with col2:
|
35 |
-
st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/sent_len.png"
|
|
|
36 |
|
37 |
# ==== Distribution of Token Lengths ====
|
38 |
with col1:
|
39 |
-
st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/token_len.png"
|
|
|
40 |
|
41 |
-
# --- Tab 2:
|
42 |
with tab2:
|
43 |
st.set_page_config(
|
44 |
-
page_title="
|
45 |
layout="wide",
|
46 |
initial_sidebar_state="expanded"
|
47 |
)
|
48 |
|
49 |
-
# ====
|
50 |
|
51 |
# 1️⃣ Loss
|
52 |
fig_loss = go.Figure()
|
53 |
fig_loss.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["train_loss"],
|
54 |
-
|
55 |
fig_loss.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["val_loss"],
|
56 |
-
|
57 |
fig_loss.update_layout(title="Loss Curve", xaxis_title="Epoch", yaxis_title="Loss")
|
58 |
|
59 |
# 2️⃣ F1-Score
|
@@ -61,7 +66,7 @@ with tab2:
|
|
61 |
fig_f1.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["train_f1"],
|
62 |
mode='lines+markers', name='Train F1'))
|
63 |
fig_f1.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["val_f1"],
|
64 |
-
mode='lines+markers', name='
|
65 |
fig_f1.update_layout(title="F1-Score Curve", xaxis_title="Epoch", yaxis_title="F1-Score")
|
66 |
|
67 |
# 3️⃣ Classification Report Table & Bar
|
@@ -70,34 +75,34 @@ with tab2:
|
|
70 |
report_dict[lbl]["precision"],
|
71 |
report_dict[lbl]["recall"],
|
72 |
report_dict[lbl]["f1-score"]]
|
73 |
-
|
74 |
df_report = pd.DataFrame(report_data,
|
75 |
-
|
76 |
|
77 |
fig_report = go.Figure()
|
78 |
for col in ["Precision", "Recall", "F1-Score"]:
|
79 |
fig_report.add_trace(go.Bar(x=df_report["Label"], y=df_report[col], name=col))
|
80 |
fig_report.update_layout(barmode='group',
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
|
85 |
labels2 = [k for k in report_dict_2.keys() if k not in ["accuracy", "macro avg", "weighted avg"]]
|
86 |
report_data2 = [[lbl,
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
for lbl in labels2]
|
91 |
df_report2 = pd.DataFrame(report_data2,
|
92 |
-
|
93 |
|
94 |
fig_report2 = go.Figure()
|
95 |
for col in ["Precision", "Recall", "F1-Score"]:
|
96 |
fig_report2.add_trace(go.Bar(x=df_report2["Label"], y=df_report2[col], name=col))
|
97 |
fig_report2.update_layout(barmode='group',
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
|
102 |
# 4️⃣ Model & Data Comparison Tables
|
103 |
df_model = pd.DataFrame(
|
@@ -109,7 +114,7 @@ with tab2:
|
|
109 |
columns=["Preprocessing", "F1-Score"]
|
110 |
)
|
111 |
|
112 |
-
# ==== LAYOUT
|
113 |
|
114 |
# Row 1: Loss | F1
|
115 |
col1, col2 = st.columns(2)
|
@@ -134,26 +139,26 @@ with tab2:
|
|
134 |
st.markdown("**Data Preprocessing Comparison**")
|
135 |
st.dataframe(df_data, use_container_width=True)
|
136 |
|
137 |
-
# --- Tab 3: DEMO
|
138 |
with tab3:
|
139 |
-
st.header("🧪 Vietnamese Named Entity Recognition")
|
140 |
|
141 |
-
text = st.text_input("
|
142 |
|
143 |
-
if st.button("
|
144 |
if not text.strip():
|
145 |
-
st.warning("
|
146 |
else:
|
147 |
tokens, labels = predict_demo(text)
|
148 |
|
149 |
-
st.subheader("
|
150 |
entities = [(tok, lab) for tok, lab in zip(tokens, labels) if lab != "O"]
|
151 |
|
152 |
if entities:
|
153 |
for tok, lab in entities:
|
154 |
st.markdown(f"🔹 **{tok}** — *{lab}*")
|
155 |
else:
|
156 |
-
st.info("
|
157 |
|
158 |
-
st.subheader("
|
159 |
st.markdown(render_html(tokens, labels), unsafe_allow_html=True)
|
|
|
8 |
|
9 |
st.set_page_config(page_title="Vietnamese NER", layout="wide")
|
10 |
|
11 |
+
# ===== Main Title =====
|
12 |
+
st.title("🔍 Vietnamese Named Entity Recognition (NER) Application")
|
13 |
|
14 |
# Tabs
|
15 |
+
tab1, tab2, tab3 = st.tabs(["📊 Data Analysis", "📈 Training Results", "🧪 Model Demo"])
|
16 |
|
17 |
+
# --- Tab 1: DATA ANALYSIS ---
|
18 |
with tab1:
|
19 |
col1, col2 = st.columns(2)
|
20 |
|
21 |
# ==== Distribution of NER Label Frequency ====
|
22 |
with col1:
|
23 |
+
st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ner_freq.png",
|
24 |
+
caption="NER Label Frequency Distribution")
|
25 |
|
26 |
# ==== Distribution of NER Label Frequency (Add crawled data) ====
|
27 |
with col2:
|
28 |
+
st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ner_freq_add.png",
|
29 |
+
caption="NER Label Frequency (Extended with Crawled Data)")
|
30 |
|
31 |
# ==== Distribution of the Number of Entities per Sentence (0 to 15+) ====
|
32 |
with col1:
|
33 |
+
st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ent_dis.png",
|
34 |
+
caption="Number of Entities per Sentence")
|
35 |
|
36 |
# ==== Distribution of Sentence Lengths ====
|
37 |
with col2:
|
38 |
+
st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/sent_len.png",
|
39 |
+
caption="Sentence Length Distribution")
|
40 |
|
41 |
# ==== Distribution of Token Lengths ====
|
42 |
with col1:
|
43 |
+
st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/token_len.png",
|
44 |
+
caption="Token Length Distribution")
|
45 |
|
46 |
+
# --- Tab 2: TRAINING RESULTS ---
|
47 |
with tab2:
|
48 |
st.set_page_config(
|
49 |
+
page_title="Vietnamese NER",
|
50 |
layout="wide",
|
51 |
initial_sidebar_state="expanded"
|
52 |
)
|
53 |
|
54 |
+
# ==== CREATE FIGURES ====
|
55 |
|
56 |
# 1️⃣ Loss
|
57 |
fig_loss = go.Figure()
|
58 |
fig_loss.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["train_loss"],
|
59 |
+
mode='lines+markers', name='Train Loss'))
|
60 |
fig_loss.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["val_loss"],
|
61 |
+
mode='lines+markers', name='Validation Loss'))
|
62 |
fig_loss.update_layout(title="Loss Curve", xaxis_title="Epoch", yaxis_title="Loss")
|
63 |
|
64 |
# 2️⃣ F1-Score
|
|
|
66 |
fig_f1.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["train_f1"],
|
67 |
mode='lines+markers', name='Train F1'))
|
68 |
fig_f1.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["val_f1"],
|
69 |
+
mode='lines+markers', name='Validation F1'))
|
70 |
fig_f1.update_layout(title="F1-Score Curve", xaxis_title="Epoch", yaxis_title="F1-Score")
|
71 |
|
72 |
# 3️⃣ Classification Report Table & Bar
|
|
|
75 |
report_dict[lbl]["precision"],
|
76 |
report_dict[lbl]["recall"],
|
77 |
report_dict[lbl]["f1-score"]]
|
78 |
+
for lbl in labels]
|
79 |
df_report = pd.DataFrame(report_data,
|
80 |
+
columns=["Label", "Precision", "Recall", "F1-Score"])
|
81 |
|
82 |
fig_report = go.Figure()
|
83 |
for col in ["Precision", "Recall", "F1-Score"]:
|
84 |
fig_report.add_trace(go.Bar(x=df_report["Label"], y=df_report[col], name=col))
|
85 |
fig_report.update_layout(barmode='group',
|
86 |
+
title="Class Metrics: PhoBERT + CRF",
|
87 |
+
xaxis_title="Label", yaxis_title="Score",
|
88 |
+
yaxis=dict(range=[0, 1.0]))
|
89 |
|
90 |
labels2 = [k for k in report_dict_2.keys() if k not in ["accuracy", "macro avg", "weighted avg"]]
|
91 |
report_data2 = [[lbl,
|
92 |
+
report_dict_2[lbl]["precision"],
|
93 |
+
report_dict_2[lbl]["recall"],
|
94 |
+
report_dict_2[lbl]["f1-score"]]
|
95 |
for lbl in labels2]
|
96 |
df_report2 = pd.DataFrame(report_data2,
|
97 |
+
columns=["Label", "Precision", "Recall", "F1-Score"])
|
98 |
|
99 |
fig_report2 = go.Figure()
|
100 |
for col in ["Precision", "Recall", "F1-Score"]:
|
101 |
fig_report2.add_trace(go.Bar(x=df_report2["Label"], y=df_report2[col], name=col))
|
102 |
fig_report2.update_layout(barmode='group',
|
103 |
+
title="Class Metrics: PhoBERT + Softmax",
|
104 |
+
xaxis_title="Label", yaxis_title="Score",
|
105 |
+
yaxis=dict(range=[0, 1.0]))
|
106 |
|
107 |
# 4️⃣ Model & Data Comparison Tables
|
108 |
df_model = pd.DataFrame(
|
|
|
114 |
columns=["Preprocessing", "F1-Score"]
|
115 |
)
|
116 |
|
117 |
+
# ==== CLEAN LAYOUT WITH COLUMNS ====
|
118 |
|
119 |
# Row 1: Loss | F1
|
120 |
col1, col2 = st.columns(2)
|
|
|
139 |
st.markdown("**Data Preprocessing Comparison**")
|
140 |
st.dataframe(df_data, use_container_width=True)
|
141 |
|
142 |
+
# --- Tab 3: MODEL DEMO ---
|
143 |
with tab3:
|
144 |
+
st.header("🧪 Vietnamese Named Entity Recognition Demo")
|
145 |
|
146 |
+
text = st.text_input("Enter Vietnamese text:", "Nguyễn Văn A đang làm việc tại Hà Nội")
|
147 |
|
148 |
+
if st.button("Analyze"):
|
149 |
if not text.strip():
|
150 |
+
st.warning("Please enter some text!")
|
151 |
else:
|
152 |
tokens, labels = predict_demo(text)
|
153 |
|
154 |
+
st.subheader("Detected Entities")
|
155 |
entities = [(tok, lab) for tok, lab in zip(tokens, labels) if lab != "O"]
|
156 |
|
157 |
if entities:
|
158 |
for tok, lab in entities:
|
159 |
st.markdown(f"🔹 **{tok}** — *{lab}*")
|
160 |
else:
|
161 |
+
st.info("No named entities detected.")
|
162 |
|
163 |
+
st.subheader("Highlighted Text")
|
164 |
st.markdown(render_html(tokens, labels), unsafe_allow_html=True)
|
space/space/space/space/space/space/space/src/app.py
CHANGED
@@ -9,7 +9,7 @@ from results.output import training_log, report_dict, report_dict_2, model_compa
|
|
9 |
st.set_page_config(page_title="Vietnamese NER", layout="wide")
|
10 |
|
11 |
# ===== Main Title =====
|
12 |
-
st.title("🔍 Vietnamese Named Entity Recognition
|
13 |
|
14 |
# Tabs
|
15 |
tab1, tab2, tab3 = st.tabs(["📊 Data Analysis", "📈 Training Results", "🧪 Model Demo"])
|
@@ -20,28 +20,23 @@ with tab1:
|
|
20 |
|
21 |
# ==== Distribution of NER Label Frequency ====
|
22 |
with col1:
|
23 |
-
st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ner_freq.png"
|
24 |
-
caption="NER Label Frequency Distribution")
|
25 |
|
26 |
# ==== Distribution of NER Label Frequency (Add crawled data) ====
|
27 |
with col2:
|
28 |
-
st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ner_freq_add.png"
|
29 |
-
caption="NER Label Frequency (Extended with Crawled Data)")
|
30 |
|
31 |
# ==== Distribution of the Number of Entities per Sentence (0 to 15+) ====
|
32 |
with col1:
|
33 |
-
st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ent_dis.png"
|
34 |
-
caption="Number of Entities per Sentence")
|
35 |
|
36 |
# ==== Distribution of Sentence Lengths ====
|
37 |
with col2:
|
38 |
-
st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/sent_len.png"
|
39 |
-
caption="Sentence Length Distribution")
|
40 |
|
41 |
# ==== Distribution of Token Lengths ====
|
42 |
with col1:
|
43 |
-
st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/token_len.png"
|
44 |
-
caption="Token Length Distribution")
|
45 |
|
46 |
# --- Tab 2: TRAINING RESULTS ---
|
47 |
with tab2:
|
@@ -141,8 +136,6 @@ with tab2:
|
|
141 |
|
142 |
# --- Tab 3: MODEL DEMO ---
|
143 |
with tab3:
|
144 |
-
st.header("🧪 Vietnamese Named Entity Recognition Demo")
|
145 |
-
|
146 |
text = st.text_input("Enter Vietnamese text:", "Nguyễn Văn A đang làm việc tại Hà Nội")
|
147 |
|
148 |
if st.button("Analyze"):
|
|
|
9 |
st.set_page_config(page_title="Vietnamese NER", layout="wide")
|
10 |
|
11 |
# ===== Main Title =====
|
12 |
+
st.title("🔍 Vietnamese Named Entity Recognition Demo")
|
13 |
|
14 |
# Tabs
|
15 |
tab1, tab2, tab3 = st.tabs(["📊 Data Analysis", "📈 Training Results", "🧪 Model Demo"])
|
|
|
20 |
|
21 |
# ==== Distribution of NER Label Frequency ====
|
22 |
with col1:
|
23 |
+
st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ner_freq.png")
|
|
|
24 |
|
25 |
# ==== Distribution of NER Label Frequency (Add crawled data) ====
|
26 |
with col2:
|
27 |
+
st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ner_freq_add.png")
|
|
|
28 |
|
29 |
# ==== Distribution of the Number of Entities per Sentence (0 to 15+) ====
|
30 |
with col1:
|
31 |
+
st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ent_dis.png")
|
|
|
32 |
|
33 |
# ==== Distribution of Sentence Lengths ====
|
34 |
with col2:
|
35 |
+
st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/sent_len.png")
|
|
|
36 |
|
37 |
# ==== Distribution of Token Lengths ====
|
38 |
with col1:
|
39 |
+
st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/token_len.png")
|
|
|
40 |
|
41 |
# --- Tab 2: TRAINING RESULTS ---
|
42 |
with tab2:
|
|
|
136 |
|
137 |
# --- Tab 3: MODEL DEMO ---
|
138 |
with tab3:
|
|
|
|
|
139 |
text = st.text_input("Enter Vietnamese text:", "Nguyễn Văn A đang làm việc tại Hà Nội")
|
140 |
|
141 |
if st.button("Analyze"):
|
space/space/space/space/space/src/app.py
CHANGED
@@ -1,157 +0,0 @@
|
|
1 |
-
import streamlit as st
|
2 |
-
import pandas as pd
|
3 |
-
import plotly.graph_objects as go
|
4 |
-
|
5 |
-
from src.predict import predict_demo
|
6 |
-
from src.front import render_html
|
7 |
-
from results.output import training_log, report_dict, report_dict_2, model_compare, data_compare
|
8 |
-
|
9 |
-
st.set_page_config(page_title="Vietnamese NER", layout="wide")
|
10 |
-
|
11 |
-
# ===== Main Title =====
|
12 |
-
st.title("🔍 Vietnamese Named Entity Recognition Demo")
|
13 |
-
|
14 |
-
# Tabs
|
15 |
-
tab1, tab2, tab3 = st.tabs(["📊 Data Analysis", "📈 Training Results", "🧪 Model Demo"])
|
16 |
-
|
17 |
-
# --- Tab 1: DATA ANALYSIS ---
|
18 |
-
with tab1:
|
19 |
-
col1, col2 = st.columns(2)
|
20 |
-
|
21 |
-
# ==== Distribution of NER Label Frequency ====
|
22 |
-
with col1:
|
23 |
-
st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ner_freq.png")
|
24 |
-
|
25 |
-
# ==== Distribution of NER Label Frequency (Add crawled data) ====
|
26 |
-
with col2:
|
27 |
-
st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ner_freq_add.png")
|
28 |
-
|
29 |
-
# ==== Distribution of the Number of Entities per Sentence (0 to 15+) ====
|
30 |
-
with col1:
|
31 |
-
st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ent_dis.png")
|
32 |
-
|
33 |
-
# ==== Distribution of Sentence Lengths ====
|
34 |
-
with col2:
|
35 |
-
st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/sent_len.png")
|
36 |
-
|
37 |
-
# ==== Distribution of Token Lengths ====
|
38 |
-
with col1:
|
39 |
-
st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/token_len.png")
|
40 |
-
|
41 |
-
# --- Tab 2: TRAINING RESULTS ---
|
42 |
-
with tab2:
|
43 |
-
st.set_page_config(
|
44 |
-
page_title="Vietnamese NER",
|
45 |
-
layout="wide",
|
46 |
-
initial_sidebar_state="expanded"
|
47 |
-
)
|
48 |
-
|
49 |
-
# ==== CREATE FIGURES ====
|
50 |
-
|
51 |
-
# 1️⃣ Loss
|
52 |
-
fig_loss = go.Figure()
|
53 |
-
fig_loss.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["train_loss"],
|
54 |
-
mode='lines+markers', name='Train Loss'))
|
55 |
-
fig_loss.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["val_loss"],
|
56 |
-
mode='lines+markers', name='Validation Loss'))
|
57 |
-
fig_loss.update_layout(title="Loss Curve", xaxis_title="Epoch", yaxis_title="Loss")
|
58 |
-
|
59 |
-
# 2️⃣ F1-Score
|
60 |
-
fig_f1 = go.Figure()
|
61 |
-
fig_f1.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["train_f1"],
|
62 |
-
mode='lines+markers', name='Train F1'))
|
63 |
-
fig_f1.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["val_f1"],
|
64 |
-
mode='lines+markers', name='Validation F1'))
|
65 |
-
fig_f1.update_layout(title="F1-Score Curve", xaxis_title="Epoch", yaxis_title="F1-Score")
|
66 |
-
|
67 |
-
# 3️⃣ Classification Report Table & Bar
|
68 |
-
labels = [k for k in report_dict.keys() if k not in ["accuracy", "macro avg", "weighted avg"]]
|
69 |
-
report_data = [[lbl,
|
70 |
-
report_dict[lbl]["precision"],
|
71 |
-
report_dict[lbl]["recall"],
|
72 |
-
report_dict[lbl]["f1-score"]]
|
73 |
-
for lbl in labels]
|
74 |
-
df_report = pd.DataFrame(report_data,
|
75 |
-
columns=["Label", "Precision", "Recall", "F1-Score"])
|
76 |
-
|
77 |
-
fig_report = go.Figure()
|
78 |
-
for col in ["Precision", "Recall", "F1-Score"]:
|
79 |
-
fig_report.add_trace(go.Bar(x=df_report["Label"], y=df_report[col], name=col))
|
80 |
-
fig_report.update_layout(barmode='group',
|
81 |
-
title="Class Metrics: PhoBERT + CRF",
|
82 |
-
xaxis_title="Label", yaxis_title="Score",
|
83 |
-
yaxis=dict(range=[0, 1.0]))
|
84 |
-
|
85 |
-
labels2 = [k for k in report_dict_2.keys() if k not in ["accuracy", "macro avg", "weighted avg"]]
|
86 |
-
report_data2 = [[lbl,
|
87 |
-
report_dict_2[lbl]["precision"],
|
88 |
-
report_dict_2[lbl]["recall"],
|
89 |
-
report_dict_2[lbl]["f1-score"]]
|
90 |
-
for lbl in labels2]
|
91 |
-
df_report2 = pd.DataFrame(report_data2,
|
92 |
-
columns=["Label", "Precision", "Recall", "F1-Score"])
|
93 |
-
|
94 |
-
fig_report2 = go.Figure()
|
95 |
-
for col in ["Precision", "Recall", "F1-Score"]:
|
96 |
-
fig_report2.add_trace(go.Bar(x=df_report2["Label"], y=df_report2[col], name=col))
|
97 |
-
fig_report2.update_layout(barmode='group',
|
98 |
-
title="Class Metrics: PhoBERT + Softmax",
|
99 |
-
xaxis_title="Label", yaxis_title="Score",
|
100 |
-
yaxis=dict(range=[0, 1.0]))
|
101 |
-
|
102 |
-
# 4️⃣ Model & Data Comparison Tables
|
103 |
-
df_model = pd.DataFrame(
|
104 |
-
[[m, v["F1"], v["Accuracy"]] for m, v in model_compare["Data"].items()],
|
105 |
-
columns=["Model", "F1-Score", "Accuracy"]
|
106 |
-
)
|
107 |
-
df_data = pd.DataFrame(
|
108 |
-
[[s, f1] for s, f1 in data_compare["Data"].items()],
|
109 |
-
columns=["Preprocessing", "F1-Score"]
|
110 |
-
)
|
111 |
-
|
112 |
-
# ==== CLEAN LAYOUT WITH COLUMNS ====
|
113 |
-
|
114 |
-
# Row 1: Loss | F1
|
115 |
-
col1, col2 = st.columns(2)
|
116 |
-
with col1:
|
117 |
-
st.plotly_chart(fig_loss, use_container_width=True)
|
118 |
-
with col2:
|
119 |
-
st.plotly_chart(fig_f1, use_container_width=True)
|
120 |
-
|
121 |
-
# Row 2: Class Report Table | Bar Chart
|
122 |
-
col3, col4 = st.columns(2)
|
123 |
-
with col3:
|
124 |
-
st.plotly_chart(fig_report2, use_container_width=True)
|
125 |
-
with col4:
|
126 |
-
st.plotly_chart(fig_report, use_container_width=True)
|
127 |
-
|
128 |
-
# Row 3: Model Compare | Data Compare
|
129 |
-
col5, col6 = st.columns(2)
|
130 |
-
with col5:
|
131 |
-
st.markdown("**Model Comparison**")
|
132 |
-
st.dataframe(df_model, use_container_width=True)
|
133 |
-
with col6:
|
134 |
-
st.markdown("**Data Preprocessing Comparison**")
|
135 |
-
st.dataframe(df_data, use_container_width=True)
|
136 |
-
|
137 |
-
# --- Tab 3: MODEL DEMO ---
|
138 |
-
with tab3:
|
139 |
-
text = st.text_input("Enter Vietnamese text:", "Nguyễn Văn A đang làm việc tại Hà Nội")
|
140 |
-
|
141 |
-
if st.button("Analyze"):
|
142 |
-
if not text.strip():
|
143 |
-
st.warning("Please enter some text!")
|
144 |
-
else:
|
145 |
-
tokens, labels = predict_demo(text)
|
146 |
-
|
147 |
-
st.subheader("Detected Entities")
|
148 |
-
entities = [(tok, lab) for tok, lab in zip(tokens, labels) if lab != "O"]
|
149 |
-
|
150 |
-
if entities:
|
151 |
-
for tok, lab in entities:
|
152 |
-
st.markdown(f"🔹 **{tok}** — *{lab}*")
|
153 |
-
else:
|
154 |
-
st.info("No named entities detected.")
|
155 |
-
|
156 |
-
st.subheader("Highlighted Text")
|
157 |
-
st.markdown(render_html(tokens, labels), unsafe_allow_html=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
space/space/space/space/space/src/predict.py
CHANGED
@@ -1,8 +1,6 @@
|
|
1 |
import torch
|
2 |
-
from
|
3 |
-
from
|
4 |
-
import os
|
5 |
-
|
6 |
def predict(model, loader, count_loss=True):
|
7 |
|
8 |
model.eval() # Evaluation Mode, Ignore Dropout, BatchNorm, ...
|
@@ -30,9 +28,6 @@ def predict(model, loader, count_loss=True):
|
|
30 |
|
31 |
def predict_demo(text):
|
32 |
|
33 |
-
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
34 |
-
model_path = os.path.join(BASE_DIR, "models", "best_epoch_16.pt")
|
35 |
-
|
36 |
|
37 |
id_tag = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}
|
38 |
|
@@ -40,7 +35,7 @@ def predict_demo(text):
|
|
40 |
NUM_TAGS = 7
|
41 |
|
42 |
model = CRF_Tagger(input_dim=x.size(2), num_tags=NUM_TAGS)
|
43 |
-
model.load_state_dict(torch.load(
|
44 |
model.eval()
|
45 |
with torch.no_grad():
|
46 |
preds = model.decode(x)
|
|
|
1 |
import torch
|
2 |
+
from model import CRF_Tagger
|
3 |
+
from preprocessing import process_demo_sentence
|
|
|
|
|
4 |
def predict(model, loader, count_loss=True):
|
5 |
|
6 |
model.eval() # Evaluation Mode, Ignore Dropout, BatchNorm, ...
|
|
|
28 |
|
29 |
def predict_demo(text):
|
30 |
|
|
|
|
|
|
|
31 |
|
32 |
id_tag = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}
|
33 |
|
|
|
35 |
NUM_TAGS = 7
|
36 |
|
37 |
model = CRF_Tagger(input_dim=x.size(2), num_tags=NUM_TAGS)
|
38 |
+
model.load_state_dict(torch.load("models/best_epoch_16.pt"))
|
39 |
model.eval()
|
40 |
with torch.no_grad():
|
41 |
preds = model.decode(x)
|
space/space/space/space/space/src/preprocessing.py
CHANGED
@@ -3,7 +3,7 @@ import torch
|
|
3 |
from transformers import AutoTokenizer, AutoModel
|
4 |
from tqdm import tqdm
|
5 |
from sklearn.model_selection import train_test_split
|
6 |
-
from
|
7 |
from pyvi import ViTokenizer
|
8 |
|
9 |
def join_tokens(tokens):
|
|
|
3 |
from transformers import AutoTokenizer, AutoModel
|
4 |
from tqdm import tqdm
|
5 |
from sklearn.model_selection import train_test_split
|
6 |
+
from configs import configs
|
7 |
from pyvi import ViTokenizer
|
8 |
|
9 |
def join_tokens(tokens):
|
space/space/space/space/space/src/templates/demo.html
ADDED
@@ -0,0 +1,349 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<html lang="en">
|
2 |
+
<head>
|
3 |
+
<meta charset="utf-8"/>
|
4 |
+
<meta content="width=device-width, initial-scale=1" name="viewport"/>
|
5 |
+
<title>
|
6 |
+
Model Demo
|
7 |
+
</title>
|
8 |
+
<script src="https://cdn.tailwindcss.com">
|
9 |
+
</script>
|
10 |
+
<link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.15.3/css/all.min.css" rel="stylesheet"/>
|
11 |
+
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;600&display=swap" rel="stylesheet"/>
|
12 |
+
<style>
|
13 |
+
body {
|
14 |
+
font-family: "Inter", sans-serif;
|
15 |
+
}
|
16 |
+
</style>
|
17 |
+
</head>
|
18 |
+
<body class="min-h-screen bg-gradient-to-br from-[#e07db7] via-[#e07db7]/40 to-[#f47a2f] flex items-center justify-center p-6">
|
19 |
+
<div class="max-w-6xl w-full rounded-xl flex overflow-hidden drop-shadow-lg bg-gradient-to-br from-[#f7f8fa] to-[#e9ebf0]">
|
20 |
+
<!-- Sidebar -->
|
21 |
+
<aside class="w-48 flex flex-col bg-[#F0F0F5] border-r border-gray-200 select-none">
|
22 |
+
<nav class="flex flex-col mt-6 space-y-1 px-2">
|
23 |
+
<button id="homeBtn" class="flex items-center gap-3 text-xs font-semibold text-[#FF6A00] rounded-md py-3 px-4 border border-[#FF6A00] bg-white shadow-sm hover:bg-[#ff6a0040] transition" type="button">
|
24 |
+
<i class="fas fa-home text-sm"></i>
|
25 |
+
<span data-i18n="home">HOME</span>
|
26 |
+
</button>
|
27 |
+
<button id="historyBtn" class="flex items-center gap-3 text-xs font-semibold text-gray-700 rounded-md py-3 px-4 hover:bg-[#ff6a0040] hover:text-[#FF6A00] transition" type="button">
|
28 |
+
<i class="fas fa-clock text-sm"></i>
|
29 |
+
<span data-i18n="history">DETECT HISTORY</span>
|
30 |
+
</button>
|
31 |
+
<button id="statBtn" class="flex items-center gap-3 text-xs font-semibold text-gray-700 rounded-md py-3 px-4 hover:bg-[#ff6a0040] hover:text-[#FF6A00] transition" type="button">
|
32 |
+
<i class="fas fa-chart-bar text-sm"></i>
|
33 |
+
<span data-i18n="stat">STATISTICS</span>
|
34 |
+
</button>
|
35 |
+
<button id="settingSidebarBtn" class="flex items-center gap-3 text-xs font-semibold text-gray-700 rounded-md py-3 px-4 hover:bg-[#ff6a0040] hover:text-[#FF6A00] transition" type="button">
|
36 |
+
<i class="fas fa-sliders-h text-sm"></i>
|
37 |
+
<span data-i18n="settings">SETTINGS</span>
|
38 |
+
</button>
|
39 |
+
</nav>
|
40 |
+
</aside>
|
41 |
+
<!-- Main content -->
|
42 |
+
<main class="flex-1 flex flex-col p-6">
|
43 |
+
<!-- Topbar: Tabs + Search -->
|
44 |
+
<div class="flex flex-col md:flex-row md:items-center md:justify-between gap-4 md:gap-0 mb-8">
|
45 |
+
<!-- Tabs as segmented control card -->
|
46 |
+
<div aria-label="Main navigation tabs" class="inline-flex rounded-lg bg-white shadow-sm border border-gray-300 overflow-hidden text-xs font-semibold text-gray-600" role="tablist">
|
47 |
+
<button aria-selected="false" class="px-4 py-2 flex items-center gap-1 hover:text-[#FF6A00] transition focus:outline-none" role="tab" tabindex="0" type="button">
|
48 |
+
<img alt="Data Analysis icon, a small colorful bar chart" class="w-4 h-4" height="16" src="https://storage.googleapis.com/a1aa/image/97104fd7-5bb0-41e8-f1d2-7860810595dd.jpg" width="16"/>
|
49 |
+
<span data-i18n="dataAnalysis">Data Analysis</span>
|
50 |
+
</button>
|
51 |
+
<button aria-selected="false" class="px-4 py-2 flex items-center gap-1 hover:text-[#FF6A00] transition focus:outline-none" role="tab" tabindex="-1" type="button">
|
52 |
+
<img alt="Training Results icon, a small line chart with upward trend" class="w-4 h-4" height="16" src="https://storage.googleapis.com/a1aa/image/ac9104af-fd5c-488d-8f07-2e00044563e3.jpg" width="16"/>
|
53 |
+
<span data-i18n="trainingResults">Training Results</span>
|
54 |
+
</button>
|
55 |
+
<button aria-selected="true" class="px-4 py-2 flex items-center gap-1 bg-[#FF6A00] text-white rounded-lg focus:outline-none" role="tab" tabindex="0" type="button">
|
56 |
+
<img alt="Model Demo icon, a small green pencil" class="w-4 h-4" height="16" src="https://storage.googleapis.com/a1aa/image/ee1b8981-47ae-4e2b-3171-3006c09b5080.jpg" width="16"/>
|
57 |
+
<span data-i18n="modelDemo">Model Demo</span>
|
58 |
+
</button>
|
59 |
+
</div>
|
60 |
+
<!-- Search bar on right -->
|
61 |
+
<form aria-label="Search form" class="w-full max-w-xs md:max-w-sm" onsubmit="event.preventDefault()">
|
62 |
+
<label class="sr-only" for="search">
|
63 |
+
Search
|
64 |
+
</label>
|
65 |
+
<div class="relative w-full">
|
66 |
+
<!-- Search icon or input here if needed -->
|
67 |
+
</div>
|
68 |
+
</form>
|
69 |
+
</div>
|
70 |
+
|
71 |
+
<!-- Input type tabs (ẩn khi ở trang Setting) -->
|
72 |
+
<div id="innerTabs" class="flex gap-6 w-full mb-6">
|
73 |
+
<button id="textTabBtn" class="flex-1 bg-white font-semibold text-sm py-3 rounded-xl border border-gray-300 shadow-sm hover:shadow-md transition text-center tab-btn active" type="button">
|
74 |
+
<span data-i18n="enterText">✍️ Enter text</span>
|
75 |
+
</button>
|
76 |
+
<button id="fileTabBtn" class="flex-1 bg-white font-semibold text-sm py-3 rounded-xl border border-gray-300 shadow-sm hover:shadow-md transition text-center tab-btn" type="button">
|
77 |
+
📄
|
78 |
+
<span class="font-bold" data-i18n="uploadFile">
|
79 |
+
Upload .txt or .docx file
|
80 |
+
</span>
|
81 |
+
</button>
|
82 |
+
</div>
|
83 |
+
<!-- Tab contents -->
|
84 |
+
<div id="tabContents" class="relative w-full flex-1">
|
85 |
+
<!-- Text input area -->
|
86 |
+
<form id="textForm" aria-label="Vietnamese text input form" class="w-full bg-white rounded-2xl p-6 shadow-sm border border-gray-200 tab-content transition-all duration-300" onsubmit="event.preventDefault()">
|
87 |
+
<label class="block text-xs font-normal text-gray-600 mb-2" for="vietnamese-text" data-i18n="enterTextLabel">
|
88 |
+
Enter Vietnamese text:
|
89 |
+
</label>
|
90 |
+
<textarea class="w-full rounded-[12px] border border-gray-300 bg-gray-50 text-gray-700 text-sm p-3 mb-2 resize-none shadow-sm focus:outline-none focus:ring-2 focus:ring-[#FF6A00] transition" id="vietnamese-text" maxlength="300" placeholder="Ví dụ: Nguyễn Văn A sinh sống tại TP.HCM" rows="5">Nguyễn Văn A đang làm việc tại Hà Nội</textarea>
|
91 |
+
<div aria-live="polite" class="flex justify-between items-center mb-4 text-xs text-gray-500 select-none">
|
92 |
+
<span id="charCount">
|
93 |
+
Characters: 38 / 300
|
94 |
+
</span>
|
95 |
+
<span id="wordCount">
|
96 |
+
Words: 7
|
97 |
+
</span>
|
98 |
+
</div>
|
99 |
+
<button class="inline-flex items-center gap-2 bg-[#FF6A00] text-white text-xs font-semibold py-2 px-4 rounded-xl shadow-md hover:bg-[#e65a00] disabled:bg-[#ffb380] disabled:cursor-not-allowed transition" type="submit">
|
100 |
+
<i class="fas fa-brain"></i>
|
101 |
+
<span data-i18n="analyze">Analyze</span>
|
102 |
+
</button>
|
103 |
+
<div id="textResult" class="mt-4 text-green-700 font-semibold hidden"></div>
|
104 |
+
</form>
|
105 |
+
<!-- File upload area -->
|
106 |
+
<form id="fileForm" aria-label="File upload form" class="w-full bg-white rounded-2xl p-6 shadow-sm border border-gray-200 tab-content transition-all duration-300 absolute top-0 left-0 opacity-0 pointer-events-none" onsubmit="event.preventDefault()">
|
107 |
+
<label class="block text-xs font-normal text-gray-600 mb-2" for="file-upload" data-i18n="uploadFileLabel">
|
108 |
+
Upload .txt or .docx file:
|
109 |
+
</label>
|
110 |
+
<input id="file-upload" type="file" accept=".txt,.docx" class="mb-4 block"/>
|
111 |
+
<button class="inline-flex items-center gap-2 bg-[#FF6A00] text-white text-xs font-semibold py-2 px-4 rounded-xl shadow-md hover:bg-[#e65a00] disabled:bg-[#ffb380] disabled:cursor-not-allowed transition" type="submit" id="analyzeFileBtn" disabled>
|
112 |
+
<i class="fas fa-brain"></i>
|
113 |
+
<span data-i18n="analyze">Analyze</span>
|
114 |
+
</button>
|
115 |
+
<div id="fileResult" class="mt-4 text-green-700 font-semibold hidden"></div>
|
116 |
+
</form>
|
117 |
+
<!-- Setting area (chỉ hiện khi vào Setting) -->
|
118 |
+
<form id="settingForm" class="w-full bg-white rounded-2xl p-6 shadow-sm border border-gray-200 tab-content transition-all duration-300 absolute top-0 left-0 opacity-0 pointer-events-none" onsubmit="event.preventDefault()">
|
119 |
+
<label class="block text-xs font-normal text-gray-600 mb-2" for="language-select" data-i18n="chooseLanguage">
|
120 |
+
Chọn ngôn ngữ / Select language:
|
121 |
+
</label>
|
122 |
+
<select id="language-select" class="w-full rounded-[12px] border border-gray-300 bg-gray-50 text-gray-700 text-sm p-3 mb-4 shadow-sm focus:outline-none focus:ring-2 focus:ring-[#FF6A00] transition">
|
123 |
+
<option value="vi">Tiếng Việt</option>
|
124 |
+
<option value="en">English</option>
|
125 |
+
<option value="zh">中文</option>
|
126 |
+
<option value="ja">日本語</option>
|
127 |
+
</select>
|
128 |
+
<button class="inline-flex items-center gap-2 bg-[#FF6A00] text-white text-xs font-semibold py-2 px-4 rounded-xl shadow-md hover:bg-[#e65a00] transition" type="submit" id="confirmSettingBtn">
|
129 |
+
<i class="fas fa-check"></i>
|
130 |
+
</button>
|
131 |
+
<div id="settingResult" class="mt-4 text-green-700 font-semibold hidden"></div>
|
132 |
+
</form>
|
133 |
+
</div>
|
134 |
+
</main>
|
135 |
+
<script>
|
136 |
+
const translations = {
|
137 |
+
en: {
|
138 |
+
home: "HOME",
|
139 |
+
history: "DETECT HISTORY",
|
140 |
+
stat: "STATISTICS",
|
141 |
+
settings: "SETTINGS",
|
142 |
+
dataAnalysis: "Data Analysis",
|
143 |
+
trainingResults: "Training Results",
|
144 |
+
modelDemo: "Model Demo",
|
145 |
+
enterText: "✍️ Enter text",
|
146 |
+
uploadFile: "Upload .txt or .docx file",
|
147 |
+
enterTextLabel: "Enter Vietnamese text:",
|
148 |
+
uploadFileLabel: "Upload .txt or .docx file:",
|
149 |
+
chooseLanguage: "Select language:",
|
150 |
+
confirm: "Confirm",
|
151 |
+
analyze: "Analyze"
|
152 |
+
},
|
153 |
+
vi: {
|
154 |
+
home: "TRANG CHỦ",
|
155 |
+
history: "LỊCH SỬ PHÁT HIỆN",
|
156 |
+
stat: "THỐNG KÊ",
|
157 |
+
settings: "CÀI ĐẶT",
|
158 |
+
dataAnalysis: "Phân tích dữ liệu",
|
159 |
+
trainingResults: "Kết quả huấn luyện",
|
160 |
+
modelDemo: "Demo mô hình",
|
161 |
+
enterText: "✍️ Nhập văn bản",
|
162 |
+
uploadFile: "Tải lên file .txt hoặc .docx",
|
163 |
+
enterTextLabel: "Nhập văn bản tiếng Việt:",
|
164 |
+
uploadFileLabel: "Tải lên file .txt hoặc .docx:",
|
165 |
+
chooseLanguage: "Chọn ngôn ngữ:",
|
166 |
+
confirm: "Xác nhận",
|
167 |
+
analyze: "Phân tích"
|
168 |
+
},
|
169 |
+
zh: {
|
170 |
+
home: "主页",
|
171 |
+
history: "检测历史",
|
172 |
+
stat: "统计",
|
173 |
+
settings: "设置",
|
174 |
+
dataAnalysis: "数据分析",
|
175 |
+
trainingResults: "训练结果",
|
176 |
+
modelDemo: "模型演示",
|
177 |
+
enterText: "✍️ 输入文本",
|
178 |
+
uploadFile: "上传 .txt 或 .docx 文件",
|
179 |
+
enterTextLabel: "输入越南语文本:",
|
180 |
+
uploadFileLabel: "上传 .txt 或 .docx 文件:",
|
181 |
+
chooseLanguage: "选择语言:",
|
182 |
+
confirm: "确认",
|
183 |
+
analyze: "分析"
|
184 |
+
},
|
185 |
+
ja: {
|
186 |
+
home: "ホーム",
|
187 |
+
history: "検出履歴",
|
188 |
+
stat: "統計",
|
189 |
+
settings: "設定",
|
190 |
+
dataAnalysis: "データ分析",
|
191 |
+
trainingResults: "トレーニング結果",
|
192 |
+
modelDemo: "モデルデモ",
|
193 |
+
enterText: "✍️ テキスト入力",
|
194 |
+
uploadFile: ".txt または .docx ファイルをアップロード",
|
195 |
+
enterTextLabel: "ベトナム語のテキストを入力:",
|
196 |
+
uploadFileLabel: ".txt または .docx ファイルをアップロード:",
|
197 |
+
chooseLanguage: "言語を選択:",
|
198 |
+
confirm: "確認",
|
199 |
+
analyze: "解析"
|
200 |
+
}
|
201 |
+
};
|
202 |
+
|
203 |
+
let currentLang = 'en';
|
204 |
+
|
205 |
+
function setLanguage(lang) {
|
206 |
+
currentLang = lang;
|
207 |
+
document.querySelectorAll('[data-i18n]').forEach(el => {
|
208 |
+
const key = el.getAttribute('data-i18n');
|
209 |
+
if (translations[lang][key]) {
|
210 |
+
el.textContent = translations[lang][key];
|
211 |
+
}
|
212 |
+
});
|
213 |
+
}
|
214 |
+
|
215 |
+
// Sidebar button logic
|
216 |
+
const homeBtn = document.getElementById('homeBtn');
|
217 |
+
const historyBtn = document.getElementById('historyBtn');
|
218 |
+
const statBtn = document.getElementById('statBtn');
|
219 |
+
const settingSidebarBtn = document.getElementById('settingSidebarBtn');
|
220 |
+
const innerTabs = document.getElementById('innerTabs');
|
221 |
+
const settingForm = document.getElementById('settingForm');
|
222 |
+
const textForm = document.getElementById('textForm');
|
223 |
+
const fileForm = document.getElementById('fileForm');
|
224 |
+
|
225 |
+
function showMainTabs() {
|
226 |
+
innerTabs.style.display = '';
|
227 |
+
textForm.style.position = '';
|
228 |
+
fileForm.style.position = 'absolute';
|
229 |
+
settingForm.classList.add('opacity-0', 'pointer-events-none');
|
230 |
+
settingForm.classList.remove('opacity-100');
|
231 |
+
activateTab('text');
|
232 |
+
}
|
233 |
+
function showSettingTab() {
|
234 |
+
innerTabs.style.display = 'none';
|
235 |
+
textForm.classList.add('opacity-0', 'pointer-events-none');
|
236 |
+
fileForm.classList.add('opacity-0', 'pointer-events-none');
|
237 |
+
settingForm.style.position = '';
|
238 |
+
settingForm.classList.remove('opacity-0', 'pointer-events-none');
|
239 |
+
settingForm.classList.add('opacity-100');
|
240 |
+
}
|
241 |
+
|
242 |
+
// Sidebar events
|
243 |
+
homeBtn.addEventListener('click', showMainTabs);
|
244 |
+
historyBtn.addEventListener('click', showMainTabs);
|
245 |
+
statBtn.addEventListener('click', showMainTabs);
|
246 |
+
settingSidebarBtn.addEventListener('click', showSettingTab);
|
247 |
+
|
248 |
+
// Language change logic
|
249 |
+
document.getElementById('language-select').addEventListener('change', function() {
|
250 |
+
setLanguage(this.value);
|
251 |
+
});
|
252 |
+
document.getElementById('settingForm').addEventListener('submit', function(e) {
|
253 |
+
e.preventDefault();
|
254 |
+
setLanguage(document.getElementById('language-select').value);
|
255 |
+
document.getElementById('settingResult').textContent =
|
256 |
+
currentLang === 'vi' ? '✔️ Đã đổi ngôn ngữ!' :
|
257 |
+
currentLang === 'en' ? '✔️ Language changed!' :
|
258 |
+
currentLang === 'zh' ? '✔️ 语言已更改!' :
|
259 |
+
'✔️ 言語が変更されました!';
|
260 |
+
document.getElementById('settingResult').classList.remove('hidden');
|
261 |
+
});
|
262 |
+
|
263 |
+
// Tab switching logic
|
264 |
+
const textTabBtn = document.getElementById('textTabBtn');
|
265 |
+
const fileTabBtn = document.getElementById('fileTabBtn');
|
266 |
+
|
267 |
+
function activateTab(tab) {
|
268 |
+
if (tab === 'text') {
|
269 |
+
textTabBtn.classList.add('active');
|
270 |
+
fileTabBtn.classList.remove('active');
|
271 |
+
textForm.classList.remove('opacity-0', 'pointer-events-none');
|
272 |
+
textForm.classList.add('opacity-100');
|
273 |
+
fileForm.classList.add('opacity-0', 'pointer-events-none');
|
274 |
+
fileForm.classList.remove('opacity-100');
|
275 |
+
} else {
|
276 |
+
fileTabBtn.classList.add('active');
|
277 |
+
textTabBtn.classList.remove('active');
|
278 |
+
fileForm.classList.remove('opacity-0', 'pointer-events-none');
|
279 |
+
fileForm.classList.add('opacity-100');
|
280 |
+
textForm.classList.add('opacity-0', 'pointer-events-none');
|
281 |
+
textForm.classList.remove('opacity-100');
|
282 |
+
}
|
283 |
+
}
|
284 |
+
textTabBtn.addEventListener('click', () => activateTab('text'));
|
285 |
+
fileTabBtn.addEventListener('click', () => activateTab('file'));
|
286 |
+
|
287 |
+
// Textarea character/word count
|
288 |
+
const textarea = document.getElementById('vietnamese-text');
|
289 |
+
const charCount = document.getElementById('charCount');
|
290 |
+
const wordCount = document.getElementById('wordCount');
|
291 |
+
textarea.addEventListener('input', () => {
|
292 |
+
charCount.textContent = `Characters: ${textarea.value.length} / 300`;
|
293 |
+
wordCount.textContent = `Words: ${textarea.value.trim().split(/\s+/).filter(Boolean).length}`;
|
294 |
+
});
|
295 |
+
|
296 |
+
// Analyze text
|
297 |
+
textForm.addEventListener('submit', () => {
|
298 |
+
const result = document.getElementById('textResult');
|
299 |
+
result.textContent = 'Processing...';
|
300 |
+
result.classList.remove('hidden');
|
301 |
+
|
302 |
+
fetch('http://localhost:5000/predict', {
|
303 |
+
method: 'POST',
|
304 |
+
headers: { 'Content-Type': 'application/json' },
|
305 |
+
body: JSON.stringify({ text: textarea.value })
|
306 |
+
})
|
307 |
+
.then(response => response.json())
|
308 |
+
.then(data => {
|
309 |
+
if (data.html_result) {
|
310 |
+
result.innerHTML = data.html_result;
|
311 |
+
} else {
|
312 |
+
result.textContent = 'No result.';
|
313 |
+
}
|
314 |
+
})
|
315 |
+
.catch(err => {
|
316 |
+
result.textContent = 'Error processing request.';
|
317 |
+
});
|
318 |
+
});
|
319 |
+
|
320 |
+
|
321 |
+
// File upload logic
|
322 |
+
const fileInput = document.getElementById('file-upload');
|
323 |
+
const analyzeFileBtn = document.getElementById('analyzeFileBtn');
|
324 |
+
fileInput.addEventListener('change', () => {
|
325 |
+
analyzeFileBtn.disabled = !fileInput.files.length;
|
326 |
+
});
|
327 |
+
fileForm.addEventListener('submit', () => {
|
328 |
+
const file = fileInput.files[0];
|
329 |
+
const result = document.getElementById('fileResult');
|
330 |
+
if (file) {
|
331 |
+
result.textContent = `Đã tải lên và phân tích file: ${file.name}`;
|
332 |
+
result.classList.remove('hidden');
|
333 |
+
}
|
334 |
+
});
|
335 |
+
|
336 |
+
// Khởi tạo tab đầu tiên và ngôn ngữ mặc định
|
337 |
+
activateTab('text');
|
338 |
+
setLanguage(currentLang);
|
339 |
+
</script>
|
340 |
+
<style>
|
341 |
+
.tab-content {
|
342 |
+
transition: opacity 0.3s;
|
343 |
+
}
|
344 |
+
.tab-btn.active {
|
345 |
+
background: #FF6A00 !important;
|
346 |
+
color: #fff !important;
|
347 |
+
box-shadow: 0 2px 8px #ff6a0033;
|
348 |
+
}
|
349 |
+
</style>
|
space/space/space/space/space/src/train.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
import wandb
|
2 |
from tqdm import tqdm
|
3 |
-
from
|
4 |
import torch
|
5 |
|
6 |
def train_model(model, optimizer, configs, loaders):
|
|
|
1 |
import wandb
|
2 |
from tqdm import tqdm
|
3 |
+
from evaluate import evaluate
|
4 |
import torch
|
5 |
|
6 |
def train_model(model, optimizer, configs, loaders):
|