diff --git a/README.md b/README.md index a7661e356beb3c7820454c7655f998b947a23192..2c5eb80af8c616cc5615bc6e12454de71b5ae9a9 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ A comprehensive Vietnamese Named Entity Recognition system using state-of-the-ar Try the interactive demo: **[Vietnamese NER Demo](https://huggingface.co/spaces/DucLai/Vietnamese_NER)** -![Demo Screenshot](https://github.com/user-attachments/assets/4fbcdc49-5a8b-47c0-991e-d3ec839cede9) +![image](https://github.com/user-attachments/assets/ac6f0b96-52f2-4e47-b542-e908c02261c4) ## 🔄 Project Workflow diff --git a/requirements.txt b/requirements.txt index e80f418ad6e7682b5ec2e31452f0cef5ff92bd9b..9039316ac83b79f7d56769c1d07c308e71906287 100644 Binary files a/requirements.txt and b/requirements.txt differ diff --git a/space/README.md b/space/README.md index a7661e356beb3c7820454c7655f998b947a23192..2c5eb80af8c616cc5615bc6e12454de71b5ae9a9 100644 --- a/space/README.md +++ b/space/README.md @@ -17,7 +17,7 @@ A comprehensive Vietnamese Named Entity Recognition system using state-of-the-ar Try the interactive demo: **[Vietnamese NER Demo](https://huggingface.co/spaces/DucLai/Vietnamese_NER)** -![Demo Screenshot](https://github.com/user-attachments/assets/4fbcdc49-5a8b-47c0-991e-d3ec839cede9) +![image](https://github.com/user-attachments/assets/ac6f0b96-52f2-4e47-b542-e908c02261c4) ## 🔄 Project Workflow diff --git a/space/space/space/README.md b/space/space/space/README.md index 2c5eb80af8c616cc5615bc6e12454de71b5ae9a9..a7661e356beb3c7820454c7655f998b947a23192 100644 --- a/space/space/space/README.md +++ b/space/space/space/README.md @@ -17,7 +17,7 @@ A comprehensive Vietnamese Named Entity Recognition system using state-of-the-ar Try the interactive demo: **[Vietnamese NER Demo](https://huggingface.co/spaces/DucLai/Vietnamese_NER)** -![image](https://github.com/user-attachments/assets/ac6f0b96-52f2-4e47-b542-e908c02261c4) +![Demo 
Screenshot](https://github.com/user-attachments/assets/4fbcdc49-5a8b-47c0-991e-d3ec839cede9) ## 🔄 Project Workflow diff --git a/space/space/space/space/space/requirements.txt b/space/space/space/space/space/requirements.txt index 9039316ac83b79f7d56769c1d07c308e71906287..e80f418ad6e7682b5ec2e31452f0cef5ff92bd9b 100644 Binary files a/space/space/space/space/space/requirements.txt and b/space/space/space/space/space/requirements.txt differ diff --git a/space/space/space/space/space/space/README.md b/space/space/space/space/space/space/README.md index a7661e356beb3c7820454c7655f998b947a23192..2c5eb80af8c616cc5615bc6e12454de71b5ae9a9 100644 --- a/space/space/space/space/space/space/README.md +++ b/space/space/space/space/space/space/README.md @@ -17,7 +17,7 @@ A comprehensive Vietnamese Named Entity Recognition system using state-of-the-ar Try the interactive demo: **[Vietnamese NER Demo](https://huggingface.co/spaces/DucLai/Vietnamese_NER)** -![Demo Screenshot](https://github.com/user-attachments/assets/4fbcdc49-5a8b-47c0-991e-d3ec839cede9) +![image](https://github.com/user-attachments/assets/ac6f0b96-52f2-4e47-b542-e908c02261c4) ## 🔄 Project Workflow diff --git a/space/space/space/space/space/space/space/space/space/space/README.md b/space/space/space/space/space/space/space/space/space/space/README.md index ca121044777ca34994a9cc1568ddc703dfb631bb..a7661e356beb3c7820454c7655f998b947a23192 100644 --- a/space/space/space/space/space/space/space/space/space/space/README.md +++ b/space/space/space/space/space/space/space/space/space/space/README.md @@ -8,80 +8,184 @@ sdk_version: 1.46.1 app_file: src/app.py pinned: false --- -# Vietnamese Named Entity Recognition +# Vietnamese Named Entity Recognition (NER) 🧠 -## 🛠️ Set Up Your Environment With Conda +A comprehensive Vietnamese Named Entity Recognition system using state-of-the-art deep learning models including PhoBERT, CRF, and ensemble methods. 
-### Option 1: Using `requirements.txt` +## 🚀 Live Demo + +Try the interactive demo: **[Vietnamese NER Demo](https://huggingface.co/spaces/DucLai/Vietnamese_NER)** + +![Demo Screenshot](https://github.com/user-attachments/assets/4fbcdc49-5a8b-47c0-991e-d3ec839cede9) + +## 🔄 Project Workflow + +![Project Flowchart](https://github.com/user-attachments/assets/5b800180-d6c8-44f7-8622-ba188f6cd7be) + +## 🎯 Overview + +This project implements a robust Vietnamese Named Entity Recognition system that can identify and classify entities in Vietnamese text. The system combines multiple approaches including: + +- **PhoBERT-based embeddings** for contextual understanding +- **Conditional Random Fields (CRF)** for sequence labeling +- **Random Forest** with semantic embeddings +- **Rule-based methods** for enhanced accuracy + +## 📂 Project Structure + +``` +VIETNAMESE_NER/ +│ +├── .github/workflows +│ └── main.yml # Auto deploy to Hugging Space +│ +├── data/ # Dataset files +│ └── raw_data.csv # Raw training data +│ +├── notebooks/ # Jupyter notebooks for experimentation +│ ├── Duc_Notebook.ipynb # CRF + RandomForest experiments +│ ├── Softmax_PhoBERT.ipynb # Softmax approach +│ ├── Kien_Rule_base.ipynb # Rule-based method with RF +│ └── Kien_RF_lightgbm.ipynb # RF with semantic embeddings +│ +├── src/ # Main source code +│ ├── __init__.py +│ ├── app.py # Streamlit web application +│ ├── front.py # Highlight function +│ ├── config.py # Project configuration +│ ├── data_loader.py # Data loading utilities +│ ├── preprocessing.py # Data preprocessing functions +│ ├── model.py # Model architecture definitions +│ ├── train.py # Training pipeline +│ ├── evaluate.py # Model evaluation +│ └── predict.py # Inference utilities +│ +├── models/ # Saved model artifacts +│ └── best_model.pt # Best trained model weights +│ +├── outputs/ # Training outputs +│ ├── output.log # Training logs (TensorBoard) +│ └── figures/ # Visualization plots +│ +├── tests/ # Unit tests (planned) +│ +├── 
requirements.txt # Python dependencies +├── environment.yml # Conda environment file +├── README.md # Project documentation +└── run.py # Main training script +``` + + +## 🏗️ Model Architecture + +The system uses a hybrid architecture combining the strengths of different approaches: + +![Model Architecture](https://github.com/user-attachments/assets/82d243a2-42fa-4dad-b1af-8946767d4f44) + +### Core Components: +- **PhoBERT-Base**: Generates contextual embeddings for Vietnamese text +- **Linear + CRF Layer**: Handles sequence labeling with context awareness +- **Softmax/Random Forest**: Provides single-label prediction capabilities + +## 📊 Dataset & Performance + +### Dataset: VLSP2016 +The model is trained on the VLSP2016 dataset extracted from Vietnamese news articles. + +#### Dataset Statistics: + + + + + + + + + +
Entity FrequencyEntity Distribution
Token Length DistributionSentence Length Distribution
+ + +### Model Performance: + + + + + +
+ F1 Score + + Training Loss +
+ +![Results Comparison](https://github.com/user-attachments/assets/e2fecc2c-8b27-4f28-a174-41078b17567c) + +## 🛠️ Installation & Setup + +### Prerequisites +- Python 3.10+ +- Conda (recommended) + +### Option 1: Using `requirements.txt` ```bash +# Create and activate conda environment conda create --name vnner python=3.10 conda activate vnner + +# Install dependencies pip install -r requirements.txt ``` ### Option 2: Using `environment.yml` - ```bash +# Create environment from yml file conda env create -f environment.yml conda activate vnner ``` -## Run +## 🚀 Quick Start + +### Training the Model ```bash python run.py ``` ---- -## 📂 Project Structure - -``` -my_ai_project/ -│ -├── data/ -│ ├── raw_data.csv # Dữ liệu gốc -│ ├── processed_data_EDA.csv # Dữ liệu sau khi tiền xử lý -│ └── processed_data_full.csv # Dữ liệu sẵn sàng training -│ -├── notebooks/ # Thử nghiệm và khám phá dữ liệu -│ ├── Duc_Notebook.ipynb # CRF + RandomForest -│ ├── Softmax_PhoBERT.ipynb # Softmax -│ -├── src/ # Mã nguồn chính của dự án -│ ├── __init__.py -│ ├── data_loader.py # Nạp và xử lý dữ liệu -│ ├── preprocessing.py # Hàm tiền xử lý dữ liệu -│ ├── model.py # Định nghĩa kiến trúc mô hình -│ ├── train.py # Huấn luyện mô hình -│ ├── evaluate.py # Đánh giá mô hình -│ └── predict.py # Dự đoán với mô hình đã huấn luyện -│ -├── models/ # Mô hình đã lưu sau khi huấn luyện -│ └── best_model.pth # File trọng số mô hình -│ -├── outputs/ # Kết quả, biểu đồ, log, metrics -│ ├── logs/ # Nhật ký huấn luyện (tensorboard/logging) -│ └── figures/ # Biểu đồ trực quan hóa -│ -├── configs/ # File cấu hình cho mô hình, huấn luyện -│ └── config.yaml -│ -├── tests/ # Unit test cho các hàm chính -│ -├── requirements.txt # Thư viện cần cài đặt -├── environment.yml # Môi trường Conda -├── README.md # Giới thiệu dự án -└── run.py # Script chính để chạy toàn bộ pipeline +### Running the Streamlit App +```bash +python src/app.py ``` ---- +## 🧪 Experimental Approaches + +The project explores multiple 
methodologies: + +1. **PhoBERT + CRF**: Sequential labeling with contextual embeddings +2. **PhoBERT + Softmax**: Direct classification approach +3. **Random Forest + Rule-based**: Traditional ML with linguistic rules +4. **Random Forest + Semantic Embeddings**: Enhanced feature engineering -## 📚 Additional Resources (Optional) +## 🤝 Contributing -If you have any questions about the project structure, consider reading these helpful articles first: +Contributions are welcome! Please feel free to submit a Pull Request. -* [Understanding `__init__.py`](https://zetcode.com/python/init-file/) -* [Markdown Basic Syntax](https://www.markdownguide.org/basic-syntax/#escaping-characters) -* [Difference Between `requirements.txt` and `environment.yml`](https://www.reddit.com/r/learnpython/comments/xvlpdz/why_do_people_provide_a_requirementstxt_or/) +## 📄 License + +This project is open source. Please check the repository for license details. + +## 🙏 Acknowledgments + +- VLSP2016 dataset providers +- PhoBERT model creators +- Hugging Face for hosting the demo + +## 📚 Additional Resources + +For better understanding of the project structure and technologies used: + +- [Understanding `__init__.py`](https://zetcode.com/python/init-file/) +- [Markdown Basic Syntax](https://www.markdownguide.org/basic-syntax/#escaping-characters) +- [Requirements.txt vs Environment.yml](https://www.reddit.com/r/learnpython/comments/xvlpdz/why_do_people_provide_a_requirementstxt_or/) + +--- -These resources could be useful for you! +**Happy NER-ing! 
🎯** diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/results/output.log b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/results/output.log new file mode 100644 index 0000000000000000000000000000000000000000..2b85aa888b2519beaf02f8698f33465d7383d643 --- /dev/null +++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/results/output.log @@ -0,0 +1,88 @@ +Train Epoch 1/20: 100%|██████████| 736/736 [00:22<00:00, 32.46it/s, avg_loss=2.69, batch_loss=0.947] +Epoch 1: train_loss=2.6912, train_f1=0.8224, val_loss=1.0848, val_f1=0.8273 +Saved imporved model to ./models/best_epoch_1.pt +Train Epoch 2/20: 100%|██████████| 736/736 [00:21<00:00, 33.55it/s, avg_loss=0.806, batch_loss=0.998] + +Epoch 2: train_loss=0.8061, train_f1=0.8674, val_loss=0.7191, val_f1=0.8613 +Saved imporved model to ./models/best_epoch_2.pt +Train Epoch 3/20: 100%|██████████| 736/736 [00:22<00:00, 32.59it/s, avg_loss=0.584, batch_loss=0.0527] + +Epoch 3: train_loss=0.5842, train_f1=0.8996, val_loss=0.5643, val_f1=0.8895 +Saved imporved model to ./models/best_epoch_3.pt +Train Epoch 4/20: 100%|██████████| 736/736 [00:23<00:00, 31.34it/s, avg_loss=0.478, batch_loss=1.06] + +Epoch 4: train_loss=0.4782, train_f1=0.9122, val_loss=0.4838, val_f1=0.8994 +Saved imporved model to ./models/best_epoch_4.pt +Train Epoch 5/20: 100%|██████████| 736/736 [00:22<00:00, 32.59it/s, avg_loss=0.406, batch_loss=0.421] + +Epoch 5: train_loss=0.4056, train_f1=0.9254, val_loss=0.4281, val_f1=0.9101 +Saved imporved model to ./models/best_epoch_5.pt +Train Epoch 6/20: 100%|██████████| 736/736 [00:21<00:00, 34.15it/s, avg_loss=0.36, batch_loss=1.01] + +Epoch 6: train_loss=0.3599, train_f1=0.9343, val_loss=0.3934, val_f1=0.9190 +Saved imporved model to ./models/best_epoch_6.pt +Train Epoch 7/20: 100%|██████████| 736/736 [00:22<00:00, 33.08it/s, avg_loss=0.322, batch_loss=0.392] + +Epoch 7: 
train_loss=0.3218, train_f1=0.9383, val_loss=0.3751, val_f1=0.9192 +Saved imporved model to ./models/best_epoch_7.pt +Train Epoch 8/20: 100%|██████████| 736/736 [00:22<00:00, 32.66it/s, avg_loss=0.294, batch_loss=0.468] + +Epoch 8: train_loss=0.2942, train_f1=0.9424, val_loss=0.3560, val_f1=0.9189 +Train Epoch 9/20: 100%|██████████| 736/736 [00:23<00:00, 31.68it/s, avg_loss=0.27, batch_loss=0.681] + +Epoch 9: train_loss=0.2699, train_f1=0.9429, val_loss=0.3521, val_f1=0.9177 +Train Epoch 10/20: 100%|██████████| 736/736 [00:21<00:00, 33.46it/s, avg_loss=0.252, batch_loss=0.525] + +Epoch 10: train_loss=0.2517, train_f1=0.9493, val_loss=0.3413, val_f1=0.9222 +Saved imporved model to ./models/best_epoch_10.pt +Train Epoch 11/20: 100%|██████████| 736/736 [00:22<00:00, 32.92it/s, avg_loss=0.238, batch_loss=0.022] + +Epoch 11: train_loss=0.2383, train_f1=0.9551, val_loss=0.3292, val_f1=0.9232 +Saved imporved model to ./models/best_epoch_11.pt +Train Epoch 12/20: 100%|██████████| 736/736 [00:23<00:00, 31.72it/s, avg_loss=0.222, batch_loss=0.529] + +Epoch 12: train_loss=0.2223, train_f1=0.9543, val_loss=0.3305, val_f1=0.9207 +Train Epoch 13/20: 100%|██████████| 736/736 [00:23<00:00, 31.74it/s, avg_loss=0.213, batch_loss=0.381] + +Epoch 13: train_loss=0.2127, train_f1=0.9593, val_loss=0.3244, val_f1=0.9221 +Train Epoch 14/20: 100%|██████████| 736/736 [00:23<00:00, 31.69it/s, avg_loss=0.203, batch_loss=0.279] + +Epoch 14: train_loss=0.2026, train_f1=0.9609, val_loss=0.3213, val_f1=0.9224 +Train Epoch 15/20: 100%|██████████| 736/736 [00:23<00:00, 31.84it/s, avg_loss=0.193, batch_loss=0.0462] + +Epoch 15: train_loss=0.1925, train_f1=0.9574, val_loss=0.3392, val_f1=0.9117 +Train Epoch 16/20: 100%|██████████| 736/736 [00:22<00:00, 32.11it/s, avg_loss=0.186, batch_loss=0.943] + +Epoch 16: train_loss=0.1863, train_f1=0.9654, val_loss=0.3169, val_f1=0.9250 +Saved imporved model to ./models/best_epoch_16.pt +Train Epoch 17/20: 100%|██████████| 736/736 [00:22<00:00, 32.38it/s, 
avg_loss=0.18, batch_loss=0.113] + +Epoch 17: train_loss=0.1795, train_f1=0.9677, val_loss=0.3187, val_f1=0.9237 +Train Epoch 18/20: 100%|██████████| 736/736 [00:22<00:00, 33.30it/s, avg_loss=0.173, batch_loss=0.00558] + +Epoch 18: train_loss=0.1728, train_f1=0.9692, val_loss=0.3219, val_f1=0.9173 +Train Epoch 19/20: 100%|██████████| 736/736 [00:23<00:00, 31.48it/s, avg_loss=0.167, batch_loss=0.115] + +Epoch 19: train_loss=0.1673, train_f1=0.9681, val_loss=0.3261, val_f1=0.9195 +Train Epoch 20/20: 100%|██████████| 736/736 [00:22<00:00, 32.17it/s, avg_loss=0.164, batch_loss=0.0463] + +Epoch 20: train_loss=0.1640, train_f1=0.9715, val_loss=0.3230, val_f1=0.9185 + +Loading best model from ./models/best_epoch_16.pt for final evaluation... +Done + +Evaluation on test set ... +Test_loss=0.2967, Test_f1=0.9087 + precision recall f1-score support + + 0 1.00 1.00 1.00 51036 + 1 0.99 0.98 0.99 1112 + 2 0.97 0.99 0.98 506 + 3 0.86 0.79 0.82 180 + 4 0.84 0.80 0.82 291 + 5 0.89 0.91 0.90 939 + 6 0.87 0.84 0.86 428 + + accuracy 0.99 54492 + macro avg 0.92 0.90 0.91 54492 +weighted avg 0.99 0.99 0.99 54492 diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/results/output.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/results/output.py new file mode 100644 index 0000000000000000000000000000000000000000..d20dcb237307f4a0db087c1800c5d034be062648 --- /dev/null +++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/results/output.py @@ -0,0 +1,116 @@ +# Model Results +training_log = { + "epoch": list(range(1, 21)), + "train_loss": [ + 2.6912, 0.8061, 0.5842, 0.4782, 0.4056, + 0.3599, 0.3218, 0.2942, 0.2699, 0.2517, + 0.2383, 0.2223, 0.2127, 0.2026, 0.1925, + 0.1863, 0.1795, 0.1728, 0.1673, 0.1640 + ], + "val_loss": [ + 1.0848, 0.7191, 0.5643, 0.4838, 0.4281, + 0.3934, 0.3751, 0.3560, 0.3521, 0.3413, + 0.3292, 0.3305, 0.3244, 0.3213, 0.3392, + 
0.3169, 0.3187, 0.3219, 0.3261, 0.3230 + ], + "train_f1": [ + 0.8224, 0.8674, 0.8996, 0.9122, 0.9254, + 0.9343, 0.9383, 0.9424, 0.9429, 0.9493, + 0.9551, 0.9543, 0.9593, 0.9609, 0.9574, + 0.9654, 0.9677, 0.9692, 0.9681, 0.9715 + ], + "val_f1": [ + 0.8273, 0.8613, 0.8895, 0.8994, 0.9101, + 0.9190, 0.9192, 0.9189, 0.9177, 0.9222, + 0.9232, 0.9207, 0.9221, 0.9224, 0.9117, + 0.9250, 0.9237, 0.9173, 0.9195, 0.9185 + ] +} + +report_dict = { + 'O': {"precision": 1.00, "recall": 1.00, "f1-score": 1.00, "support": 51036}, + 'B-PER': {"precision": 0.99, "recall": 0.98, "f1-score": 0.98, "support": 1112}, + 'I-PER': {"precision": 0.97, "recall": 0.99, "f1-score": 0.98, "support": 506}, + 'B-ORG': {"precision": 0.93, "recall": 0.95, "f1-score": 0.94, "support": 939}, + 'I-ORG': {"precision": 0.93, "recall": 0.91, "f1-score": 0.92, "support": 428}, + 'B-LOC': {"precision": 0.83, "recall": 0.84, "f1-score": 0.84, "support": 180}, + 'I-LOC': {"precision": 0.88, "recall": 0.84, "f1-score": 0.86, "support": 291}, + "accuracy": 0.99, + "macro avg": {"precision": 0.93, "recall": 0.93, "f1-score": 0.93, "support": 54492}, + "weighted avg": {"precision": 0.99, "recall": 0.99, "f1-score": 0.99, "support": 54492} +} + + +report_dict_2 = { + 'O': {"precision": 1.00, "recall": 1.00, "f1-score": 1.00, "support": 68476}, + 'B-PER': {"precision": 0.99, "recall": 0.98, "f1-score": 0.98, "support": 1464}, + 'I-PER': {"precision": 0.98, "recall": 0.98, "f1-score": 0.98, "support": 686}, + 'B-ORG': {"precision": 0.77, "recall": 0.82, "f1-score": 0.80, "support": 257}, + 'I-ORG': {"precision": 0.80, "recall": 0.77, "f1-score": 0.78, "support": 430}, + 'B-LOC': {"precision": 0.88, "recall": 0.90, "f1-score": 0.89, "support": 1241}, + 'I-LOC': {"precision": 0.83, "recall": 0.82, "f1-score": 0.82, "support": 554}, + "accuracy": 0.99, + "macro avg": {"precision": 0.89, "recall": 0.89, "f1-score": 0.89, "support": 73108}, + "weighted avg": {"precision": 0.99, "recall": 0.99, "f1-score": 0.99, 
"support": 73108} +} + + +model_compare = { + "Header": ["Model", "F1", "Accuracy"], + "Data": { + "PhoBERT + CRF": {"F1": 0.93, "Accuracy": 0.99}, + "CRF": {"F1": 0.91, "Accuracy": 0.99}, + "Softmax": {"F1": 0.89, "Accuracy": 0.99}, + "Random Forest": {"F1": 0.78, "Accuracy": 0.98} + } +} + +data_compare = { + "Header": ["Data Preprocessing Strategy", "F1"], + "Data": { + "Raw": 0.93, + "Crawl for Balance": 0.91, + "Remove Sentences with Only 'O' Tags": 0.91 + } +} + + + +# EDA +data_aug_count_sorted = { + 'B-PER': 474, + 'I-PER': 121, + 'B-LOC': 874, + 'I-LOC': 289, + 'B-ORG': 1110, + 'I-ORG': 761 +} + +raw_data_count_sorted = { + 'B-PER': 7479, + 'I-PER': 3522, + 'B-LOC': 6244, + 'I-LOC': 2783, + 'B-ORG': 1212, + 'I-ORG': 2055, + 'B-NAT': 282, + 'I-NAT': 279 +} + +raw_data_count_withoutNAT_sorted = { + 'B-PER': 7479, + 'I-PER': 3522, + 'B-LOC': 6244, + 'I-LOC': 2783, + 'B-ORG': 1212, + 'I-ORG': 2055 +} + +combined_count_sorted = { + 'B-PER': 7953, + 'I-PER': 3643, + 'B-LOC': 7118, + 'I-LOC': 3072, + 'B-ORG': 2322, + 'I-ORG': 2816 +} diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/.gitignore b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/.gitignore index 1923239781b0a3437631d87ec0ed5b52be406b8d..977d789c25b5dbfdf383a416cd72e9ff680c34be 100644 --- a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/.gitignore +++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/.gitignore @@ -10,8 +10,6 @@ __pycache__/ # Dataset and results folders data/ -results/ -outputs/ logs/ # Large files diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/requirements.txt b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/requirements.txt index 
8823d2c82071e62bde7663c8e04f180d019119a0..9039316ac83b79f7d56769c1d07c308e71906287 100644 Binary files a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/requirements.txt and b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/requirements.txt differ diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/.github/workflows/main.yml b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/.github/workflows/main.yml new file mode 100644 index 0000000000000000000000000000000000000000..735b15ea5c845e6fae62054f6fdfcb80e7cfabf4 --- /dev/null +++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/.github/workflows/main.yml @@ -0,0 +1,47 @@ +name: Deploy to Hugging Face Space + +on: + push: + branches: + - main # hoặc branch bạn dùng + +jobs: + deploy: + runs-on: ubuntu-latest + + steps: + - name: Checkout repo + uses: actions/checkout@v3 + + - name: Set up Git + run: | + git config --global user.email "actions@github.com" + git config --global user.name "GitHub Actions" + + - name: Push to Hugging Face Spaces + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + run: | + git clone https://huggingface.co/spaces/DucLai/Vietnamese_NER space + + # Đồng bộ code vào repo Space (không copy .git) + rsync -av --exclude '.git' ./ space/ + + # Xoá file binary ra khỏi Git index trước khi commit + cd space + find . -type f \( \ + -iname "*.png" -o \ + -iname "*.jpg" -o \ + -iname "*.jpeg" -o \ + -iname "*.mp4" -o \ + -iname "*.zip" -o \ + -iname "*.pth" -o \ + -iname "*.h5" -o \ + -iname "*.tar.gz" -o \ + -iname "*.wav" \ + \) -exec git rm --cached {} \; || true + + # Commit và push + git add . 
+ git commit -m "Auto-deploy from GitHub (binary files removed)" || echo "No changes to commit" + git push https://DucLai:${HF_TOKEN}@huggingface.co/spaces/DucLai/Vietnamese_NER HEAD diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/.gitignore b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..1923239781b0a3437631d87ec0ed5b52be406b8d --- /dev/null +++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/.gitignore @@ -0,0 +1,23 @@ +# Python cache +__pycache__/ +*.pyc +*.pyo + +# Hugging Face binary/model outputs +*.pth +*.h5 +*.ckpt + +# Dataset and results folders +data/ +results/ +outputs/ +logs/ + +# Large files +*.zip +*.tar.gz +*.mp4 +*.png +*.jpg +*.jpeg diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/LICENSE b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..261eeb9e9f8b2b4b0d119366dda99c6fd7d35c64 --- /dev/null +++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. 
+ + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/configs/config.yaml b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/configs/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4d6a3b44f49d40438deea0c8a83c0d34021efdaf --- /dev/null +++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/configs/config.yaml @@ -0,0 +1 @@ +ECHO is on. 
diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/environment.yml b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/environment.yml new file mode 100644 index 0000000000000000000000000000000000000000..a24ba09e39f547180c776187a9fa983d7f7d8086 --- /dev/null +++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/environment.yml @@ -0,0 +1,9 @@ +name: vnner +channels: + - defaults + - conda-forge +dependencies: + - python=3.10 + - pip + - pip: + - -r requirements.txt diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/models/best_epoch_16.pt b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/models/best_epoch_16.pt new file mode 100644 index 0000000000000000000000000000000000000000..4c74d469d0d76ed8e1822312cf6d186ca8c7e55d --- /dev/null +++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/models/best_epoch_16.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:622cac3a55eec6a245f70c2ec7591d8fbfa8c18e13db7555915405fb57b145a0 +size 24130 diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/notebooks/Duc_Notebook.ipynb b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/notebooks/Duc_Notebook.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..e9b3a353d53842e138708c0bd0bc253937851417 --- /dev/null +++ 
b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/notebooks/Duc_Notebook.ipynb @@ -0,0 +1,7467 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "gpuType": "T4" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "2707f2f1d216421385cc4166127d696a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_5350c7b689f14d138357f92a78479d4b", + "IPY_MODEL_5423cc4795f9415ebcf7eb2eb45f08b4", + "IPY_MODEL_f1ef72618a0b4710ac6ab5cfc86ed252" + ], + "layout": "IPY_MODEL_8eb197c462304d6fb6d15c175db315f5" + } + }, + "5350c7b689f14d138357f92a78479d4b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a4178b6f78bf4f2aa6cb7ad924308970", + "placeholder": "​", + "style": "IPY_MODEL_59f7b90017364fc3ad2969061e3efba2", + "value": "config.json: 100%" + } + }, + "5423cc4795f9415ebcf7eb2eb45f08b4": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + 
"_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3ca4b088872649c7856c3be691ca6224", + "max": 557, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_1c77b809b5ec42e7b00b512cbbc7071f", + "value": 557 + } + }, + "f1ef72618a0b4710ac6ab5cfc86ed252": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_c657eed438b741189da3846983d8e0a6", + "placeholder": "​", + "style": "IPY_MODEL_21f740caf6a94a468a54552961c54d63", + "value": " 557/557 [00:00<00:00, 13.2kB/s]" + } + }, + "8eb197c462304d6fb6d15c175db315f5": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + 
"grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a4178b6f78bf4f2aa6cb7ad924308970": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "59f7b90017364fc3ad2969061e3efba2": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": 
"DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "3ca4b088872649c7856c3be691ca6224": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1c77b809b5ec42e7b00b512cbbc7071f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "c657eed438b741189da3846983d8e0a6": { + "model_module": "@jupyter-widgets/base", 
+ "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "21f740caf6a94a468a54552961c54d63": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "7b988f4f4c97462c9ee30aebabf4029b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + 
"_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_8b5ccad1921342dca6cbf5adcc93e9fa", + "IPY_MODEL_25c32ab8424242daa414680dc5b8ea57", + "IPY_MODEL_71a5bbc69fe648168877b7ab6f6cd8a6" + ], + "layout": "IPY_MODEL_0434bc2965584b018978d590bcda68c6" + } + }, + "8b5ccad1921342dca6cbf5adcc93e9fa": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b9ba2a9d9c704dd091cf17241541c280", + "placeholder": "​", + "style": "IPY_MODEL_a75ea7ca7e384c948f07eeffa8f676b5", + "value": "vocab.txt: 100%" + } + }, + "25c32ab8424242daa414680dc5b8ea57": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e0a24e13af474afc98fc5c93c561e880", + "max": 895321, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_4a1b96a5fde64fb499eeacd733b72c32", + "value": 895321 + } + }, + "71a5bbc69fe648168877b7ab6f6cd8a6": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": 
"HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f761d67cb46a4af3b49a22209cd450a9", + "placeholder": "​", + "style": "IPY_MODEL_8125e9952f68467d8c7d55da426c9098", + "value": " 895k/895k [00:00<00:00, 4.78MB/s]" + } + }, + "0434bc2965584b018978d590bcda68c6": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b9ba2a9d9c704dd091cf17241541c280": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + 
"_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a75ea7ca7e384c948f07eeffa8f676b5": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e0a24e13af474afc98fc5c93c561e880": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + 
"grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4a1b96a5fde64fb499eeacd733b72c32": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "f761d67cb46a4af3b49a22209cd450a9": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": 
null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8125e9952f68467d8c7d55da426c9098": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "0885e06d76f24053890d4ade7044b22e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_4303d7ea0bf14661803caf8f617ce788", + "IPY_MODEL_cd2aec8cb6de49f095681da2b99e7660", + "IPY_MODEL_fe84d9c4f3124682809f6e7117b40638" + ], + "layout": "IPY_MODEL_c14214a879ca425c8955b380d73f3010" + } + }, + "4303d7ea0bf14661803caf8f617ce788": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": 
"IPY_MODEL_2f28ad6792294553b24cbaa7dea533af", + "placeholder": "​", + "style": "IPY_MODEL_c58168f9246046728211a403540060f5", + "value": "bpe.codes: 100%" + } + }, + "cd2aec8cb6de49f095681da2b99e7660": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_64473dfca69a45438094656d2b995207", + "max": 1135173, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_0a782a4d3cfc4b9cbd802bedcdae3153", + "value": 1135173 + } + }, + "fe84d9c4f3124682809f6e7117b40638": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_dc5b47931e0340a4980ae315c6a802a5", + "placeholder": "​", + "style": "IPY_MODEL_8d431574a7a14c5fb1466fa97a33e4fb", + "value": " 1.14M/1.14M [00:00<00:00, 8.93MB/s]" + } + }, + "c14214a879ca425c8955b380d73f3010": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + 
"align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2f28ad6792294553b24cbaa7dea533af": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": 
null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c58168f9246046728211a403540060f5": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "64473dfca69a45438094656d2b995207": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0a782a4d3cfc4b9cbd802bedcdae3153": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": 
"@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "dc5b47931e0340a4980ae315c6a802a5": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8d431574a7a14c5fb1466fa97a33e4fb": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + 
}, + "960273e5205f49efb2be0576d2f74bca": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_7e3192df593248c7bfafd5b0347a2b1b", + "IPY_MODEL_d18a2302adaa415785ed8f8bb578b5b9", + "IPY_MODEL_9604f5d16db5446a83400c70071c90e7" + ], + "layout": "IPY_MODEL_337bbd72f0d4481f8a13cb8323afa241" + } + }, + "7e3192df593248c7bfafd5b0347a2b1b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8b2536405b1b4c62a0988b6360379060", + "placeholder": "​", + "style": "IPY_MODEL_24ea201c035d4e5a96f6d95c146c6ca8", + "value": "tokenizer.json: 100%" + } + }, + "d18a2302adaa415785ed8f8bb578b5b9": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_51027870cc714d8db898838afc41d396", + "max": 3132320, + "min": 0, + "orientation": 
"horizontal", + "style": "IPY_MODEL_380dca91b19d43d4b3de84afe29f3bd4", + "value": 3132320 + } + }, + "9604f5d16db5446a83400c70071c90e7": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5d102b9cc45943808fadad7c06ee4352", + "placeholder": "​", + "style": "IPY_MODEL_ba6e6b0b454b471a9b529dc24bb13bdd", + "value": " 3.13M/3.13M [00:00<00:00, 24.4MB/s]" + } + }, + "337bbd72f0d4481f8a13cb8323afa241": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, 
+ "width": null + } + }, + "8b2536405b1b4c62a0988b6360379060": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "24ea201c035d4e5a96f6d95c146c6ca8": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "51027870cc714d8db898838afc41d396": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": 
"LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "380dca91b19d43d4b3de84afe29f3bd4": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "5d102b9cc45943808fadad7c06ee4352": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + 
"flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ba6e6b0b454b471a9b529dc24bb13bdd": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "9c6331e2efe74bfd9292c4948beaafb5": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_26e942f1e9b441b1861a6ffc5b3299ed", + "IPY_MODEL_2d8c0bd34c104619bee375c98eb47160", + "IPY_MODEL_1702bb0d2e964f28bca673b1ac4550d3" + ], + "layout": "IPY_MODEL_1a128f1ccf93416a873560bd462a287e" + } + }, + "26e942f1e9b441b1861a6ffc5b3299ed": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + 
"model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6ebff4a83fe54c688224e27bd56b1d80", + "placeholder": "​", + "style": "IPY_MODEL_4cd7105d16db47ca90f66d6932beed36", + "value": "pytorch_model.bin: 100%" + } + }, + "2d8c0bd34c104619bee375c98eb47160": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e0e19cc9d12a4f91a4b37fcc8ffd691a", + "max": 542923308, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_aa5bf384ac5d4aa9976fda08d2574d57", + "value": 542923308 + } + }, + "1702bb0d2e964f28bca673b1ac4550d3": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_92ee08ad38d541c8a0d7e151cb478ab9", + "placeholder": "​", + "style": "IPY_MODEL_871356ac545e462d8318ba3830de1ac9", + "value": " 543M/543M [00:03<00:00, 176MB/s]" + } + }, + 
"1a128f1ccf93416a873560bd462a287e": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6ebff4a83fe54c688224e27bd56b1d80": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": 
null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4cd7105d16db47ca90f66d6932beed36": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e0e19cc9d12a4f91a4b37fcc8ffd691a": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + 
"object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "aa5bf384ac5d4aa9976fda08d2574d57": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "92ee08ad38d541c8a0d7e151cb478ab9": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "871356ac545e462d8318ba3830de1ac9": { + "model_module": 
"@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "356930c123634c258b194b79654b602c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_ff5fe04a8b43428f94e82affa61c8aa6", + "IPY_MODEL_89389fd2337f4e6fa564282157d0f9a8", + "IPY_MODEL_ec5b0bbf78fd4118b455040b801cd0fa" + ], + "layout": "IPY_MODEL_fe441fbf9bdd4d2099e67ed31eafce12" + } + }, + "ff5fe04a8b43428f94e82affa61c8aa6": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_c3d75f70be8a41f0a4aaaf43b65df684", + "placeholder": "​", + "style": "IPY_MODEL_da5dfc79703041c78fd2de3ea04ae025", + "value": "model.safetensors: 100%" + } + }, + "89389fd2337f4e6fa564282157d0f9a8": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": 
"@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_18a9ab8c76b84ebc8a17c5854649e6ce", + "max": 542900336, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_d128a1638ad0472d99a3bd52b5aae3a7", + "value": 542900336 + } + }, + "ec5b0bbf78fd4118b455040b801cd0fa": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_06b631379c0740289420fda9a8b57892", + "placeholder": "​", + "style": "IPY_MODEL_29cbf804df244f41a57d9b83c7c2427e", + "value": " 543M/543M [00:05<00:00, 110MB/s]" + } + }, + "fe441fbf9bdd4d2099e67ed31eafce12": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": 
null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c3d75f70be8a41f0a4aaaf43b65df684": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "da5dfc79703041c78fd2de3ea04ae025": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", 
+ "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "18a9ab8c76b84ebc8a17c5854649e6ce": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d128a1638ad0472d99a3bd52b5aae3a7": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "06b631379c0740289420fda9a8b57892": { + "model_module": "@jupyter-widgets/base", + "model_name": 
"LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "29cbf804df244f41a57d9b83c7c2427e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + } + } + }, + "accelerator": "GPU" + }, + "cells": [ + { + "cell_type": "code", + "source": [ + "!pip install pytorch-crf" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "3OUdWCMb_XpJ", + "outputId": "593a403e-3432-428f-fd8e-93f8957d740a" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + 
"Collecting pytorch-crf\n", + " Downloading pytorch_crf-0.7.2-py3-none-any.whl.metadata (2.4 kB)\n", + "Downloading pytorch_crf-0.7.2-py3-none-any.whl (9.5 kB)\n", + "Installing collected packages: pytorch-crf\n", + "Successfully installed pytorch-crf-0.7.2\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "import wandb\n", + "wandb.login()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 191 + }, + "id": "inx5CwCVgIvl", + "outputId": "f9317181-b433-468e-ecec-dc392e540e52" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "application/javascript": [ + "\n", + " window._wandbApiKey = new Promise((resolve, reject) => {\n", + " function loadScript(url) {\n", + " return new Promise(function(resolve, reject) {\n", + " let newScript = document.createElement(\"script\");\n", + " newScript.onerror = reject;\n", + " newScript.onload = resolve;\n", + " document.body.appendChild(newScript);\n", + " newScript.src = url;\n", + " });\n", + " }\n", + " loadScript(\"https://cdn.jsdelivr.net/npm/postmate/build/postmate.min.js\").then(() => {\n", + " const iframe = document.createElement('iframe')\n", + " iframe.style.cssText = \"width:0;height:0;border:none\"\n", + " document.body.appendChild(iframe)\n", + " const handshake = new Postmate({\n", + " container: iframe,\n", + " url: 'https://wandb.ai/authorize'\n", + " });\n", + " const timeout = setTimeout(() => reject(\"Couldn't auto authenticate\"), 5000)\n", + " handshake.then(function(child) {\n", + " child.on('authorize', data => {\n", + " clearTimeout(timeout)\n", + " resolve(data)\n", + " });\n", + " });\n", + " })\n", + " });\n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\u001b[34m\u001b[1mwandb\u001b[0m: Logging into wandb.ai. 
(Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: You can find your API key in your browser here: https://wandb.ai/authorize\n", + "wandb: Paste an API key from your profile and hit enter:" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " ··········\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m If you're specifying your api key in code, ensure this code is not shared publicly.\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m Consider setting the WANDB_API_KEY environment variable, or running `wandb login` from the command line.\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: No netrc file found, creating one.\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mlaiducaivn\u001b[0m (\u001b[33mlaiducaivn-fpt-university\u001b[0m) to \u001b[32mhttps://api.wandb.ai\u001b[0m. 
Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "True" + ] + }, + "metadata": {}, + "execution_count": 2 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Data Preparation" + ], + "metadata": { + "id": "YY74yDYXID_a" + } + }, + { + "cell_type": "code", + "source": [ + "import pandas as pd\n", + "\n", + "splits = {'train': 'data/train-00000-of-00001-b0417886a268b83a.parquet', 'valid': 'data/valid-00000-of-00001-846411c236133ba3.parquet'}\n", + "df_train = pd.read_parquet(\"hf://datasets/datnth1709/VLSP2016-NER-data/\" + splits[\"train\"])\n", + "df_valid = pd.read_parquet(\"hf://datasets/datnth1709/VLSP2016-NER-data/\" + splits[\"valid\"])\n", + "df = pd.concat([df_train, df_valid]).reset_index(drop=True)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "66m2J73nGXEV", + "outputId": "5a9a1457-9660-47ab-a5b7-85264c1cd34b" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning: \n", + "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", + "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", + "You will be able to reuse this secret in all of your notebooks.\n", + "Please note that authentication is recommended but still optional to access public models or datasets.\n", + " warnings.warn(\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Tạo thêm các cột khác\n", + "def join_tokens(tokens):\n", + " text = ' '.join(tokens)\n", + " return text\n", + "\n", + "def reform_raw_text(tokens):\n", + " text = ' '.join(tokens)\n", + " return text.replace(\"_\", \" \")\n", + "\n", + "def label(x):\n", + " return [id_tag[int(i)] 
for i in x]\n", + "\n", + "def replace_7_8(lst):\n", + " return [0 if x in (7, 8) else x for x in lst]\n", + "\n", + "\n", + "tag_id = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6}\n", + "id_tag = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}\n", + "\n", + "\n", + "df['ner_tags'] = df['ner_tags'].apply(replace_7_8)\n", + "df['text_withseg'] = df['tokens'].apply(join_tokens)\n", + "df['text_raw'] = df['tokens'].apply(reform_raw_text)\n", + "df[\"ner_labels\"] = df.ner_tags.apply(label)\n", + "df.columns = ['tokens', 'id', 'seg_text', 'raw_text', 'labels']\n", + "df\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 424 + }, + "id": "U81OmhBeGmMM", + "outputId": "c8bec51d-a878-4b12-e2f1-42076572a731" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " tokens \\\n", + "0 [Không_khí, thật, náo_nhiệt, .] \n", + "1 [Chị, Lãnh, và, Xăng, ra, đi, ,, mình, đứng, n... \n", + "2 [Suy_tính, mãi, ,, khóc, mãi, rồi, Phúc, lấy, ... \n", + "3 [Hoà, bảo, hồi, mới, qua, đâu, có, biết, nấu_n... \n", + "4 [Nhật_ký, của, thuyền_viên, .] \n", + "... ... \n", + "16853 [Nghe, thấy, đã, ghê_ghê, nhưng, Nhiêu, chưa, ... \n", + "16854 [Nhưng, mọi, chuyện, không, dừng, ở, đó, .] \n", + "16855 [Hoà, bảo, thời_gian, đầu, mặc_cảm, lắm, ,, ở,... \n", + "16856 [Biết_bao, người, đã, tình_nguyện, hiến_dâng, ... \n", + "16857 [Trên, đây, mới, là, “, thành_tích, ”, tiêu, t... \n", + "\n", + " id \\\n", + "0 [0, 0, 0, 0] \n", + "1 [0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "2 [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "3 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, ... \n", + "4 [0, 0, 0, 0] \n", + "... ... \n", + "16853 [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, ... \n", + "16854 [0, 0, 0, 0, 0, 0, 0, 0] \n", + "16855 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 
\n", + "16856 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] \n", + "16857 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "\n", + " seg_text \\\n", + "0 Không_khí thật náo_nhiệt . \n", + "1 Chị Lãnh và Xăng ra đi , mình đứng nhìn hai ch... \n", + "2 Suy_tính mãi , khóc mãi rồi Phúc lấy ra tờ giấ... \n", + "3 Hoà bảo hồi mới qua đâu có biết nấu_nướng gì ,... \n", + "4 Nhật_ký của thuyền_viên . \n", + "... ... \n", + "16853 Nghe thấy đã ghê_ghê nhưng Nhiêu chưa được tườ... \n", + "16854 Nhưng mọi chuyện không dừng ở đó . \n", + "16855 Hoà bảo thời_gian đầu mặc_cảm lắm , ở trong nh... \n", + "16856 Biết_bao người đã tình_nguyện hiến_dâng cả cuộ... \n", + "16857 Trên đây mới là “ thành_tích ” tiêu tiền của m... \n", + "\n", + " raw_text \\\n", + "0 Không khí thật náo nhiệt . \n", + "1 Chị Lãnh và Xăng ra đi , mình đứng nhìn hai ch... \n", + "2 Suy tính mãi , khóc mãi rồi Phúc lấy ra tờ giấ... \n", + "3 Hoà bảo hồi mới qua đâu có biết nấu nướng gì ,... \n", + "4 Nhật ký của thuyền viên . \n", + "... ... \n", + "16853 Nghe thấy đã ghê ghê nhưng Nhiêu chưa được tườ... \n", + "16854 Nhưng mọi chuyện không dừng ở đó . \n", + "16855 Hoà bảo thời gian đầu mặc cảm lắm , ở trong nh... \n", + "16856 Biết bao người đã tình nguyện hiến dâng cả cuộ... \n", + "16857 Trên đây mới là “ thành tích ” tiêu tiền của m... \n", + "\n", + " labels \n", + "0 [O, O, O, O] \n", + "1 [O, B-PER, O, B-PER, O, O, O, O, O, O, O, O, O... \n", + "2 [O, O, O, O, O, O, B-PER, O, O, O, O, O, O, O,... \n", + "3 [B-PER, O, O, O, O, O, O, O, O, O, O, O, O, B-... \n", + "4 [O, O, O, O] \n", + "... ... \n", + "16853 [O, O, O, O, O, B-PER, O, O, O, O, O, O, O, O,... \n", + "16854 [O, O, O, O, O, O, O, O] \n", + "16855 [B-PER, O, O, O, O, O, O, O, O, O, O, O, O, O,... \n", + "16856 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O] \n", + "16857 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ... \n", + "\n", + "[16858 rows x 5 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
tokensidseg_textraw_textlabels
0[Không_khí, thật, náo_nhiệt, .][0, 0, 0, 0]Không_khí thật náo_nhiệt .Không khí thật náo nhiệt .[O, O, O, O]
1[Chị, Lãnh, và, Xăng, ra, đi, ,, mình, đứng, n...[0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...Chị Lãnh và Xăng ra đi , mình đứng nhìn hai ch...Chị Lãnh và Xăng ra đi , mình đứng nhìn hai ch...[O, B-PER, O, B-PER, O, O, O, O, O, O, O, O, O...
2[Suy_tính, mãi, ,, khóc, mãi, rồi, Phúc, lấy, ...[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...Suy_tính mãi , khóc mãi rồi Phúc lấy ra tờ giấ...Suy tính mãi , khóc mãi rồi Phúc lấy ra tờ giấ...[O, O, O, O, O, O, B-PER, O, O, O, O, O, O, O,...
3[Hoà, bảo, hồi, mới, qua, đâu, có, biết, nấu_n...[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, ...Hoà bảo hồi mới qua đâu có biết nấu_nướng gì ,...Hoà bảo hồi mới qua đâu có biết nấu nướng gì ,...[B-PER, O, O, O, O, O, O, O, O, O, O, O, O, B-...
4[Nhật_ký, của, thuyền_viên, .][0, 0, 0, 0]Nhật_ký của thuyền_viên .Nhật ký của thuyền viên .[O, O, O, O]
..................
16853[Nghe, thấy, đã, ghê_ghê, nhưng, Nhiêu, chưa, ...[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...Nghe thấy đã ghê_ghê nhưng Nhiêu chưa được tườ...Nghe thấy đã ghê ghê nhưng Nhiêu chưa được tườ...[O, O, O, O, O, B-PER, O, O, O, O, O, O, O, O,...
16854[Nhưng, mọi, chuyện, không, dừng, ở, đó, .][0, 0, 0, 0, 0, 0, 0, 0]Nhưng mọi chuyện không dừng ở đó .Nhưng mọi chuyện không dừng ở đó .[O, O, O, O, O, O, O, O]
16855[Hoà, bảo, thời_gian, đầu, mặc_cảm, lắm, ,, ở,...[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...Hoà bảo thời_gian đầu mặc_cảm lắm , ở trong nh...Hoà bảo thời gian đầu mặc cảm lắm , ở trong nh...[B-PER, O, O, O, O, O, O, O, O, O, O, O, O, O,...
16856[Biết_bao, người, đã, tình_nguyện, hiến_dâng, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]Biết_bao người đã tình_nguyện hiến_dâng cả cuộ...Biết bao người đã tình nguyện hiến dâng cả cuộ...[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]
16857[Trên, đây, mới, là, “, thành_tích, ”, tiêu, t...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...Trên đây mới là “ thành_tích ” tiêu tiền của m...Trên đây mới là “ thành tích ” tiêu tiền của m...[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
\n", + "

16858 rows × 5 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "df", + "summary": "{\n \"name\": \"df\",\n \"rows\": 16858,\n \"fields\": [\n {\n \"column\": \"tokens\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"id\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"seg_text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 16787,\n \"samples\": [\n \"T\\u00ednh th\\u00f4ng_minh , l\\u1ea1i t\\u00f2_m\\u00f2 , anh Ki\\u1ec7m b\\u1eaft_\\u0111\\u1ea7u \\u0111i \\u0111\\u1ebfn c\\u00e1c x\\u01b0\\u1edfng c\\u01a1_kh\\u00ed \\u0111\\u1ec3 quan_s\\u00e1t c\\u00e1c lo\\u1ea1i m\\u00e1y_m\\u00f3c , r\\u1ed3i v\\u1ec1 nh\\u00e0 suy_ngh\\u0129 v\\u00e0 c\\u1ea7m b\\u00fat v\\u1ebd ph\\u00e1c_ho\\u1ea1 ra c\\u00e1i m\\u00e1y v\\u00fat g\\u1ea1o .\",\n \"V\\u1eady th\\u00ec , h\\u1ecd c\\u1ea7n ph\\u1ea3i \\u0111\\u01b0\\u1ee3c gi\\u00fap_\\u0111\\u1ee1 , ph\\u1ea3i \\u0111\\u01b0\\u1ee3c s\\u1ed1ng \\u0111\\u00e0ng_ho\\u00e0ng , ph\\u1ea3i \\u0111\\u01b0\\u1ee3c l\\u00e0m ng\\u01b0\\u1eddi d\\u00f9 ch\\u1ec9 l\\u00e0 nh\\u1eefng ng\\u00e0y cu\\u1ed1i_c\\u00f9ng .\",\n \"Nhi\\u1ec1u ng\\u01b0\\u1eddi th\\u00f4ng_d\\u1ecbch c\\u00f9ng th\\u1eddi v\\u1edbi Nguy\\u1ec5n Trung Hi\\u1ebfu c\\u0169ng \\u0111\\u00e3 ch\\u1ebft trong khi th\\u1ef1c_hi\\u1ec7n nhi\\u1ec7m_v\\u1ee5 t\\u1ea1i chi\\u1ebfn_tr\\u01b0\\u1eddng ho\\u1eb7c tr\\u00ean \\u0111\\u01b0\\u1eddng h\\u00e0nh_qu\\u00e2n .\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"raw_text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 16785,\n \"samples\": [\n \"Trong kho\\u1ea3ng th\\u1eddi gian \\u0111\\u00f3 ch\\u1ecb c\\u1ed1 c\\u00f4ng t\\u1ef1 h\\u1ecdc ti\\u1ebfng Anh .\",\n \"Sau \\u0111\\u00f3 , ch\\u00ednh b\\u00e0 Susan 
\\u0111\\u00e3 \\u0111\\u01b0a Mai l\\u00ean h\\u1ecdc \\u0111\\u1ea1i h\\u1ecdc , m\\u1ed7i n\\u0103m chu c\\u1ea5p cho c\\u00f4 30.000 USD .\",\n \"T\\u1eeb r\\u1ea5t l\\u00e2u r\\u1ed3i t\\u00f4i v\\u1eabn ngh\\u0129 n\\u1ebfu nh\\u01b0 cu\\u1ed1n s\\u00e1ch \\u0111\\u01b0\\u1ee3c xu\\u1ea5t b\\u1ea3n , ho\\u1eb7c ng\\u01b0\\u1eddi ta l\\u00e0m phim v\\u1ec1 n\\u00f3 th\\u00ec t\\u00f4i s\\u1ebd d\\u00f9ng s\\u1ed1 ti\\u1ec1n b\\u00e1n s\\u00e1ch \\u0111\\u1ec3 thi\\u1ebft l\\u1eadp m\\u1ed9t s\\u1ed1 gi\\u01b0\\u1eddng b\\u1ec7nh t\\u1ea1i H\\u00e0 N\\u1ed9i .\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"labels\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 4 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Get Embedding Vectors" + ], + "metadata": { + "id": "ooewb479FdqS" + } + }, + { + "cell_type": "code", + "source": [ + "import torch\n", + "from transformers import AutoTokenizer, AutoModel\n", + "from tqdm import tqdm\n", + "\n", + "# Load PhoBERT tokenizer và model\n", + "tokenizer = AutoTokenizer.from_pretrained(\"vinai/phobert-base\", use_fast=False)\n", + "model = AutoModel.from_pretrained(\"vinai/phobert-base\")\n", + "model.eval()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 920, + "referenced_widgets": [ + "2707f2f1d216421385cc4166127d696a", + "5350c7b689f14d138357f92a78479d4b", + "5423cc4795f9415ebcf7eb2eb45f08b4", + "f1ef72618a0b4710ac6ab5cfc86ed252", + "8eb197c462304d6fb6d15c175db315f5", + "a4178b6f78bf4f2aa6cb7ad924308970", + "59f7b90017364fc3ad2969061e3efba2", + "3ca4b088872649c7856c3be691ca6224", + "1c77b809b5ec42e7b00b512cbbc7071f", + "c657eed438b741189da3846983d8e0a6", + "21f740caf6a94a468a54552961c54d63", + "7b988f4f4c97462c9ee30aebabf4029b", + "8b5ccad1921342dca6cbf5adcc93e9fa", + "25c32ab8424242daa414680dc5b8ea57", + 
"71a5bbc69fe648168877b7ab6f6cd8a6", + "0434bc2965584b018978d590bcda68c6", + "b9ba2a9d9c704dd091cf17241541c280", + "a75ea7ca7e384c948f07eeffa8f676b5", + "e0a24e13af474afc98fc5c93c561e880", + "4a1b96a5fde64fb499eeacd733b72c32", + "f761d67cb46a4af3b49a22209cd450a9", + "8125e9952f68467d8c7d55da426c9098", + "0885e06d76f24053890d4ade7044b22e", + "4303d7ea0bf14661803caf8f617ce788", + "cd2aec8cb6de49f095681da2b99e7660", + "fe84d9c4f3124682809f6e7117b40638", + "c14214a879ca425c8955b380d73f3010", + "2f28ad6792294553b24cbaa7dea533af", + "c58168f9246046728211a403540060f5", + "64473dfca69a45438094656d2b995207", + "0a782a4d3cfc4b9cbd802bedcdae3153", + "dc5b47931e0340a4980ae315c6a802a5", + "8d431574a7a14c5fb1466fa97a33e4fb", + "960273e5205f49efb2be0576d2f74bca", + "7e3192df593248c7bfafd5b0347a2b1b", + "d18a2302adaa415785ed8f8bb578b5b9", + "9604f5d16db5446a83400c70071c90e7", + "337bbd72f0d4481f8a13cb8323afa241", + "8b2536405b1b4c62a0988b6360379060", + "24ea201c035d4e5a96f6d95c146c6ca8", + "51027870cc714d8db898838afc41d396", + "380dca91b19d43d4b3de84afe29f3bd4", + "5d102b9cc45943808fadad7c06ee4352", + "ba6e6b0b454b471a9b529dc24bb13bdd", + "9c6331e2efe74bfd9292c4948beaafb5", + "26e942f1e9b441b1861a6ffc5b3299ed", + "2d8c0bd34c104619bee375c98eb47160", + "1702bb0d2e964f28bca673b1ac4550d3", + "1a128f1ccf93416a873560bd462a287e", + "6ebff4a83fe54c688224e27bd56b1d80", + "4cd7105d16db47ca90f66d6932beed36", + "e0e19cc9d12a4f91a4b37fcc8ffd691a", + "aa5bf384ac5d4aa9976fda08d2574d57", + "92ee08ad38d541c8a0d7e151cb478ab9", + "871356ac545e462d8318ba3830de1ac9", + "356930c123634c258b194b79654b602c", + "ff5fe04a8b43428f94e82affa61c8aa6", + "89389fd2337f4e6fa564282157d0f9a8", + "ec5b0bbf78fd4118b455040b801cd0fa", + "fe441fbf9bdd4d2099e67ed31eafce12", + "c3d75f70be8a41f0a4aaaf43b65df684", + "da5dfc79703041c78fd2de3ea04ae025", + "18a9ab8c76b84ebc8a17c5854649e6ce", + "d128a1638ad0472d99a3bd52b5aae3a7", + "06b631379c0740289420fda9a8b57892", + "29cbf804df244f41a57d9b83c7c2427e" + ] + }, + "id": 
"b04c2Xq7IBac", + "outputId": "c8575bc2-8b3d-415c-8d67-b7cbed0343d3" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "config.json: 0%| | 0.00/557 [00:00\", \"\"]:\n", + " continue\n", + "\n", + " if token.endswith(\"@@\"):\n", + " current_vecs.append(emb)\n", + " else:\n", + " current_vecs.append(emb)\n", + " word_emb = torch.mean(torch.stack(current_vecs), dim=0)\n", + " word_embeddings.append(word_emb)\n", + " current_vecs = []\n", + "\n", + " if current_vecs: # Trong trường hợp sót lại cuối câu\n", + " word_emb = torch.mean(torch.stack(current_vecs), dim=0)\n", + " word_embeddings.append(word_emb)\n", + "\n", + " return word_embeddings" + ], + "metadata": { + "id": "z-JZZ2VrJiQ6" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", + "model.to(device)\n", + "\n", + "all_embeddings = [] # list of [seq_len_i, 768] tensors\n", + "all_labels = [] # list of [seq_len_i,] tensors\n", + "len_em = []\n", + "\n", + "# count = 0\n", + "\n", + "for i, row in tqdm(df.iterrows(), total=len(df)):\n", + "\n", + " # count += 1\n", + " # if count == 500:\n", + " # break\n", + "\n", + " # Truy cập phần tử từng dòng\n", + " sentence = row['seg_text']\n", + " gold_labels = row[\"id\"]\n", + "\n", + " # Cho sentence đi qua SentencePiece\n", + " input_ids = tokenizer.encode(sentence, return_tensors=\"pt\").to(device)\n", + "\n", + " tokens = tokenizer.convert_ids_to_tokens(input_ids[0].cpu())\n", + "\n", + " # Encode tạo embeddings\n", + " with torch.no_grad():\n", + " outputs = model(input_ids)\n", + " last_hidden_state = outputs.last_hidden_state.squeeze(0).cpu()\n", + "\n", + " # Gộp các embeddings đã bị tách khi đi qua SentencePiece\n", + " word_embeds = group_embeddings(tokens, last_hidden_state)\n", + "\n", + " # Kiểm tra số lượng embeddings và số lượng labels\n", + " if len(word_embeds) != 
len(gold_labels):\n", + " print(f\"Warning: Skipping row {i} - length mismatch\")\n", + " continue\n", + "\n", + " # Thêm vào list tổng / Tới đây là data đã sẵn sàng cho training\n", + " all_embeddings.append(torch.stack(word_embeds))\n", + " all_labels.append(torch.tensor(gold_labels))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "3wpjBGK3JuwS", + "outputId": "6788bd6f-d9c7-498f-f5dc-0e2766656ed1" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\n", + " 0%| | 0/16858 [00:00 best_f1 or test_acc > best_acc:\n", + " best_f1 = max(test_f1, best_f1)\n", + " best_acc = max(test_acc, best_acc)\n", + " ckpt_path = f\"checkpoints/best_epoch_{epoch}.pt\"\n", + " torch.save(model.state_dict(), ckpt_path)\n", + " wandb.save(ckpt_path)\n", + " print(f\"Saved improved model to {ckpt_path}\")\n", + "\n", + "# Finish W&B run\n", + "wandb.finish()\n", + "\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "RU_M57LePTb0", + "outputId": "149d92fe-7a3f-47e7-c463-178d80588eb0" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "Tracking run with wandb version 0.19.11" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "Run data is saved locally in /content/wandb/run-20250605_133906-tjmjkx7n" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "Syncing run CRF_VLSP2016 to Weights & Biases (docs)
" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + " View project at https://wandb.ai/laiducaivn-fpt-university/NER" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + " View run at https://wandb.ai/laiducaivn-fpt-university/NER/runs/tjmjkx7n" + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 1/20: 100%|██████████| 841/841 [00:25<00:00, 32.42it/s, avg_loss=2.55, batch_loss=0.525]\n", + "Train Eval: 100%|██████████| 841/841 [00:06<00:00, 137.51it/s]\n", + "Test Eval: 100%|██████████| 211/211 [00:01<00:00, 160.88it/s]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 1: loss=2.5528, train_f1=0.8316, train_acc=0.9869, test_f1=0.8319, test_acc=0.9869\n", + "Saved improved model to checkpoints/best_epoch_1.pt\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 2/20: 100%|██████████| 841/841 [00:25<00:00, 32.82it/s, avg_loss=0.758, batch_loss=0.0907]\n", + "Train Eval: 100%|██████████| 841/841 [00:05<00:00, 161.70it/s]\n", + "Test Eval: 100%|██████████| 211/211 [00:01<00:00, 125.73it/s]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 2: loss=0.7581, train_f1=0.8833, train_acc=0.9907, test_f1=0.8744, test_acc=0.9903\n", + "Saved improved model to checkpoints/best_epoch_2.pt\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 3/20: 100%|██████████| 841/841 [00:36<00:00, 23.06it/s, avg_loss=0.549, batch_loss=0.127]\n", + "Train Eval: 100%|██████████| 841/841 [00:05<00:00, 160.90it/s]\n", + "Test Eval: 100%|██████████| 211/211 [00:01<00:00, 115.40it/s]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 3: loss=0.5486, train_f1=0.9070, train_acc=0.9922, test_f1=0.8914, test_acc=0.9913\n", + 
"Saved improved model to checkpoints/best_epoch_3.pt\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 4/20: 100%|██████████| 841/841 [00:27<00:00, 31.09it/s, avg_loss=0.448, batch_loss=0.71]\n", + "Train Eval: 100%|██████████| 841/841 [00:05<00:00, 153.29it/s]\n", + "Test Eval: 100%|██████████| 211/211 [00:01<00:00, 161.35it/s]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 4: loss=0.4482, train_f1=0.9209, train_acc=0.9933, test_f1=0.8992, test_acc=0.9919\n", + "Saved improved model to checkpoints/best_epoch_4.pt\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 5/20: 100%|██████████| 841/841 [00:25<00:00, 32.91it/s, avg_loss=0.384, batch_loss=0.176]\n", + "Train Eval: 100%|██████████| 841/841 [00:05<00:00, 154.70it/s]\n", + "Test Eval: 100%|██████████| 211/211 [00:01<00:00, 113.95it/s]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 5: loss=0.3838, train_f1=0.9206, train_acc=0.9937, test_f1=0.8946, test_acc=0.9921\n", + "Saved improved model to checkpoints/best_epoch_5.pt\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 6/20: 100%|██████████| 841/841 [00:25<00:00, 33.20it/s, avg_loss=0.338, batch_loss=0.529]\n", + "Train Eval: 100%|██████████| 841/841 [00:05<00:00, 150.44it/s]\n", + "Test Eval: 100%|██████████| 211/211 [00:01<00:00, 166.03it/s]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 6: loss=0.3382, train_f1=0.9342, train_acc=0.9944, test_f1=0.9047, test_acc=0.9925\n", + "Saved improved model to checkpoints/best_epoch_6.pt\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 7/20: 100%|██████████| 841/841 [00:25<00:00, 32.74it/s, avg_loss=0.303, batch_loss=0.344]\n", + "Train Eval: 100%|██████████| 841/841 [00:05<00:00, 158.03it/s]\n", + "Test Eval: 100%|██████████| 211/211 [00:01<00:00, 
112.37it/s]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 7: loss=0.3029, train_f1=0.9399, train_acc=0.9949, test_f1=0.9110, test_acc=0.9929\n", + "Saved improved model to checkpoints/best_epoch_7.pt\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 8/20: 100%|██████████| 841/841 [00:25<00:00, 33.26it/s, avg_loss=0.28, batch_loss=0.0176]\n", + "Train Eval: 100%|██████████| 841/841 [00:05<00:00, 148.56it/s]\n", + "Test Eval: 100%|██████████| 211/211 [00:01<00:00, 161.91it/s]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 8: loss=0.2798, train_f1=0.9449, train_acc=0.9953, test_f1=0.9110, test_acc=0.9930\n", + "Saved improved model to checkpoints/best_epoch_8.pt\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 9/20: 100%|██████████| 841/841 [00:26<00:00, 31.90it/s, avg_loss=0.257, batch_loss=0.113]\n", + "Train Eval: 100%|██████████| 841/841 [00:05<00:00, 149.53it/s]\n", + "Test Eval: 100%|██████████| 211/211 [00:01<00:00, 118.68it/s]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 9: loss=0.2575, train_f1=0.9497, train_acc=0.9957, test_f1=0.9092, test_acc=0.9930\n", + "Saved improved model to checkpoints/best_epoch_9.pt\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 10/20: 100%|██████████| 841/841 [00:26<00:00, 31.27it/s, avg_loss=0.242, batch_loss=0.335]\n", + "Train Eval: 100%|██████████| 841/841 [00:05<00:00, 154.94it/s]\n", + "Test Eval: 100%|██████████| 211/211 [00:01<00:00, 159.02it/s]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 10: loss=0.2419, train_f1=0.9499, train_acc=0.9958, test_f1=0.9010, test_acc=0.9926\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 11/20: 100%|██████████| 841/841 [00:26<00:00, 31.36it/s, avg_loss=0.228, 
batch_loss=0.639]\n", + "Train Eval: 100%|██████████| 841/841 [00:06<00:00, 131.67it/s]\n", + "Test Eval: 100%|██████████| 211/211 [00:01<00:00, 158.27it/s]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 11: loss=0.2276, train_f1=0.9527, train_acc=0.9960, test_f1=0.9130, test_acc=0.9931\n", + "Saved improved model to checkpoints/best_epoch_11.pt\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 12/20: 100%|██████████| 841/841 [00:28<00:00, 29.31it/s, avg_loss=0.216, batch_loss=0.529]\n", + "Train Eval: 100%|██████████| 841/841 [00:05<00:00, 156.81it/s]\n", + "Test Eval: 100%|██████████| 211/211 [00:01<00:00, 147.29it/s]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 12: loss=0.2157, train_f1=0.9546, train_acc=0.9960, test_f1=0.9110, test_acc=0.9932\n", + "Saved improved model to checkpoints/best_epoch_12.pt\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 13/20: 100%|██████████| 841/841 [00:27<00:00, 30.55it/s, avg_loss=0.206, batch_loss=0.502]\n", + "Train Eval: 100%|██████████| 841/841 [00:06<00:00, 138.67it/s]\n", + "Test Eval: 100%|██████████| 211/211 [00:01<00:00, 163.15it/s]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 13: loss=0.2059, train_f1=0.9593, train_acc=0.9965, test_f1=0.9129, test_acc=0.9933\n", + "Saved improved model to checkpoints/best_epoch_13.pt\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 14/20: 100%|██████████| 841/841 [00:26<00:00, 32.00it/s, avg_loss=0.198, batch_loss=0.413]\n", + "Train Eval: 100%|██████████| 841/841 [00:05<00:00, 154.97it/s]\n", + "Test Eval: 100%|██████████| 211/211 [00:01<00:00, 110.08it/s]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 14: loss=0.1975, train_f1=0.9612, train_acc=0.9966, test_f1=0.9102, test_acc=0.9930\n" + ] + }, + { + 
"output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 15/20: 100%|██████████| 841/841 [00:27<00:00, 30.12it/s, avg_loss=0.191, batch_loss=0.0384]\n", + "Train Eval: 100%|██████████| 841/841 [00:05<00:00, 151.24it/s]\n", + "Test Eval: 100%|██████████| 211/211 [00:01<00:00, 151.00it/s]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 15: loss=0.1905, train_f1=0.9603, train_acc=0.9966, test_f1=0.9030, test_acc=0.9927\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 16/20: 100%|██████████| 841/841 [00:27<00:00, 30.24it/s, avg_loss=0.184, batch_loss=0.219]\n", + "Train Eval: 100%|██████████| 841/841 [00:06<00:00, 132.65it/s]\n", + "Test Eval: 100%|██████████| 211/211 [00:01<00:00, 159.54it/s]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 16: loss=0.1836, train_f1=0.9649, train_acc=0.9969, test_f1=0.9028, test_acc=0.9926\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 17/20: 100%|██████████| 841/841 [00:27<00:00, 30.78it/s, avg_loss=0.178, batch_loss=0.0707]\n", + "Train Eval: 100%|██████████| 841/841 [00:05<00:00, 158.34it/s]\n", + "Test Eval: 100%|██████████| 211/211 [00:01<00:00, 113.24it/s]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 17: loss=0.1777, train_f1=0.9607, train_acc=0.9967, test_f1=0.9092, test_acc=0.9931\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 18/20: 100%|██████████| 841/841 [00:27<00:00, 30.48it/s, avg_loss=0.173, batch_loss=0.557]\n", + "Train Eval: 100%|██████████| 841/841 [00:05<00:00, 151.59it/s]\n", + "Test Eval: 100%|██████████| 211/211 [00:01<00:00, 162.60it/s]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 18: loss=0.1728, train_f1=0.9607, train_acc=0.9968, test_f1=0.9039, test_acc=0.9928\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", 
+ "text": [ + "Train Epoch 19/20: 100%|██████████| 841/841 [00:27<00:00, 30.22it/s, avg_loss=0.168, batch_loss=0.0108]\n", + "Train Eval: 100%|██████████| 841/841 [00:06<00:00, 136.29it/s]\n", + "Test Eval: 100%|██████████| 211/211 [00:01<00:00, 161.68it/s]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 19: loss=0.1682, train_f1=0.9664, train_acc=0.9969, test_f1=0.9116, test_acc=0.9929\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 20/20: 100%|██████████| 841/841 [00:26<00:00, 31.60it/s, avg_loss=0.163, batch_loss=0.181]\n", + "Train Eval: 100%|██████████| 841/841 [00:05<00:00, 160.70it/s]\n", + "Test Eval: 100%|██████████| 211/211 [00:01<00:00, 164.59it/s]" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 20: loss=0.1626, train_f1=0.9647, train_acc=0.9969, test_f1=0.9044, test_acc=0.9928\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "

Run history:


epoch▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_acc▁▅▆▆▇▇███▇████▇▇█▇█▇
test_f1▁▅▆▇▆▇███▇████▇▇█▇█▇
test_precision▁▄▅▄▆▇▆▆▇▆▆█▇▆▆▅██▇█
test_recall▁▄▆▇▆▇██▇▇█▇▇█▇▇▇▇▇▇
train_acc▁▄▅▅▆▆▇▇▇▇▇▇████████
train_f1▁▄▅▆▆▆▇▇▇▇▇▇████████
train_loss█▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_precision▁▃▄▃▅▅▅▅▆▆▅▇▇▇▇▇▇█▇█
train_recall▁▄▅▆▆▆▇▇▇▇█▇█████▇██

Run summary:


epoch20
test_acc0.99285
test_f10.90442
test_precision0.9205
test_recall0.88994
train_acc0.99693
train_f10.96475
train_loss0.16259
train_precision0.97877
train_recall0.95181

" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + " View run CRF_VLSP2016 at: https://wandb.ai/laiducaivn-fpt-university/NER/runs/tjmjkx7n
View project at: https://wandb.ai/laiducaivn-fpt-university/NER
Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 12 other file(s)" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "Find logs at: ./wandb/run-20250605_133906-tjmjkx7n/logs" + ] + }, + "metadata": {} + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Generate final classification report\n", + "model.eval()\n", + "all_preds, all_true = [], []\n", + "\n", + "with torch.no_grad():\n", + " for x, y, lengths in tqdm(test_loader, desc=\"Generating classification report\"):\n", + " mask = (y != -1)\n", + " preds = model.decode(x, mask)\n", + " for pred_seq, true_seq, m in zip(preds, y, mask):\n", + " ts = true_seq[m].tolist()\n", + " all_preds.extend(pred_seq)\n", + " all_true.extend(ts)\n", + "\n", + "# Generate and print classification report\n", + "report = classification_report(all_true, all_preds, digits=4)\n", + "print(\"Classification Report:\\n\", report)\n" + ], + "metadata": { + "id": "CBwl-uTjaA1y", + "outputId": "7597a9ab-bd18-4530-e6d6-e335a974f01a", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Generating classification report: 100%|██████████| 211/211 [00:02<00:00, 101.37it/s]" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Classification Report:\n", + " precision recall f1-score support\n", + "\n", + " 0 0.9968 0.9983 0.9976 68476\n", + " 1 0.9903 0.9754 0.9828 1464\n", + " 2 0.9941 0.9781 0.9860 686\n", + " 3 0.8384 0.7471 0.7901 257\n", + " 4 0.8560 0.7605 0.8054 430\n", + " 5 0.9066 0.9073 0.9070 1241\n", + " 6 0.8613 0.8628 0.8620 554\n", + "\n", + " accuracy 0.9928 73108\n", + " macro avg 0.9205 0.8899 0.9044 73108\n", + "weighted avg 0.9927 0.9928 0.9927 73108\n", + "\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": 
[ + "from google.colab import drive\n", + "drive.mount('/content/drive')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "L1bDKxlyZRAy", + "outputId": "cf258765-6629-4d34-bf0c-431ba6575950" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Mounted at /content/drive\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "import shutil\n", + "shutil.copy('/content/checkpoints/best_epoch_13.pt', '/content/drive/My Drive')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 35 + }, + "id": "q4qCaBbrZcTZ", + "outputId": "57eff61e-f5ca-4597-e499-ea8b71d603a9" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'/content/drive/My Drive/best_epoch_13.pt'" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + } + }, + "metadata": {}, + "execution_count": 13 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Train/Valid/Test" + ], + "metadata": { + "id": "T0LAYLnU8ONv" + } + }, + { + "cell_type": "code", + "source": [ + "import torch\n", + "import torch.nn as nn\n", + "from torch.utils.data import Dataset, DataLoader\n", + "from torchcrf import CRF\n", + "from torch.nn.utils.rnn import pad_sequence\n", + "from sklearn.metrics import precision_recall_fscore_support, classification_report, accuracy_score\n", + "from sklearn.model_selection import train_test_split\n", + "from tqdm import tqdm\n", + "import wandb\n", + "import os\n", + "\n", + "# Initialize Weights & Biases\n", + "wandb.init(\n", + " project=\"NER\",\n", + " name=\"CRF_VLSP2016\",\n", + " config={\n", + " \"epochs\": 20,\n", + " \"batch_size\": 16,\n", + " \"learning_rate\": 1e-3,\n", + " # train/val/test ratios\n", + " \"train_ratio\": 0.70,\n", + " \"val_ratio\": 0.15,\n", + " \"test_ratio\": 0.15\n", + " }\n", + ")\n", + "config = wandb.config\n", + "\n", + 
"# Create splits: first separate out test, then train/val\n", + "emb_train_val, emb_test, lbl_train_val, lbl_test = train_test_split(\n", + " all_embeddings, all_labels,\n", + " test_size=config.test_ratio,\n", + " random_state=42\n", + ")\n", + "# Compute validation size relative to remaining (val_ratio / (train_ratio + val_ratio))\n", + "val_relative = config.val_ratio / (config.train_ratio + config.val_ratio)\n", + "emb_train, emb_val, lbl_train, lbl_val = train_test_split(\n", + " emb_train_val, lbl_train_val,\n", + " test_size=val_relative,\n", + " random_state=42\n", + ")\n", + "\n", + "class NERDataset(Dataset):\n", + " def __init__(self, embeddings, labels):\n", + " self.embeddings = embeddings\n", + " self.labels = labels\n", + "\n", + " def __len__(self):\n", + " return len(self.embeddings)\n", + "\n", + " def __getitem__(self, idx):\n", + " return self.embeddings[idx], self.labels[idx]\n", + "\n", + "\n", + "def collate_fn(batch):\n", + " embeddings, labels = zip(*batch)\n", + " lengths = [e.size(0) for e in embeddings]\n", + " max_len = max(lengths)\n", + "\n", + " padded_embs = torch.stack([\n", + " torch.cat([e, torch.zeros(max_len - e.size(0), e.size(1))]) for e in embeddings\n", + " ])\n", + " padded_labels = torch.stack([\n", + " torch.cat([l, torch.full((max_len - l.size(0),), -1, dtype=torch.long)]) for l in labels\n", + " ])\n", + " return padded_embs, padded_labels, lengths\n", + "\n", + "# Create DataLoaders\n", + "datasets = {\n", + " 'train': NERDataset(emb_train, lbl_train),\n", + " 'val': NERDataset(emb_val, lbl_val),\n", + " 'test': NERDataset(emb_test, lbl_test)\n", + "}\n", + "loaders = {\n", + " split: DataLoader(ds, batch_size=config.batch_size,\n", + " shuffle=(split=='train'), collate_fn=collate_fn)\n", + " for split, ds in datasets.items()\n", + "}\n", + "\n", + "# Model setup\n", + "num_tags = max(label.max().item() for label in all_labels) + 1\n", + "class CRFTagger(nn.Module):\n", + " def __init__(self, input_dim, num_tags):\n", 
+ " super().__init__()\n", + " self.hidden2tag = nn.Linear(input_dim, num_tags)\n", + " self.crf = CRF(num_tags, batch_first=True)\n", + "\n", + " def forward(self, x, labels, mask):\n", + " emissions = self.hidden2tag(x)\n", + " return -self.crf(emissions, labels, mask=mask, reduction='mean')\n", + "\n", + " def decode(self, x, mask):\n", + " emissions = self.hidden2tag(x)\n", + " return self.crf.decode(emissions, mask)\n", + "\n", + "model = CRFTagger(input_dim=emb_train[0].size(1), num_tags=num_tags)\n", + "optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)\n", + "\n", + "# Watch model parameters and gradients\n", + "wandb.watch(model, log=\"all\")\n", + "\n", + "# Create checkpoint directory\n", + "os.makedirs(\"checkpoints\", exist_ok=True)\n", + "best_val_f1 = 0.0\n", + "\n", + "# Evaluation helper\n", + "def evaluate(model, loader):\n", + " model.eval()\n", + " all_preds, all_true = [], []\n", + " with torch.no_grad():\n", + " for x, y, _ in loader:\n", + " mask = (y != -1)\n", + " preds = model.decode(x, mask)\n", + " for pred_seq, true_seq, m in zip(preds, y, mask):\n", + " true_labels = true_seq[m].tolist()\n", + " all_preds.extend(pred_seq)\n", + " all_true.extend(true_labels)\n", + " precision, recall, f1, _ = precision_recall_fscore_support(all_true, all_preds, average='macro', zero_division=0)\n", + " acc = accuracy_score(all_true, all_preds)\n", + " return precision, recall, f1, acc\n", + "\n", + "# Training loop\n", + "for epoch in range(1, config.epochs + 1):\n", + " model.train()\n", + " total_loss = 0.0\n", + " train_bar = tqdm(loaders['train'], desc=f\"Train Epoch {epoch}/{config.epochs}\")\n", + " for batch_idx, (x, y, _) in enumerate(train_bar, start=1):\n", + " mask = (y != -1)\n", + " loss = model(x, y, mask)\n", + " optimizer.zero_grad()\n", + " loss.backward()\n", + " optimizer.step()\n", + "\n", + " total_loss += loss.item()\n", + " train_bar.set_postfix(batch_loss=loss.item(), avg_loss=total_loss / batch_idx)\n", + 
"\n", + " avg_train_loss = total_loss / len(loaders['train'])\n", + " train_precision, train_recall, train_f1, train_acc = evaluate(model, loaders['train'])\n", + " val_precision, val_recall, val_f1, val_acc = evaluate(model, loaders['val'])\n", + "\n", + " # Print & log metrics for train and val\n", + " print(f\"Epoch {epoch}: train_loss={avg_train_loss:.4f}, train_f1={train_f1:.4f}, val_f1={val_f1:.4f}\")\n", + " wandb.log({\n", + " \"epoch\": epoch,\n", + " \"train_loss\": avg_train_loss,\n", + " \"train_precision\": train_precision,\n", + " \"train_recall\": train_recall,\n", + " \"train_f1\": train_f1,\n", + " \"train_acc\": train_acc,\n", + " \"val_precision\": val_precision,\n", + " \"val_recall\": val_recall,\n", + " \"val_f1\": val_f1,\n", + " \"val_acc\": val_acc\n", + " })\n", + "\n", + " # Save best model based on val_f1\n", + " if val_f1 > best_val_f1:\n", + " best_val_f1 = val_f1\n", + " ckpt_path = f\"checkpoints/best_epoch_{epoch}.pt\"\n", + " torch.save(model.state_dict(), ckpt_path)\n", + " wandb.save(ckpt_path)\n", + " print(f\"Saved improved model to {ckpt_path}\")\n", + "\n", + "# Final evaluation on test set\n", + "print(\"Evaluating on test set...\")\n", + "test_preds, test_true = [], []\n", + "model.eval()\n", + "with torch.no_grad():\n", + " for x, y, _ in loaders['test']:\n", + " mask = (y != -1)\n", + " preds = model.decode(x, mask)\n", + " for pred_seq, true_seq, m in zip(preds, y, mask):\n", + " test_true.extend(true_seq[m].tolist())\n", + " test_preds.extend(pred_seq)\n", + "\n", + "# Classification report\n", + "report_dict = classification_report(test_true, test_preds, output_dict=True)\n", + "print(classification_report(test_true, test_preds))\n", + "\n", + "# Log classification report table to wandb\n", + "columns = [\"label\", \"precision\", \"recall\", \"f1-score\", \"support\"]\n", + "rows = []\n", + "for label, metrics in report_dict.items():\n", + " if label not in [\"accuracy\", \"macro avg\", \"weighted avg\"]:\n", + " 
rows.append([label, metrics['precision'], metrics['recall'], metrics['f1-score'], metrics['support']])\n", + "# Add overall averages\n", + "rows.append([\"macro avg\", report_dict['macro avg']['precision'], report_dict['macro avg']['recall'], report_dict['macro avg']['f1-score'], report_dict['macro avg']['support']])\n", + "rows.append([\"weighted avg\", report_dict['weighted avg']['precision'], report_dict['weighted avg']['recall'], report_dict['weighted avg']['f1-score'], report_dict['weighted avg']['support']])\n", + "\n", + "table = wandb.Table(columns=columns, data=rows)\n", + "wandb.log({\"test_classification\": table})\n", + "\n", + "# Finish W&B run\n", + "wandb.finish()\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "28KlV4cy8SAL", + "outputId": "d4700801-e21c-4559-ff6a-50ebd3643cc4" + }, + "execution_count": null, + "outputs": [ + { + "data": { + "text/html": [ + "Tracking run with wandb version 0.19.11" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Run data is saved locally in /content/wandb/run-20250606_015838-r3oj54fe" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Syncing run CRF_VLSP2016 to Weights & Biases (docs)
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + " View project at https://wandb.ai/laiducaivn-fpt-university/NER" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + " View run at https://wandb.ai/laiducaivn-fpt-university/NER/runs/r3oj54fe" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 1/20: 100%|██████████| 736/736 [00:18<00:00, 39.34it/s, avg_loss=2.91, batch_loss=1.26]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 1: train_loss=2.9090, train_f1=0.8125, val_f1=0.8168\n", + "Saved improved model to checkpoints/best_epoch_1.pt\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 2/20: 100%|██████████| 736/736 [00:20<00:00, 35.77it/s, avg_loss=0.835, batch_loss=0.186]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 2: train_loss=0.8350, train_f1=0.8793, val_f1=0.8784\n", + "Saved improved model to checkpoints/best_epoch_2.pt\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 3/20: 100%|██████████| 736/736 [00:19<00:00, 37.89it/s, avg_loss=0.6, batch_loss=0.803]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 3: train_loss=0.6004, train_f1=0.8985, val_f1=0.8891\n", + "Saved improved model to checkpoints/best_epoch_3.pt\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 4/20: 100%|██████████| 736/736 [00:19<00:00, 37.87it/s, avg_loss=0.485, batch_loss=0.377]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 4: train_loss=0.4847, train_f1=0.9165, val_f1=0.9112\n", + "Saved improved model to checkpoints/best_epoch_4.pt\n" + ] + }, + { + 
"output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 5/20: 100%|██████████| 736/736 [00:19<00:00, 38.52it/s, avg_loss=0.413, batch_loss=0.0734]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 5: train_loss=0.4129, train_f1=0.9088, val_f1=0.8904\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 6/20: 100%|██████████| 736/736 [00:19<00:00, 37.70it/s, avg_loss=0.365, batch_loss=0.779]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 6: train_loss=0.3645, train_f1=0.9327, val_f1=0.9151\n", + "Saved improved model to checkpoints/best_epoch_6.pt\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 7/20: 100%|██████████| 736/736 [00:19<00:00, 38.16it/s, avg_loss=0.33, batch_loss=1.44]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 7: train_loss=0.3297, train_f1=0.9382, val_f1=0.9241\n", + "Saved improved model to checkpoints/best_epoch_7.pt\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 8/20: 100%|██████████| 736/736 [00:19<00:00, 37.06it/s, avg_loss=0.295, batch_loss=0.156]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 8: train_loss=0.2948, train_f1=0.9432, val_f1=0.9167\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 9/20: 100%|██████████| 736/736 [00:18<00:00, 38.98it/s, avg_loss=0.276, batch_loss=0.119]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 9: train_loss=0.2757, train_f1=0.9477, val_f1=0.9247\n", + "Saved improved model to checkpoints/best_epoch_9.pt\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 10/20: 100%|██████████| 736/736 [00:18<00:00, 39.42it/s, avg_loss=0.254, batch_loss=0.141]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ 
+ "Epoch 10: train_loss=0.2535, train_f1=0.9496, val_f1=0.9263\n", + "Saved improved model to checkpoints/best_epoch_10.pt\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 11/20: 100%|██████████| 736/736 [00:19<00:00, 38.60it/s, avg_loss=0.238, batch_loss=0.104]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 11: train_loss=0.2382, train_f1=0.9517, val_f1=0.9217\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 12/20: 100%|██████████| 736/736 [00:19<00:00, 38.10it/s, avg_loss=0.226, batch_loss=0.39]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 12: train_loss=0.2255, train_f1=0.9579, val_f1=0.9239\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 13/20: 100%|██████████| 736/736 [00:19<00:00, 37.54it/s, avg_loss=0.214, batch_loss=0.0747]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 13: train_loss=0.2142, train_f1=0.9555, val_f1=0.9213\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 14/20: 100%|██████████| 736/736 [00:19<00:00, 37.30it/s, avg_loss=0.204, batch_loss=0.062]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 14: train_loss=0.2040, train_f1=0.9606, val_f1=0.9255\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 15/20: 100%|██████████| 736/736 [00:19<00:00, 37.20it/s, avg_loss=0.195, batch_loss=0.0167]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 15: train_loss=0.1949, train_f1=0.9634, val_f1=0.9196\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 16/20: 100%|██████████| 736/736 [00:19<00:00, 37.11it/s, avg_loss=0.187, batch_loss=0.333]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 16: 
train_loss=0.1870, train_f1=0.9638, val_f1=0.9215\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 17/20: 100%|██████████| 736/736 [00:19<00:00, 37.21it/s, avg_loss=0.181, batch_loss=0.0567]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 17: train_loss=0.1811, train_f1=0.9580, val_f1=0.9179\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 18/20: 100%|██████████| 736/736 [00:19<00:00, 36.90it/s, avg_loss=0.175, batch_loss=0.554]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 18: train_loss=0.1747, train_f1=0.9669, val_f1=0.9237\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 19/20: 100%|██████████| 736/736 [00:19<00:00, 37.49it/s, avg_loss=0.169, batch_loss=0.0126]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 19: train_loss=0.1689, train_f1=0.9685, val_f1=0.9231\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Train Epoch 20/20: 100%|██████████| 736/736 [00:20<00:00, 36.24it/s, avg_loss=0.164, batch_loss=0.252]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 20: train_loss=0.1635, train_f1=0.9719, val_f1=0.9237\n", + "Evaluating on test set...\n", + " precision recall f1-score support\n", + "\n", + " 0 1.00 1.00 1.00 51036\n", + " 1 0.99 0.98 0.99 1112\n", + " 2 0.98 0.99 0.99 506\n", + " 3 0.83 0.77 0.80 180\n", + " 4 0.83 0.73 0.78 291\n", + " 5 0.89 0.91 0.90 939\n", + " 6 0.86 0.85 0.85 428\n", + "\n", + " accuracy 0.99 54492\n", + " macro avg 0.91 0.89 0.90 54492\n", + "weighted avg 0.99 0.99 0.99 54492\n", + "\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "

Run history:


epoch▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
train_acc▁▄▅▅▅▆▆▇▇▇▇▇▇███▇███
train_f1▁▄▅▆▅▆▇▇▇▇▇▇▇███▇███
train_loss█▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_precision▁▃▄▃▅▆▅▆▆▇▆▇▇▇███▇▇█
train_recall▁▄▅▆▅▆▇▇▇▇▇▇▇██▇▇███
val_acc▁▅▆▇▆▇█▇████████████
val_f1▁▅▆▇▆▇█▇████████▇███
val_precision▁▅▄▃▅▇▆▇▇▇▅▆▆▇▇▇█▆▆▇
val_recall▁▅▆█▆▇█▇██████▇▇▇███

Run summary:


epoch20
train_acc0.99748
train_f10.97193
train_loss0.16354
train_precision0.97333
train_recall0.9706
val_acc0.99327
val_f10.92372
val_precision0.93356
val_recall0.91553

" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + " View run CRF_VLSP2016 at: https://wandb.ai/laiducaivn-fpt-university/NER/runs/r3oj54fe
View project at: https://wandb.ai/laiducaivn-fpt-university/NER
Synced 5 W&B file(s), 1 media file(s), 2 artifact file(s) and 8 other file(s)" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "Find logs at: ./wandb/run-20250606_015838-r3oj54fe/logs" + ] + }, + "metadata": {} + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Train CRF With Keras" + ], + "metadata": { + "id": "LV5FdgTTXFv3" + } + }, + { + "cell_type": "code", + "source": [ + "import numpy as np\n", + "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", + "\n", + "X = [emb.numpy() for emb in all_embeddings]\n", + "y = [label.numpy() for label in all_labels]\n", + "\n", + "max_len = max(len(seq) for seq in X)\n", + "num_tags = max(label.max().item() for label in all_labels) + 1\n", + "\n", + "X_padded = pad_sequences(X, maxlen=max_len, dtype='float32', padding='post')\n", + "y_padded = pad_sequences(y, maxlen=max_len, value=-1)\n" + ], + "metadata": { + "id": "l_m8_-UgHlxo" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "import tensorflow as tf\n", + "import tensorflow_addons as tfa\n", + "from tensorflow.keras import layers, Model, Input\n", + "\n", + "input_dim = X_padded.shape[2]\n", + "\n", + "inputs = Input(shape=(max_len, input_dim), name=\"input_embedding\")\n", + "masking = layers.Masking(mask_value=0.0)(inputs)\n", + "dense = layers.Dense(num_tags)(masking)\n", + "\n", + "# CRF Layer\n", + "crf = tfa.layers.CRF(num_tags)\n", + "outputs = crf(dense)\n", + "\n", + "model = Model(inputs=inputs, outputs=outputs)\n", + "model.compile(optimizer='adam', loss=crf.loss, metrics=[crf.accuracy])\n", + "model.summary()\n" + ], + "metadata": { + "id": "kYrGkzFPXMBH" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "model.fit(\n", + " X_padded, y_padded,\n", + " batch_size=32,\n", + " epochs=5,\n", + " validation_split=0.1,\n", + " verbose=1\n", + ")\n" + ], + 
"metadata": { + "id": "pyxVhvn3XQ5q" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "pred = model.predict(X_padded)\n", + "pred_labels = np.argmax(pred, axis=-1)\n", + "\n", + "from sklearn.metrics import classification_report\n", + "\n", + "y_true_flat = []\n", + "y_pred_flat = []\n", + "\n", + "for i in range(len(y_padded)):\n", + " for j in range(max_len):\n", + " if y_padded[i][j] != -1:\n", + " y_true_flat.append(y_padded[i][j])\n", + " y_pred_flat.append(pred_labels[i][j])\n", + "\n", + "print(classification_report(y_true_flat, y_pred_flat, digits=4))\n" + ], + "metadata": { + "id": "zT7BtMiVXSMc" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Train Random Forest" + ], + "metadata": { + "id": "1VrZlknUb6cn" + } + }, + { + "cell_type": "code", + "source": [ + "import torch\n", + "import numpy as np\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "X_flat = []\n", + "y_flat = []\n", + "\n", + "for emb_seq, label_seq in zip(all_embeddings, all_labels):\n", + " for emb, label in zip(emb_seq, label_seq):\n", + " X_flat.append(emb.numpy()) # emb: [768]\n", + " y_flat.append(label.item()) # label: int\n", + "\n", + "X_flat = np.array(X_flat) # [N, 768]\n", + "y_flat = np.array(y_flat) # [N]\n" + ], + "metadata": { + "id": "VK2nmLo0b8d3" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "print(X_flat.shape)\n", + "print(y_flat.shape)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "GeqgiB4CtzA1", + "outputId": "452979ff-25be-49a9-c809-4acffd3b3c54" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "(368172, 768)\n", + "(368172,)\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Kiểm tra độ lệch data\n", + "unique_values, counts = np.unique(y_flat, return_counts=True)\n", + "\n", 
+ "# In ra từng giá trị và số lần xuất hiện\n", + "for val, count in zip(unique_values, counts):\n", + " print(f\"Label {val}: {count} times\")\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "VeSfRzgOm6w-", + "outputId": "163a877f-9860-4b3a-e850-f6d8df9c6cfe" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Label 0: 344986 times\n", + "Label 1: 7450 times\n", + "Label 2: 3504 times\n", + "Label 3: 1204 times\n", + "Label 4: 2050 times\n", + "Label 5: 6211 times\n", + "Label 6: 2767 times\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X_flat, y_flat, test_size=0.2, random_state=42, stratify=y_flat)\n" + ], + "metadata": { + "id": "AOOUix-NcERf" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "import lightgbm as lgb\n", + "from sklearn.metrics import accuracy_score, f1_score, classification_report\n", + "\n", + "# Khởi tạo wandb project\n", + "wandb.init(project=\"NER\", name=\"RandomForest_100Trees_VLSP2016\")\n", + "\n", + "# Tạo Dataset cho LightGBM\n", + "train_data = lgb.Dataset(X_train, label=y_train)\n", + "test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)\n", + "\n", + "# Cấu hình tham số LightGBM (Random Forest mode)\n", + "params = {\n", + " \"objective\": \"multiclass\", # nếu multiclass classification\n", + " \"num_class\": len(np.unique(y_train)),\n", + " \"metric\": \"multi_logloss\",\n", + " \"boosting_type\": \"rf\", # random forest mode trong LightGBM\n", + " \"num_leaves\": 31,\n", + " \"bagging_freq\": 1,\n", + " \"bagging_fraction\": 0.8,\n", + " \"feature_fraction\": 0.8,\n", + " \"bagging_seed\": 42,\n", + " \"verbose\": -1,\n", + " \"seed\": 42,\n", + " \"is_unbalance\": True\n", + "}\n", + "\n", + "\n", + "\n", + "# Train model, tích hợp wandb callback để log metrics\n", + "model = lgb.train(\n", 
+ " params,\n", + " train_data,\n", + " num_boost_round=100,\n", + " valid_sets=[train_data, test_data],\n", + " valid_names=[\"train\", \"test\"],\n", + " callbacks=[wandb.lightgbm.wandb_callback()]\n", + ")\n", + "\n", + "# Dự đoán trên test set\n", + "y_pred_prob = model.predict(X_test)\n", + "y_pred = np.argmax(y_pred_prob, axis=1)\n", + "\n", + "# Ánh xạ số về nhãn tên entity\n", + "label_map = {\n", + " 0: 'O',\n", + " 1: 'B-PER',\n", + " 2: 'I-PER',\n", + " 3: 'B-ORG',\n", + " 4: 'I-ORG',\n", + " 5: 'B-LOC',\n", + " 6: 'I-LOC'\n", + "}\n", + "\n", + "# Chuyển y_test và y_pred sang nhãn gốc\n", + "y_test_labels = [label_map[i] for i in y_test]\n", + "y_pred_labels = [label_map[i] for i in y_pred]\n", + "\n", + "# In classification report với nhãn thật\n", + "print(\"\\nClassification Report (theo label gốc):\")\n", + "print(classification_report(y_test_labels, y_pred_labels, digits=4))\n", + "\n", + "# Tạo bảng để log classification report\n", + "report_dict = classification_report(y_test_labels, y_pred_labels, output_dict=True)\n", + "table = wandb.Table(columns=[\"Label\", \"Precision\", \"Recall\", \"F1-Score\", \"Support\"])\n", + "\n", + "for label, scores in report_dict.items():\n", + " if isinstance(scores, dict): # Bỏ các dòng như 'accuracy'\n", + " table.add_data(\n", + " label,\n", + " scores[\"precision\"],\n", + " scores[\"recall\"],\n", + " scores[\"f1-score\"],\n", + " scores[\"support\"]\n", + " )\n", + "\n", + "wandb.log({\"Classification Report\": table})\n", + "\n", + "\n", + "# Kết thúc wandb run\n", + "wandb.finish()\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 721 + }, + "id": "G6PUbpyPgF84", + "outputId": "6efc696f-1b6f-4cea-da68-c25e22bed461" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "Tracking run with wandb version 0.19.11" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + 
"data": { + "text/plain": [ + "" + ], + "text/html": [ + "Run data is saved locally in /content/wandb/run-20250605_114334-x4x6fpo4" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "Syncing run RandomForest_100Trees_VLSP2016 to Weights & Biases (docs)
" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + " View project at https://wandb.ai/laiducaivn-fpt-university/NER" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + " View run at https://wandb.ai/laiducaivn-fpt-university/NER/runs/x4x6fpo4" + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "Classification Report (theo label gốc):\n", + " precision recall f1-score support\n", + "\n", + " B-LOC 0.4461 0.6167 0.5177 1242\n", + " B-ORG 0.2841 0.6224 0.3901 241\n", + " B-PER 0.5859 0.8423 0.6911 1490\n", + " I-LOC 0.2812 0.6401 0.3907 553\n", + " I-ORG 0.2350 0.4122 0.2994 410\n", + " I-PER 0.6530 0.7489 0.6977 701\n", + " O 0.9914 0.9550 0.9728 68998\n", + "\n", + " accuracy 0.9386 73635\n", + " macro avg 0.4967 0.6911 0.5657 73635\n", + "weighted avg 0.9589 0.9386 0.9468 73635\n", + "\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "

Run history:


iteration▁▁▁▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇██
test_multi_logloss█▆▄▄▃▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_multi_logloss█▇▅▄▄▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

Run summary:


iteration99

" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + " View run RandomForest_100Trees_VLSP2016 at: https://wandb.ai/laiducaivn-fpt-university/NER/runs/x4x6fpo4
View project at: https://wandb.ai/laiducaivn-fpt-university/NER
Synced 5 W&B file(s), 1 media file(s), 2 artifact file(s) and 0 other file(s)" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "Find logs at: ./wandb/run-20250605_114334-x4x6fpo4/logs" + ] + }, + "metadata": {} + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Lưu data" + ], + "metadata": { + "id": "4Ppa-bdT8r2v" + } + }, + { + "cell_type": "code", + "source": [ + "def save_tensors(all_embeddings, all_labels, embed_path='embeddings.pt', label_path='labels.pt'):\n", + " torch.save(all_embeddings, embed_path)\n", + " torch.save(all_labels, label_path)\n", + " print(f\"Saved embeddings to {embed_path} and labels to {label_path}\")" + ], + "metadata": { + "id": "s9GulKoGqx6d" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from google.colab import drive\n", + "import shutil\n", + "\n", + "# Gọi hàm đã viết\n", + "save_tensors(all_embeddings, all_labels)\n", + "\n", + "# Mount và tải lên Drive\n", + "drive.mount('/content/drive')\n", + "shutil.copy('embeddings.pt', '/content/drive/My Drive')\n", + "shutil.copy('labels.pt', '/content/drive/My Drive')\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 70 + }, + "id": "AGAJZH_h8ve6", + "outputId": "13849039-adb8-40e8-ed20-544f65d018f8" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Saved embeddings to embeddings.pt and labels to labels.pt\n", + "Mounted at /content/drive\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'/content/drive/My Drive/labels.pt'" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + } + }, + "metadata": {}, + "execution_count": 14 + } + ] + }, + { + "cell_type": "code", + "source": [ + "model.save_model('lightgbm_rf_model.txt')\n", + "shutil.copy('lightgbm_rf_model.txt', '/content/drive/My 
Drive')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 35 + }, + "id": "ESWu8QI59dwl", + "outputId": "7eba9b3d-4c54-48ca-99eb-76771c01140e" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'/content/drive/My Drive/lightgbm_rf_model.txt'" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + } + }, + "metadata": {}, + "execution_count": 16 + } + ] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "BKx8yPUE-UHS" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/notebooks/Kien_RF_lightgbm.ipynb b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/notebooks/Kien_RF_lightgbm.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..91816b305704056445c06ec42b1de606a49b9d23 --- /dev/null +++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/notebooks/Kien_RF_lightgbm.ipynb @@ -0,0 +1,741 @@ +{ + "cells": [ + { + "cell_type": "code", + "id": "10ec017cb658e125", + "metadata": { + "ExecuteTime": { + "end_time": "2025-06-11T00:21:33.244538Z", + "start_time": "2025-06-11T00:21:05.317283Z" + } + }, + "source": [ + "import pandas as pd\n", + "\n", + "splits = {'train': 'data/train-00000-of-00001-b0417886a268b83a.parquet', 'valid': 'data/valid-00000-of-00001-846411c236133ba3.parquet'}\n", + "df_train = pd.read_parquet(\"hf://datasets/datnth1709/VLSP2016-NER-data/\" + splits[\"train\"])\n", + "df_valid = pd.read_parquet(\"hf://datasets/datnth1709/VLSP2016-NER-data/\" + splits[\"valid\"])\n", + "df = pd.concat([df_train, df_valid]).reset_index(drop=True)" + ], + "outputs": [], + "execution_count": 1 
+ }, + { + "cell_type": "code", + "id": "c533c55a2ad7b16e", + "metadata": { + "ExecuteTime": { + "end_time": "2025-06-11T00:21:33.499341Z", + "start_time": "2025-06-11T00:21:33.262933Z" + } + }, + "source": [ + "# Tạo thêm các cột khác\n", + "def join_tokens(tokens):\n", + " text = ' '.join(tokens)\n", + " return text\n", + "\n", + "def reform_raw_text(tokens):\n", + " text = ' '.join(tokens)\n", + " return text.replace(\"_\", \" \")\n", + "\n", + "def label(x):\n", + " return [id_tag[int(i)] for i in x]\n", + "\n", + "def replace_7_8(lst):\n", + " return [0 if x in (7, 8) else x for x in lst]\n", + "\n", + "\n", + "tag_id = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6}\n", + "id_tag = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}\n", + "\n", + "\n", + "df['ner_tags'] = df['ner_tags'].apply(replace_7_8)\n", + "df['text_withseg'] = df['tokens'].apply(join_tokens)\n", + "df['text_raw'] = df['tokens'].apply(reform_raw_text)\n", + "df[\"ner_labels\"] = df.ner_tags.apply(label)\n", + "df.columns = ['tokens', 'id', 'seg_text', 'raw_text', 'labels']\n", + "df\n" + ], + "outputs": [ + { + "data": { + "text/plain": [ + " tokens \\\n", + "0 [Không_khí, thật, náo_nhiệt, .] \n", + "1 [Chị, Lãnh, và, Xăng, ra, đi, ,, mình, đứng, n... \n", + "2 [Suy_tính, mãi, ,, khóc, mãi, rồi, Phúc, lấy, ... \n", + "3 [Hoà, bảo, hồi, mới, qua, đâu, có, biết, nấu_n... \n", + "4 [Nhật_ký, của, thuyền_viên, .] \n", + "... ... \n", + "16853 [Nghe, thấy, đã, ghê_ghê, nhưng, Nhiêu, chưa, ... \n", + "16854 [Nhưng, mọi, chuyện, không, dừng, ở, đó, .] \n", + "16855 [Hoà, bảo, thời_gian, đầu, mặc_cảm, lắm, ,, ở,... \n", + "16856 [Biết_bao, người, đã, tình_nguyện, hiến_dâng, ... \n", + "16857 [Trên, đây, mới, là, “, thành_tích, ”, tiêu, t... \n", + "\n", + " id \\\n", + "0 [0, 0, 0, 0] \n", + "1 [0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "2 [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ... 
\n", + "3 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, ... \n", + "4 [0, 0, 0, 0] \n", + "... ... \n", + "16853 [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, ... \n", + "16854 [0, 0, 0, 0, 0, 0, 0, 0] \n", + "16855 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "16856 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] \n", + "16857 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "\n", + " seg_text \\\n", + "0 Không_khí thật náo_nhiệt . \n", + "1 Chị Lãnh và Xăng ra đi , mình đứng nhìn hai ch... \n", + "2 Suy_tính mãi , khóc mãi rồi Phúc lấy ra tờ giấ... \n", + "3 Hoà bảo hồi mới qua đâu có biết nấu_nướng gì ,... \n", + "4 Nhật_ký của thuyền_viên . \n", + "... ... \n", + "16853 Nghe thấy đã ghê_ghê nhưng Nhiêu chưa được tườ... \n", + "16854 Nhưng mọi chuyện không dừng ở đó . \n", + "16855 Hoà bảo thời_gian đầu mặc_cảm lắm , ở trong nh... \n", + "16856 Biết_bao người đã tình_nguyện hiến_dâng cả cuộ... \n", + "16857 Trên đây mới là “ thành_tích ” tiêu tiền của m... \n", + "\n", + " raw_text \\\n", + "0 Không khí thật náo nhiệt . \n", + "1 Chị Lãnh và Xăng ra đi , mình đứng nhìn hai ch... \n", + "2 Suy tính mãi , khóc mãi rồi Phúc lấy ra tờ giấ... \n", + "3 Hoà bảo hồi mới qua đâu có biết nấu nướng gì ,... \n", + "4 Nhật ký của thuyền viên . \n", + "... ... \n", + "16853 Nghe thấy đã ghê ghê nhưng Nhiêu chưa được tườ... \n", + "16854 Nhưng mọi chuyện không dừng ở đó . \n", + "16855 Hoà bảo thời gian đầu mặc cảm lắm , ở trong nh... \n", + "16856 Biết bao người đã tình nguyện hiến dâng cả cuộ... \n", + "16857 Trên đây mới là “ thành tích ” tiêu tiền của m... \n", + "\n", + " labels \n", + "0 [O, O, O, O] \n", + "1 [O, B-PER, O, B-PER, O, O, O, O, O, O, O, O, O... \n", + "2 [O, O, O, O, O, O, B-PER, O, O, O, O, O, O, O,... \n", + "3 [B-PER, O, O, O, O, O, O, O, O, O, O, O, O, B-... \n", + "4 [O, O, O, O] \n", + "... ... \n", + "16853 [O, O, O, O, O, B-PER, O, O, O, O, O, O, O, O,... 
\n", + "16854 [O, O, O, O, O, O, O, O] \n", + "16855 [B-PER, O, O, O, O, O, O, O, O, O, O, O, O, O,... \n", + "16856 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O] \n", + "16857 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ... \n", + "\n", + "[16858 rows x 5 columns]" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
tokensidseg_textraw_textlabels
0[Không_khí, thật, náo_nhiệt, .][0, 0, 0, 0]Không_khí thật náo_nhiệt .Không khí thật náo nhiệt .[O, O, O, O]
1[Chị, Lãnh, và, Xăng, ra, đi, ,, mình, đứng, n...[0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...Chị Lãnh và Xăng ra đi , mình đứng nhìn hai ch...Chị Lãnh và Xăng ra đi , mình đứng nhìn hai ch...[O, B-PER, O, B-PER, O, O, O, O, O, O, O, O, O...
2[Suy_tính, mãi, ,, khóc, mãi, rồi, Phúc, lấy, ...[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...Suy_tính mãi , khóc mãi rồi Phúc lấy ra tờ giấ...Suy tính mãi , khóc mãi rồi Phúc lấy ra tờ giấ...[O, O, O, O, O, O, B-PER, O, O, O, O, O, O, O,...
3[Hoà, bảo, hồi, mới, qua, đâu, có, biết, nấu_n...[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, ...Hoà bảo hồi mới qua đâu có biết nấu_nướng gì ,...Hoà bảo hồi mới qua đâu có biết nấu nướng gì ,...[B-PER, O, O, O, O, O, O, O, O, O, O, O, O, B-...
4[Nhật_ký, của, thuyền_viên, .][0, 0, 0, 0]Nhật_ký của thuyền_viên .Nhật ký của thuyền viên .[O, O, O, O]
..................
16853[Nghe, thấy, đã, ghê_ghê, nhưng, Nhiêu, chưa, ...[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...Nghe thấy đã ghê_ghê nhưng Nhiêu chưa được tườ...Nghe thấy đã ghê ghê nhưng Nhiêu chưa được tườ...[O, O, O, O, O, B-PER, O, O, O, O, O, O, O, O,...
16854[Nhưng, mọi, chuyện, không, dừng, ở, đó, .][0, 0, 0, 0, 0, 0, 0, 0]Nhưng mọi chuyện không dừng ở đó .Nhưng mọi chuyện không dừng ở đó .[O, O, O, O, O, O, O, O]
16855[Hoà, bảo, thời_gian, đầu, mặc_cảm, lắm, ,, ở,...[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...Hoà bảo thời_gian đầu mặc_cảm lắm , ở trong nh...Hoà bảo thời gian đầu mặc cảm lắm , ở trong nh...[B-PER, O, O, O, O, O, O, O, O, O, O, O, O, O,...
16856[Biết_bao, người, đã, tình_nguyện, hiến_dâng, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]Biết_bao người đã tình_nguyện hiến_dâng cả cuộ...Biết bao người đã tình nguyện hiến dâng cả cuộ...[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]
16857[Trên, đây, mới, là, “, thành_tích, ”, tiêu, t...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...Trên đây mới là “ thành_tích ” tiêu tiền của m...Trên đây mới là “ thành tích ” tiêu tiền của m...[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
\n", + "

16858 rows × 5 columns

\n", + "
" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 2 + }, + { + "cell_type": "code", + "id": "14d9b9fae58b7173", + "metadata": { + "ExecuteTime": { + "end_time": "2025-06-11T00:21:59.373985Z", + "start_time": "2025-06-11T00:21:34.524025Z" + } + }, + "source": [ + "import torch\n", + "from transformers import AutoTokenizer, AutoModel\n", + "from tqdm import tqdm\n", + "\n", + "# Load PhoBERT tokenizer và model\n", + "tokenizer = AutoTokenizer.from_pretrained(\"vinai/phobert-base\", use_fast=False)\n", + "model = AutoModel.from_pretrained(\"vinai/phobert-base\")\n", + "model.eval()" + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "cuda\n" + ] + }, + { + "data": { + "text/plain": [ + "RobertaModel(\n", + " (embeddings): RobertaEmbeddings(\n", + " (word_embeddings): Embedding(64001, 768, padding_idx=1)\n", + " (position_embeddings): Embedding(258, 768, padding_idx=1)\n", + " (token_type_embeddings): Embedding(1, 768)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (encoder): RobertaEncoder(\n", + " (layer): ModuleList(\n", + " (0-11): 12 x RobertaLayer(\n", + " (attention): RobertaAttention(\n", + " (self): RobertaSdpaSelfAttention(\n", + " (query): Linear(in_features=768, out_features=768, bias=True)\n", + " (key): Linear(in_features=768, out_features=768, bias=True)\n", + " (value): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (output): RobertaSelfOutput(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (intermediate): RobertaIntermediate(\n", + " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", + " 
(intermediate_act_fn): GELUActivation()\n", + " )\n", + " (output): RobertaOutput(\n", + " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " )\n", + " )\n", + " (pooler): RobertaPooler(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (activation): Tanh()\n", + " )\n", + ")" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 3 + }, + { + "cell_type": "code", + "id": "a47ec382649c3036", + "metadata": { + "ExecuteTime": { + "end_time": "2025-06-11T00:23:23.888583Z", + "start_time": "2025-06-11T00:23:23.885204Z" + } + }, + "source": [ + "# Hàm gộp các embedding vectors của token bị tách ra khi qua SentencePiece\n", + "def group_embeddings(tokens, embeddings):\n", + " word_embeddings = []\n", + " current_vecs = []\n", + "\n", + " for token, emb in zip(tokens, embeddings):\n", + " if token in [\"\", \"\"]:\n", + " continue\n", + "\n", + " if token.endswith(\"@@\"):\n", + " current_vecs.append(emb)\n", + " else:\n", + " current_vecs.append(emb)\n", + " word_emb = torch.mean(torch.stack(current_vecs), dim=0)\n", + " word_embeddings.append(word_emb)\n", + " current_vecs = []\n", + "\n", + " if current_vecs: # Trong trường hợp sót lại cuối câu\n", + " word_emb = torch.mean(torch.stack(current_vecs), dim=0)\n", + " word_embeddings.append(word_emb)\n", + "\n", + " return word_embeddings" + ], + "outputs": [], + "execution_count": 4 + }, + { + "cell_type": "code", + "id": "f8c0ad89ae81b0c", + "metadata": { + "ExecuteTime": { + "end_time": "2025-06-11T00:25:52.567135Z", + "start_time": "2025-06-11T00:23:56.155322Z" + } + }, + "source": [ + "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", + "model.to(device)\n", + "\n", + "all_embeddings = [] # list of [seq_len_i, 768] tensors\n", + "all_labels = 
[] # list of [seq_len_i,] tensors\n", + "len_em = []\n", + "\n", + "# count = 0\n", + "\n", + "for i, row in df.iterrows():\n", + "\n", + " # count += 1\n", + " # if count == 500:\n", + " # break\n", + "\n", + " # Truy cập phần tử từng dòng\n", + " sentence = row['seg_text']\n", + " gold_labels = row[\"id\"]\n", + "\n", + " # Cho sentence đi qua SentencePiece\n", + " input_ids = tokenizer.encode(sentence, return_tensors=\"pt\").to(device)\n", + "\n", + " tokens = tokenizer.convert_ids_to_tokens(input_ids[0].to(device))\n", + "\n", + " # Encode tạo embeddings\n", + " with torch.no_grad():\n", + " outputs = model(input_ids)\n", + " last_hidden_state = outputs.last_hidden_state.squeeze(0)\n", + "\n", + " # Gộp các embeddings đã bị tách khi đi qua SentencePiece\n", + " word_embeds = group_embeddings(tokens, last_hidden_state)\n", + "\n", + " # Kiểm tra số lượng embeddings và số lượng labels\n", + " if len(word_embeds) != len(gold_labels):\n", + " continue\n", + "\n", + " # Thêm vào list tổng / Tới đây là data đã sẵn sàng cho training\n", + " all_embeddings.append(torch.stack(word_embeds))\n", + " all_labels.append(torch.tensor(gold_labels))" + ], + "outputs": [], + "execution_count": 6 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-06-11T00:35:23.255306Z", + "start_time": "2025-06-11T00:35:23.252026Z" + } + }, + "cell_type": "code", + "source": "# We skip 43 data since they aren't convertable", + "id": "c3e406ad994802be", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-43\n" + ] + } + ], + "execution_count": 15 + }, + { + "cell_type": "code", + "id": "cadc3a861025b3b9", + "metadata": { + "ExecuteTime": { + "end_time": "2025-06-11T00:36:18.857012Z", + "start_time": "2025-06-11T00:36:08.257408Z" + } + }, + "source": [ + "import numpy as np\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "X_flat = []\n", + "y_flat = []\n", + "\n", + "for emb_seq, label_seq in zip(all_embeddings, all_labels):\n", 
+ " for emb, label in zip(emb_seq, label_seq):\n", + " X_flat.append(emb.cpu().numpy()) # emb: [768]\n", + " y_flat.append(label.item()) # label: int\n", + "\n", + "X_flat = np.array(X_flat) # [N, 768]\n", + "y_flat = np.array(y_flat) # [N]\n" + ], + "outputs": [], + "execution_count": 16 + }, + { + "cell_type": "code", + "id": "52a0fe72a50d4f73", + "metadata": { + "ExecuteTime": { + "end_time": "2025-06-11T00:39:58.211159Z", + "start_time": "2025-06-11T00:39:58.208074Z" + } + }, + "source": [ + "print(X_flat[0].shape)\n", + "print(y_flat.shape)" + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(768,)\n", + "(368172,)\n" + ] + } + ], + "execution_count": 19 + }, + { + "cell_type": "code", + "id": "d6275df555f0c4c3", + "metadata": { + "ExecuteTime": { + "end_time": "2025-06-11T00:42:00.129778Z", + "start_time": "2025-06-11T00:42:00.096986Z" + } + }, + "source": [ + "# Kiểm tra độ lệch data\n", + "unique_values, counts = np.unique(y_flat, return_counts=True)\n", + "\n", + "# In ra từng giá trị và số lần xuất hiện\n", + "for val, count in zip(unique_values, counts):\n", + " print(f\"Label {val}: {count} times\")\n" + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Label 0: 344986 times\n", + "Label 1: 7450 times\n", + "Label 2: 3504 times\n", + "Label 3: 1204 times\n", + "Label 4: 2050 times\n", + "Label 5: 6211 times\n", + "Label 6: 2767 times\n" + ] + } + ], + "execution_count": 24 + }, + { + "cell_type": "code", + "id": "664020977ba9a1e2", + "metadata": { + "ExecuteTime": { + "end_time": "2025-06-11T00:42:03.350616Z", + "start_time": "2025-06-11T00:42:02.915680Z" + } + }, + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X_flat, y_flat, test_size=0.2, random_state=42, stratify=y_flat)\n" + ], + "outputs": [], + "execution_count": 25 + }, + { + "cell_type": "code", + "id": "d4acda9c7cae3214", + "metadata": { + "ExecuteTime": { + "end_time": 
"2025-06-11T00:42:25.235471Z", + "start_time": "2025-06-11T00:42:16.769480Z" + } + }, + "source": [ + "import lightgbm as lgb\n", + "from sklearn.metrics import accuracy_score, f1_score, classification_report\n", + "\n", + "\n", + "# Tạo Dataset cho LightGBM\n", + "train_data = lgb.Dataset(X_train, label=y_train)\n", + "test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)\n", + "\n", + "# Cấu hình tham số LightGBM (Random Forest mode)\n", + "params = {\n", + " \"objective\": \"multiclass\", # nếu multiclass classification\n", + " \"num_class\": len(np.unique(y_train)),\n", + " \"metric\": \"multi_logloss\",\n", + " \"boosting_type\": \"rf\", # random forest mode trong LightGBM\n", + " \"num_leaves\": 31,\n", + " \"bagging_freq\": 1,\n", + " \"bagging_fraction\": 0.8,\n", + " \"feature_fraction\": 0.8,\n", + " \"bagging_seed\": 42,\n", + " \"verbose\": -1,\n", + " \"seed\": 42,\n", + " \"is_unbalance\": True\n", + "}\n", + "\n", + "\n", + "\n", + "# Train model, tích hợp wandb callback để log metrics\n", + "model = lgb.train(\n", + " params,\n", + " train_data,\n", + " num_boost_round=2,\n", + " valid_sets=[train_data, test_data],\n", + " valid_names=[\"train\", \"test\"]\n", + ")\n", + "\n", + "# Dự đoán trên test set\n", + "y_pred_prob = model.predict(X_test)\n", + "y_pred = np.argmax(y_pred_prob, axis=1)\n", + "\n", + "# Ánh xạ số về nhãn tên entity\n", + "label_map = {\n", + " 0: 'O',\n", + " 1: 'B-PER',\n", + " 2: 'I-PER',\n", + " 3: 'B-ORG',\n", + " 4: 'I-ORG',\n", + " 5: 'B-LOC',\n", + " 6: 'I-LOC'\n", + "}\n", + "\n", + "# Chuyển y_test và y_pred sang nhãn gốc\n", + "y_test_labels = [label_map[i] for i in y_test]\n", + "y_pred_labels = [label_map[i] for i in y_pred]\n", + "\n", + "# In classification report với nhãn thật\n", + "print(\"\\nClassification Report (theo label gốc):\")\n", + "print(classification_report(y_test_labels, y_pred_labels, digits=4))\n", + "\n", + "\n" + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", 
+ "text": [ + "\n", + "Classification Report (theo label gốc):\n", + " precision recall f1-score support\n", + "\n", + " B-LOC 0.3679 0.5000 0.4239 1242\n", + " B-ORG 0.2639 0.3942 0.3161 241\n", + " B-PER 0.4395 0.7490 0.5540 1490\n", + " I-LOC 0.2321 0.4448 0.3050 553\n", + " I-ORG 0.1532 0.2878 0.2000 410\n", + " I-PER 0.4304 0.5863 0.4964 701\n", + " O 0.9869 0.9478 0.9669 68998\n", + "\n", + " accuracy 0.9235 73635\n", + " macro avg 0.4106 0.5586 0.4660 73635\n", + "weighted avg 0.9474 0.9235 0.9336 73635\n", + "\n" + ] + } + ], + "execution_count": 26 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-06-11T00:45:00.649942Z", + "start_time": "2025-06-11T00:45:00.646595Z" + } + }, + "cell_type": "code", + "source": "print(model.feature_importance().shape)", + "id": "b1cf76bc3e58bc93", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(768,)\n" + ] + } + ], + "execution_count": 35 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-06-11T00:52:36.844604Z", + "start_time": "2025-06-11T00:52:36.827018Z" + } + }, + "cell_type": "code", + "source": [ + "correct = 0\n", + "for i in range(73635):\n", + " if y_pred[i] == y_test[i]:\n", + " correct += 1\n", + "correct" + ], + "id": "39d391e67a51211c", + "outputs": [ + { + "data": { + "text/plain": [ + "68001" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 58 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-06-11T00:57:45.109129Z", + "start_time": "2025-06-11T00:57:45.105078Z" + } + }, + "cell_type": "code", + "source": "print(y_test.shape)", + "id": "1a0ba8f0410c5589", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(73635,)\n" + ] + } + ], + "execution_count": 61 + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + 
"version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/notebooks/Kien_Rule_base.ipynb b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/notebooks/Kien_Rule_base.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..7c83bb0362115cc0088fdced047e1aaadacd1327 --- /dev/null +++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/notebooks/Kien_Rule_base.ipynb @@ -0,0 +1,8495 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "7bPdA3aUaZqD", + "outputId": "e0cca7aa-2bee-4d86-ceb1-663e3733e103" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning: \n", + "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", + "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", + "You will be able to reuse this secret in all of your notebooks.\n", + "Please note that authentication is recommended but still optional to access public models or datasets.\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "splits = {'train': 'data/train-00000-of-00001-b0417886a268b83a.parquet', 'valid': 'data/valid-00000-of-00001-846411c236133ba3.parquet'}\n", + "df_train = pd.read_parquet(\"hf://datasets/datnth1709/VLSP2016-NER-data/\" + 
splits[\"train\"])\n", + "df_valid = pd.read_parquet(\"hf://datasets/datnth1709/VLSP2016-NER-data/\" + splits[\"valid\"])\n", + "df = pd.concat([df_train, df_valid]).reset_index(drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "UGHTgnkil_4R" + }, + "outputs": [], + "source": [ + "tag_id = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-NAT': 7, 'I-NAT': 8}\n", + "id_tag = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-NAT', 8: 'I-NAT'}" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 423 + }, + "id": "gg8a9_1AibFj", + "outputId": "ff63e8d2-2782-4b64-c135-f4d15b43c818" + }, + "outputs": [ + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "summary": "{\n \"name\": \"df\",\n \"rows\": 16858,\n \"fields\": [\n {\n \"column\": \"tokens\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"ner_tags\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"ner_labels\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", + "type": "dataframe", + "variable_name": "df" + }, + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
tokensner_tagsner_labels
0[Không_khí, thật, náo_nhiệt, .][0, 0, 0, 0][O, O, O, O]
1[Chị, Lãnh, và, Xăng, ra, đi, ,, mình, đứng, n...[0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[O, B-PER, O, B-PER, O, O, O, O, O, O, O, O, O...
2[Suy_tính, mãi, ,, khóc, mãi, rồi, Phúc, lấy, ...[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...[O, O, O, O, O, O, B-PER, O, O, O, O, O, O, O,...
3[Hoà, bảo, hồi, mới, qua, đâu, có, biết, nấu_n...[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, ...[B-PER, O, O, O, O, O, O, O, O, O, O, O, O, B-...
4[Nhật_ký, của, thuyền_viên, .][0, 0, 0, 0][O, O, O, O]
............
16853[Nghe, thấy, đã, ghê_ghê, nhưng, Nhiêu, chưa, ...[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...[O, O, O, O, O, B-PER, O, O, O, O, O, O, O, O,...
16854[Nhưng, mọi, chuyện, không, dừng, ở, đó, .][0, 0, 0, 0, 0, 0, 0, 0][O, O, O, O, O, O, O, O]
16855[Hoà, bảo, thời_gian, đầu, mặc_cảm, lắm, ,, ở,...[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[B-PER, O, O, O, O, O, O, O, O, O, O, O, O, O,...
16856[Biết_bao, người, đã, tình_nguyện, hiến_dâng, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0][O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]
16857[Trên, đây, mới, là, “, thành_tích, ”, tiêu, t...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
\n", + "

16858 rows × 3 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + " tokens \\\n", + "0 [Không_khí, thật, náo_nhiệt, .] \n", + "1 [Chị, Lãnh, và, Xăng, ra, đi, ,, mình, đứng, n... \n", + "2 [Suy_tính, mãi, ,, khóc, mãi, rồi, Phúc, lấy, ... \n", + "3 [Hoà, bảo, hồi, mới, qua, đâu, có, biết, nấu_n... \n", + "4 [Nhật_ký, của, thuyền_viên, .] \n", + "... ... \n", + "16853 [Nghe, thấy, đã, ghê_ghê, nhưng, Nhiêu, chưa, ... \n", + "16854 [Nhưng, mọi, chuyện, không, dừng, ở, đó, .] \n", + "16855 [Hoà, bảo, thời_gian, đầu, mặc_cảm, lắm, ,, ở,... \n", + "16856 [Biết_bao, người, đã, tình_nguyện, hiến_dâng, ... \n", + "16857 [Trên, đây, mới, là, “, thành_tích, ”, tiêu, t... \n", + "\n", + " ner_tags \\\n", + "0 [0, 0, 0, 0] \n", + "1 [0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "2 [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "3 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, ... \n", + "4 [0, 0, 0, 0] \n", + "... ... \n", + "16853 [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, ... \n", + "16854 [0, 0, 0, 0, 0, 0, 0, 0] \n", + "16855 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "16856 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] \n", + "16857 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "\n", + " ner_labels \n", + "0 [O, O, O, O] \n", + "1 [O, B-PER, O, B-PER, O, O, O, O, O, O, O, O, O... \n", + "2 [O, O, O, O, O, O, B-PER, O, O, O, O, O, O, O,... \n", + "3 [B-PER, O, O, O, O, O, O, O, O, O, O, O, O, B-... \n", + "4 [O, O, O, O] \n", + "... ... \n", + "16853 [O, O, O, O, O, B-PER, O, O, O, O, O, O, O, O,... \n", + "16854 [O, O, O, O, O, O, O, O] \n", + "16855 [B-PER, O, O, O, O, O, O, O, O, O, O, O, O, O,... \n", + "16856 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O] \n", + "16857 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ... 
\n", + "\n", + "[16858 rows x 3 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def label(x):\n", + " return [id_tag[int(i)] for i in x]\n", + "df[\"ner_labels\"] = df.ner_tags.apply(label)\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "l3KdOY9Imz8D" + }, + "source": [ + "# Overview" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "dvkgtj0Ilzno", + "outputId": "3053d59d-b399-4729-9507-916b127e62fa" + }, + "outputs": [ + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "summary": "{\n \"name\": \"df\",\n \"rows\": 16858,\n \"fields\": [\n {\n \"column\": \"tokens\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"ner_tags\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"ner_labels\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", + "type": "dataframe", + "variable_name": "df" + }, + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
tokensner_tagsner_labels
0[Không_khí, thật, náo_nhiệt, .][0, 0, 0, 0][O, O, O, O]
1[Chị, Lãnh, và, Xăng, ra, đi, ,, mình, đứng, n...[0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[O, B-PER, O, B-PER, O, O, O, O, O, O, O, O, O...
2[Suy_tính, mãi, ,, khóc, mãi, rồi, Phúc, lấy, ...[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...[O, O, O, O, O, O, B-PER, O, O, O, O, O, O, O,...
3[Hoà, bảo, hồi, mới, qua, đâu, có, biết, nấu_n...[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, ...[B-PER, O, O, O, O, O, O, O, O, O, O, O, O, B-...
4[Nhật_ký, của, thuyền_viên, .][0, 0, 0, 0][O, O, O, O]
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + " tokens \\\n", + "0 [Không_khí, thật, náo_nhiệt, .] \n", + "1 [Chị, Lãnh, và, Xăng, ra, đi, ,, mình, đứng, n... \n", + "2 [Suy_tính, mãi, ,, khóc, mãi, rồi, Phúc, lấy, ... \n", + "3 [Hoà, bảo, hồi, mới, qua, đâu, có, biết, nấu_n... \n", + "4 [Nhật_ký, của, thuyền_viên, .] \n", + "\n", + " ner_tags \\\n", + "0 [0, 0, 0, 0] \n", + "1 [0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "2 [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "3 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, ... \n", + "4 [0, 0, 0, 0] \n", + "\n", + " ner_labels \n", + "0 [O, O, O, O] \n", + "1 [O, B-PER, O, B-PER, O, O, O, O, O, O, O, O, O... \n", + "2 [O, O, O, O, O, O, B-PER, O, O, O, O, O, O, O,... \n", + "3 [B-PER, O, O, O, O, O, O, O, O, O, O, O, O, B-... \n", + "4 [O, O, O, O] " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "XbY94Vjqb3UY", + "outputId": "ae7af3c6-4c0d-41eb-a671-b9d1a660ff5c" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 16858 entries, 0 to 16857\n", + "Data columns (total 3 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 tokens 16858 non-null object\n", + " 1 ner_tags 16858 non-null object\n", + " 2 ner_labels 16858 non-null object\n", + "dtypes: object(3)\n", + "memory usage: 395.2+ KB\n" + ] + } + ], + "source": [ + "df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "1vHfeaHgjpMg", + "outputId": "62a5028b-3758-4e71-e473-36aeb9cda5a3" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tokens - \n", + "ner_tags - \n" + ] + } + ], + "source": [ + "print(\"tokens - \", 
type(df.tokens[0]))\n", + "print(\"ner_tags - \", type(df.ner_tags[0]))" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 562 + }, + "id": "Oi5-brhr2GZc", + "outputId": "e239b808-581f-4f09-85f9-dd562cb1e63a" + }, + "outputs": [ + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "summary": "{\n \"name\": \"df\",\n \"rows\": 16858,\n \"fields\": [\n {\n \"column\": \"tokens\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"ner_tags\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"ner_labels\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"text_withseg\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 16787,\n \"samples\": [\n \"T\\u00ednh th\\u00f4ng_minh , l\\u1ea1i t\\u00f2_m\\u00f2 , anh Ki\\u1ec7m b\\u1eaft_\\u0111\\u1ea7u \\u0111i \\u0111\\u1ebfn c\\u00e1c x\\u01b0\\u1edfng c\\u01a1_kh\\u00ed \\u0111\\u1ec3 quan_s\\u00e1t c\\u00e1c lo\\u1ea1i m\\u00e1y_m\\u00f3c , r\\u1ed3i v\\u1ec1 nh\\u00e0 suy_ngh\\u0129 v\\u00e0 c\\u1ea7m b\\u00fat v\\u1ebd ph\\u00e1c_ho\\u1ea1 ra c\\u00e1i m\\u00e1y v\\u00fat g\\u1ea1o .\",\n \"V\\u1eady th\\u00ec , h\\u1ecd c\\u1ea7n ph\\u1ea3i \\u0111\\u01b0\\u1ee3c gi\\u00fap_\\u0111\\u1ee1 , ph\\u1ea3i \\u0111\\u01b0\\u1ee3c s\\u1ed1ng \\u0111\\u00e0ng_ho\\u00e0ng , ph\\u1ea3i \\u0111\\u01b0\\u1ee3c l\\u00e0m ng\\u01b0\\u1eddi d\\u00f9 ch\\u1ec9 l\\u00e0 nh\\u1eefng ng\\u00e0y cu\\u1ed1i_c\\u00f9ng .\",\n \"Nhi\\u1ec1u ng\\u01b0\\u1eddi th\\u00f4ng_d\\u1ecbch c\\u00f9ng th\\u1eddi v\\u1edbi Nguy\\u1ec5n Trung Hi\\u1ebfu c\\u0169ng \\u0111\\u00e3 ch\\u1ebft trong khi th\\u1ef1c_hi\\u1ec7n nhi\\u1ec7m_v\\u1ee5 t\\u1ea1i chi\\u1ebfn_tr\\u01b0\\u1eddng ho\\u1eb7c 
tr\\u00ean \\u0111\\u01b0\\u1eddng h\\u00e0nh_qu\\u00e2n .\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"text_raw\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 16785,\n \"samples\": [\n \"Trong kho\\u1ea3ng th\\u1eddi gian \\u0111\\u00f3 ch\\u1ecb c\\u1ed1 c\\u00f4ng t\\u1ef1 h\\u1ecdc ti\\u1ebfng Anh .\",\n \"Sau \\u0111\\u00f3 , ch\\u00ednh b\\u00e0 Susan \\u0111\\u00e3 \\u0111\\u01b0a Mai l\\u00ean h\\u1ecdc \\u0111\\u1ea1i h\\u1ecdc , m\\u1ed7i n\\u0103m chu c\\u1ea5p cho c\\u00f4 30.000 USD .\",\n \"T\\u1eeb r\\u1ea5t l\\u00e2u r\\u1ed3i t\\u00f4i v\\u1eabn ngh\\u0129 n\\u1ebfu nh\\u01b0 cu\\u1ed1n s\\u00e1ch \\u0111\\u01b0\\u1ee3c xu\\u1ea5t b\\u1ea3n , ho\\u1eb7c ng\\u01b0\\u1eddi ta l\\u00e0m phim v\\u1ec1 n\\u00f3 th\\u00ec t\\u00f4i s\\u1ebd d\\u00f9ng s\\u1ed1 ti\\u1ec1n b\\u00e1n s\\u00e1ch \\u0111\\u1ec3 thi\\u1ebft l\\u1eadp m\\u1ed9t s\\u1ed1 gi\\u01b0\\u1eddng b\\u1ec7nh t\\u1ea1i H\\u00e0 N\\u1ed9i .\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", + "type": "dataframe", + "variable_name": "df" + }, + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
tokensner_tagsner_labelstext_withsegtext_raw
0[Không_khí, thật, náo_nhiệt, .][0, 0, 0, 0][O, O, O, O]Không_khí thật náo_nhiệt .Không khí thật náo nhiệt .
1[Chị, Lãnh, và, Xăng, ra, đi, ,, mình, đứng, n...[0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[O, B-PER, O, B-PER, O, O, O, O, O, O, O, O, O...Chị Lãnh và Xăng ra đi , mình đứng nhìn hai ch...Chị Lãnh và Xăng ra đi , mình đứng nhìn hai ch...
2[Suy_tính, mãi, ,, khóc, mãi, rồi, Phúc, lấy, ...[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...[O, O, O, O, O, O, B-PER, O, O, O, O, O, O, O,...Suy_tính mãi , khóc mãi rồi Phúc lấy ra tờ giấ...Suy tính mãi , khóc mãi rồi Phúc lấy ra tờ giấ...
3[Hoà, bảo, hồi, mới, qua, đâu, có, biết, nấu_n...[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, ...[B-PER, O, O, O, O, O, O, O, O, O, O, O, O, B-...Hoà bảo hồi mới qua đâu có biết nấu_nướng gì ,...Hoà bảo hồi mới qua đâu có biết nấu nướng gì ,...
4[Nhật_ký, của, thuyền_viên, .][0, 0, 0, 0][O, O, O, O]Nhật_ký của thuyền_viên .Nhật ký của thuyền viên .
..................
16853[Nghe, thấy, đã, ghê_ghê, nhưng, Nhiêu, chưa, ...[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...[O, O, O, O, O, B-PER, O, O, O, O, O, O, O, O,...Nghe thấy đã ghê_ghê nhưng Nhiêu chưa được tườ...Nghe thấy đã ghê ghê nhưng Nhiêu chưa được tườ...
16854[Nhưng, mọi, chuyện, không, dừng, ở, đó, .][0, 0, 0, 0, 0, 0, 0, 0][O, O, O, O, O, O, O, O]Nhưng mọi chuyện không dừng ở đó .Nhưng mọi chuyện không dừng ở đó .
16855[Hoà, bảo, thời_gian, đầu, mặc_cảm, lắm, ,, ở,...[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[B-PER, O, O, O, O, O, O, O, O, O, O, O, O, O,...Hoà bảo thời_gian đầu mặc_cảm lắm , ở trong nh...Hoà bảo thời gian đầu mặc cảm lắm , ở trong nh...
16856[Biết_bao, người, đã, tình_nguyện, hiến_dâng, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0][O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]Biết_bao người đã tình_nguyện hiến_dâng cả cuộ...Biết bao người đã tình nguyện hiến dâng cả cuộ...
16857[Trên, đây, mới, là, “, thành_tích, ”, tiêu, t...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...Trên đây mới là “ thành_tích ” tiêu tiền của m...Trên đây mới là “ thành tích ” tiêu tiền của m...
\n", + "

16858 rows × 5 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + " tokens \\\n", + "0 [Không_khí, thật, náo_nhiệt, .] \n", + "1 [Chị, Lãnh, và, Xăng, ra, đi, ,, mình, đứng, n... \n", + "2 [Suy_tính, mãi, ,, khóc, mãi, rồi, Phúc, lấy, ... \n", + "3 [Hoà, bảo, hồi, mới, qua, đâu, có, biết, nấu_n... \n", + "4 [Nhật_ký, của, thuyền_viên, .] \n", + "... ... \n", + "16853 [Nghe, thấy, đã, ghê_ghê, nhưng, Nhiêu, chưa, ... \n", + "16854 [Nhưng, mọi, chuyện, không, dừng, ở, đó, .] \n", + "16855 [Hoà, bảo, thời_gian, đầu, mặc_cảm, lắm, ,, ở,... \n", + "16856 [Biết_bao, người, đã, tình_nguyện, hiến_dâng, ... \n", + "16857 [Trên, đây, mới, là, “, thành_tích, ”, tiêu, t... \n", + "\n", + " ner_tags \\\n", + "0 [0, 0, 0, 0] \n", + "1 [0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "2 [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "3 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, ... \n", + "4 [0, 0, 0, 0] \n", + "... ... \n", + "16853 [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, ... \n", + "16854 [0, 0, 0, 0, 0, 0, 0, 0] \n", + "16855 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "16856 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] \n", + "16857 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "\n", + " ner_labels \\\n", + "0 [O, O, O, O] \n", + "1 [O, B-PER, O, B-PER, O, O, O, O, O, O, O, O, O... \n", + "2 [O, O, O, O, O, O, B-PER, O, O, O, O, O, O, O,... \n", + "3 [B-PER, O, O, O, O, O, O, O, O, O, O, O, O, B-... \n", + "4 [O, O, O, O] \n", + "... ... \n", + "16853 [O, O, O, O, O, B-PER, O, O, O, O, O, O, O, O,... \n", + "16854 [O, O, O, O, O, O, O, O] \n", + "16855 [B-PER, O, O, O, O, O, O, O, O, O, O, O, O, O,... \n", + "16856 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O] \n", + "16857 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ... \n", + "\n", + " text_withseg \\\n", + "0 Không_khí thật náo_nhiệt . \n", + "1 Chị Lãnh và Xăng ra đi , mình đứng nhìn hai ch... \n", + "2 Suy_tính mãi , khóc mãi rồi Phúc lấy ra tờ giấ... 
\n", + "3 Hoà bảo hồi mới qua đâu có biết nấu_nướng gì ,... \n", + "4 Nhật_ký của thuyền_viên . \n", + "... ... \n", + "16853 Nghe thấy đã ghê_ghê nhưng Nhiêu chưa được tườ... \n", + "16854 Nhưng mọi chuyện không dừng ở đó . \n", + "16855 Hoà bảo thời_gian đầu mặc_cảm lắm , ở trong nh... \n", + "16856 Biết_bao người đã tình_nguyện hiến_dâng cả cuộ... \n", + "16857 Trên đây mới là “ thành_tích ” tiêu tiền của m... \n", + "\n", + " text_raw \n", + "0 Không khí thật náo nhiệt . \n", + "1 Chị Lãnh và Xăng ra đi , mình đứng nhìn hai ch... \n", + "2 Suy tính mãi , khóc mãi rồi Phúc lấy ra tờ giấ... \n", + "3 Hoà bảo hồi mới qua đâu có biết nấu nướng gì ,... \n", + "4 Nhật ký của thuyền viên . \n", + "... ... \n", + "16853 Nghe thấy đã ghê ghê nhưng Nhiêu chưa được tườ... \n", + "16854 Nhưng mọi chuyện không dừng ở đó . \n", + "16855 Hoà bảo thời gian đầu mặc cảm lắm , ở trong nh... \n", + "16856 Biết bao người đã tình nguyện hiến dâng cả cuộ... \n", + "16857 Trên đây mới là “ thành tích ” tiêu tiền của m... 
\n", + "\n", + "[16858 rows x 5 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Hàm gộp và xử lý dấu câu\n", + "def join_tokens(tokens):\n", + " text = ' '.join(tokens)\n", + " return text\n", + "\n", + "def reform_raw_text(tokens):\n", + " text = ' '.join(tokens)\n", + " return text.replace(\"_\", \" \")\n", + "\n", + "df['text_withseg'] = df['tokens'].apply(join_tokens)\n", + "df['text_raw'] = df['tokens'].apply(reform_raw_text)\n", + "\n", + "def label(x):\n", + " return [id_tag[int(i)] for i in x]\n", + "df[\"ner_labels\"] = df.ner_tags.apply(label)\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sYlRxG7K9PYx" + }, + "source": [] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "id": "lkTKRcYPsZiD" + }, + "outputs": [], + "source": [ + "time = 0\n", + "org_idx = []\n", + "token = []\n", + "tag = []\n", + "for i in (df.index):\n", + " for a in range(len(df.ner_tags[i])):\n", + " # if df.ner_tags[i][a] == 6 or df.ner_tags[i][a] == 5:\n", + " if df.ner_labels[i][a] != 'O':\n", + " token.append(df.tokens[i][[a]])\n", + " tag.append(df.ner_labels[i][a])\n", + " org_idx.append(i)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 366 + }, + "id": "vTufL8Blu5fe", + "outputId": "ed82540c-c2d0-450b-9eff-0ba680a11698" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
count
tag
B-PER7479
B-LOC6244
I-PER3522
I-LOC2783
I-ORG2055
B-ORG1212
B-NAT282
I-NAT279
\n", + "

" + ], + "text/plain": [ + "tag\n", + "B-PER 7479\n", + "B-LOC 6244\n", + "I-PER 3522\n", + "I-LOC 2783\n", + "I-ORG 2055\n", + "B-ORG 1212\n", + "B-NAT 282\n", + "I-NAT 279\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tag_data = pd.DataFrame({'org_idx': org_idx, 'token': token, 'tag': tag})\n", + "tag_data.tag.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 363 + }, + "id": "en1q55Tf9lD7", + "outputId": "1de18f2e-88e5-4e75-847f-8aaf6b2932b4" + }, + "outputs": [ + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "summary": "{\n \"name\": \"tag_data[tag_data\",\n \"rows\": 10,\n \"fields\": [\n {\n \"column\": \"org_idx\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 5203,\n \"min\": 1364,\n \"max\": 15557,\n \"num_unique_values\": 10,\n \"samples\": [\n 1364,\n 14208,\n 12272\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"token\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"tag\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"B-PER\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", + "type": "dataframe" + }, + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
org_idxtokentag
113238102[Rơ]B-PER
1993614208[Thạc]B-PER
50743612[Khánh]B-PER
88706383[Bibi]B-PER
59274246[Thuỳ]B-PER
1712212272[Chương]B-PER
32362290[Claudia]B-PER
1690312150[Chương]B-PER
19531364[Thạc]B-PER
2188815557[Đợi]B-PER
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + " org_idx token tag\n", + "11323 8102 [Rơ] B-PER\n", + "19936 14208 [Thạc] B-PER\n", + "5074 3612 [Khánh] B-PER\n", + "8870 6383 [Bibi] B-PER\n", + "5927 4246 [Thuỳ] B-PER\n", + "17122 12272 [Chương] B-PER\n", + "3236 2290 [Claudia] B-PER\n", + "16903 12150 [Chương] B-PER\n", + "1953 1364 [Thạc] B-PER\n", + "21888 15557 [Đợi] B-PER" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tag_data[tag_data.tag == 'B-PER'].sample(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 607 + }, + "id": "bDjW72xzQTwS", + "outputId": "305f8523-dda8-4a34-d116-63b535e766fa" + }, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "tag_counts = tag_data.tag.value_counts()\n", + "\n", + "plt.figure(figsize=(10, 6))\n", + "tag_counts.plot(kind='bar')\n", + "plt.title('Distribution of NER Label Frequency')\n", + "plt.xlabel('NER Label')\n", + "plt.ylabel('Frequency')\n", + "plt.xticks(rotation=45, ha='right')\n", + "plt.tight_layout()\n", + "plt.show()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "s_2oNNTJARjZ" + }, + "source": [ + "Nhãn B-PER có tần suất cao nhất, vượt quá 7.000.\n", + "\n", + "Nhãn B-LOC đứng thứ hai với khoảng 6.000 lần xuất hiện.\n", + "\n", + "Các nhãn I-PER, I-LOC, và I-ORG có tần suất giảm dần, lần lượt khoảng 4.000, 3000, và 2.000.\n", + "\n", + "Nhãn B-ORG và B-NAT có tần suất thấp hơn, dưới 1.000.\n", + "\n", + "Nhãn I-NAT có tần suất rất thấp." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BOVwaIiGAqDo" + }, + "source": [ + "Vấn đề:\n", + "\n", + "\n", + "* Lệch dữ liệu (Data Imbalance): Mô hình có thể thiên về dự đoán các nhãn phổ biến (như B-PER, B-LOC), dẫn đến hiệu suất kém với các nhãn hiếm (như I-NAT, B-NAT).\n", + "\n", + "* Khó khăn trong học tập: Các nhãn có tần suất thấp có thể không cung cấp đủ thông tin để mô hình học tốt, làm giảm độ chính xác tổng thể.\n", + "\n", + "\n", + "\n", + "* Cần kỹ thuật cân bằng: Có thể cần áp dụng các phương pháp như oversampling cho nhãn hiếm, undersampling cho nhãn phổ biến, hoặc sử dụng các hàm mất mát có trọng số (weighted loss) để cải thiện hiệu suất. 
(Khó nha bro\n", + ")\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8c3ogIAxQTmM" + }, + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "m2WSqdVxbboH" + }, + "source": [ + "### Thay đổi các chunking chứa NAT bằng O" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 363 + }, + "id": "lIPB1IyCbjEd", + "outputId": "f580ffc9-9460-4aed-821a-dca9d7379699" + }, + "outputs": [ + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "summary": "{\n \"name\": \"tag_data[tag_data\",\n \"rows\": 10,\n \"fields\": [\n {\n \"column\": \"org_idx\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 4449,\n \"min\": 269,\n \"max\": 10741,\n \"num_unique_values\": 10,\n \"samples\": [\n 640,\n 3419,\n 9305\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"token\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"tag\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"B-NAT\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", + "type": "dataframe" + }, + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
org_idxtokentag
31542236[tiếng]B-NAT
48873419[VN]B-NAT
419269[Người]B-NAT
25481821[người]B-NAT
993678[người]B-NAT
130329305[Mỹ]B-NAT
131839436[tiếng]B-NAT
1411910184[tiếng]B-NAT
942640[người]B-NAT
1484610741[người]B-NAT
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + " org_idx token tag\n", + "3154 2236 [tiếng] B-NAT\n", + "4887 3419 [VN] B-NAT\n", + "419 269 [Người] B-NAT\n", + "2548 1821 [người] B-NAT\n", + "993 678 [người] B-NAT\n", + "13032 9305 [Mỹ] B-NAT\n", + "13183 9436 [tiếng] B-NAT\n", + "14119 10184 [tiếng] B-NAT\n", + "942 640 [người] B-NAT\n", + "14846 10741 [người] B-NAT" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tag_data[tag_data.tag == 'B-NAT'].sample(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "SV71o4CkeNMJ", + "outputId": "ef587b35-80ad-4078-a444-a6a76f9e13f3" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tag counts before replacement:\n", + "tag\n", + "B-PER 7479\n", + "B-LOC 6244\n", + "I-PER 3522\n", + "I-LOC 2783\n", + "I-ORG 2055\n", + "B-ORG 1212\n", + "B-NAT 282\n", + "I-NAT 279\n", + "Name: count, dtype: int64\n", + "\n", + "Tag counts after replacing NAT with O:\n", + "tag\n", + "B-PER 7479\n", + "B-LOC 6244\n", + "I-PER 3522\n", + "I-LOC 2783\n", + "I-ORG 2055\n", + "B-ORG 1212\n", + "Name: count, dtype: int64\n" + ] + } + ], + "source": [ + "def replace_label_in_dataset(target, x):\n", + " new_labels = []\n", + " # print(x)\n", + " i = 0\n", + " while i < len(x):\n", + " if id_tag[x[i]] == target:\n", + " # Find the end of the entity\n", + " j = i\n", + " while j < len(x) and (id_tag[x[j]] == target or id_tag[x[j]] == target.replace('B-', 'I-')):\n", + " j += 1\n", + " # Replace all tags in this entity with 'O' (id 0)\n", + " for k in range(i, j):\n", + " new_labels.append(0)\n", + " i = j # Move to the end of the entity\n", + " else:\n", + " new_labels.append(x[i])\n", + " i += 1\n", + " return new_labels\n", + "\n", + "df['ner_tags_replaced_nat'] = df.ner_tags.apply(lambda x: replace_label_in_dataset('B-NAT', x))\n", + "df['ner_labels_replaced_nat'] = 
df.ner_tags_replaced_nat.apply(label)\n", + "\n", + "# Verify the change\n", + "time = 0\n", + "org_idx_replaced = []\n", + "token_replaced = []\n", + "tag_replaced = []\n", + "for i in (df.index):\n", + " for a in range(len(df.ner_tags_replaced_nat[i])):\n", + " if df.ner_labels_replaced_nat[i][a] != 'O':\n", + " token_replaced.append(df.tokens[i][[a]])\n", + " tag_replaced.append(df.ner_labels_replaced_nat[i][a])\n", + " org_idx_replaced.append(i)\n", + "\n", + "tag_data_replaced = pd.DataFrame({'org_idx': org_idx_replaced, 'token': token_replaced, 'tag': tag_replaced})\n", + "\n", + "print(\"Tag counts before replacement:\")\n", + "print(tag_data.tag.value_counts())\n", + "print(\"\\nTag counts after replacing NAT with O:\")\n", + "print(tag_data_replaced.tag.value_counts())" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ygcdOZIFfdSg", + "outputId": "b5b76ff1-e391-4c1a-9997-127b8ddc286f" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Unique values in ner_labels_replaced_nat:\n", + "O 346105\n", + "B-PER 7479\n", + "B-LOC 6244\n", + "I-PER 3522\n", + "I-LOC 2783\n", + "I-ORG 2055\n", + "B-ORG 1212\n", + "Name: count, dtype: int64\n" + ] + } + ], + "source": [ + "print(\"\\nUnique values in ner_labels_replaced_nat:\")\n", + "all_labels_replaced = [label for sublist in df['ner_labels_replaced_nat'] for label in sublist]\n", + "print(pd.Series(all_labels_replaced).value_counts())\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YVtsiaqLgTo9" + }, + "source": [ + "### After process" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "collapsed": true, + "id": "-nQAuJLHfRgU", + "outputId": "608a93f9-292a-4f33-b2d8-7a074d5b8816" + }, + "outputs": [ + { + "data": { + 
"application/vnd.google.colaboratory.intrinsic+json": { + "summary": "{\n \"name\": \"df[40:80]\",\n \"rows\": 40,\n \"fields\": [\n {\n \"column\": \"tokens\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"ner_tags\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"ner_labels\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"text_withseg\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 40,\n \"samples\": [\n \"C\\u00f3 \\u0111\\u00fang ch\\u1ecb n\\u1eb1m tr\\u00ean m\\u1ed9t d\\u00e3y n\\u00fai cao \\u1edf mi\\u1ec1n t\\u00e2y \\u0110\\u1ee9c_Ph\\u1ed5 ?\",\n \"B\\u01b0\\u1edbc v\\u00e0o \\u0111\\u1ea7u ng\\u00f5 , nh\\u00e0 c\\u00f4 C\\u00fac ph\\u01a1i \\u0111\\u1ea7y b\\u00e1nh_tr\\u00e1ng ph\\u00eda tr\\u01b0\\u1edbc .\",\n \"Chung m\\u1ed9t ch\\u1eef \\\" L\\u01b0\\u01a1ng \\\" ...\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"text_raw\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 40,\n \"samples\": [\n \"C\\u00f3 \\u0111\\u00fang ch\\u1ecb n\\u1eb1m tr\\u00ean m\\u1ed9t d\\u00e3y n\\u00fai cao \\u1edf mi\\u1ec1n t\\u00e2y \\u0110\\u1ee9c Ph\\u1ed5 ?\",\n \"B\\u01b0\\u1edbc v\\u00e0o \\u0111\\u1ea7u ng\\u00f5 , nh\\u00e0 c\\u00f4 C\\u00fac ph\\u01a1i \\u0111\\u1ea7y b\\u00e1nh tr\\u00e1ng ph\\u00eda tr\\u01b0\\u1edbc .\",\n \"Chung m\\u1ed9t ch\\u1eef \\\" L\\u01b0\\u01a1ng \\\" ...\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"ner_tags_replaced_nat\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"ner_labels_replaced_nat\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n 
}\n }\n ]\n}", + "type": "dataframe" + }, + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
tokensner_tagsner_labelstext_withsegtext_rawner_tags_replaced_natner_labels_replaced_nat
40[Nguyên, phân_công, anh, bạn, đồng_nghiệp, ở, ...[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[B-PER, O, O, O, O, O, O, O, O, O, O, O, O, O,...Nguyên phân_công anh bạn đồng_nghiệp ở vòng ng...Nguyên phân công anh bạn đồng nghiệp ở vòng ng...[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[B-PER, O, O, O, O, O, O, O, O, O, O, O, O, O,...
41[Theo, kế_hoạch, ,, những, ngày, đầu, cả, hai,...[0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 0, 0, 0, 0, ...[O, O, O, O, O, O, O, O, O, B-NAT, I-NAT, O, O...Theo kế_hoạch , những ngày đầu cả hai luyện ti...Theo kế hoạch , những ngày đầu cả hai luyện ti...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
42[Thật, đáng, tiếc, biết_bao, ,, những, ngày, n...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...Thật đáng tiếc biết_bao , những ngày này trăng...Thật đáng tiếc biết bao , những ngày này trăng...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
43[Hải, và, bố_mẹ, ngày, trước, ở, chung, với, ô...[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 0, 0, ...[B-PER, O, O, O, O, O, O, O, O, O, B-LOC, I-LO...Hải và bố_mẹ ngày trước ở chung với ông_bà trê...Hải và bố mẹ ngày trước ở chung với ông bà trê...[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 0, 0, ...[B-PER, O, O, O, O, O, O, O, O, O, B-LOC, I-LO...
44[Cho_nên, phương_án, của, ông, Phong, là, “, b...[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[O, O, O, O, B-PER, O, O, O, O, O, O, O, O, O,...Cho_nên phương_án của ông Phong là “ bán cả co...Cho nên phương án của ông Phong là “ bán cả co...[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[O, O, O, O, B-PER, O, O, O, O, O, O, O, O, O,...
45[Một, thời_gian, ngắn, sau, trận, tỉ_thí, lịch...[0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, ...[O, O, O, O, O, O, O, O, B-PER, I-PER, O, O, O...Một thời_gian ngắn sau trận tỉ_thí lịch_sử , M...Một thời gian ngắn sau trận tỉ thí lịch sử , M...[0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, ...[O, O, O, O, O, O, O, O, B-PER, I-PER, O, O, O...
46[Kao, chỉ, mới, được, gửi, lên, đây, hơn, một,...[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[B-PER, O, O, O, O, O, O, O, O, O, O, O, O, O,...Kao chỉ mới được gửi lên đây hơn một tuần , nh...Kao chỉ mới được gửi lên đây hơn một tuần , nh...[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[B-PER, O, O, O, O, O, O, O, O, O, O, O, O, O,...
47[An_Lư, cũng, tích_cực, đào_tạo, các, thuyền_v...[5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[B-LOC, O, O, O, O, O, O, O, O, O, O, O, O, O,...An_Lư cũng tích_cực đào_tạo các thuyền_viên ng...An Lư cũng tích cực đào tạo các thuyền viên ng...[5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[B-LOC, O, O, O, O, O, O, O, O, O, O, O, O, O,...
48[Anh, cười, tươi, :, \", Nếu, không, thắng, thì...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0][O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]Anh cười tươi : \" Nếu không thắng thì đâu còn ...Anh cười tươi : \" Nếu không thắng thì đâu còn ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0][O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]
49[Hắn, không, có, một, dữ_liệu, nào, nằm, trong...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...Hắn không có một dữ_liệu nào nằm trong tay thá...Hắn không có một dữ liệu nào nằm trong tay thá...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
50[Và, cho_dù, xảy, ra, tình_huống, nào, thì, ôn...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0][O, O, O, O, O, O, O, O, O, O, O, O]Và cho_dù xảy ra tình_huống nào thì ông giám_đ...Và cho dù xảy ra tình huống nào thì ông giám đ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0][O, O, O, O, O, O, O, O, O, O, O, O]
51[Để, em, tính, lại, .][0, 0, 0, 0, 0][O, O, O, O, O]Để em tính lại .Để em tính lại .[0, 0, 0, 0, 0][O, O, O, O, O]
52[Ông, đến, có, khi, mặc, sắc_phục, ,, có, khi,...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...Ông đến có khi mặc sắc_phục , có khi mặc thườn...Ông đến có khi mặc sắc phục , có khi mặc thườn...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
53[Có, đứa, trả_lời, :, chưa, có, lúc, nào, thấy...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0][O, O, O, O, O, O, O, O, O, O, O, O, O]Có đứa trả_lời : chưa có lúc nào thấy hạnh_phú...Có đứa trả lời : chưa có lúc nào thấy hạnh phú...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0][O, O, O, O, O, O, O, O, O, O, O, O, O]
54[Vậy, đó, ,, lửa, thử, vàng, gian_nan, thử, sứ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0][O, O, O, O, O, O, O, O, O, O]Vậy đó , lửa thử vàng gian_nan thử sức .Vậy đó , lửa thử vàng gian nan thử sức .[0, 0, 0, 0, 0, 0, 0, 0, 0, 0][O, O, O, O, O, O, O, O, O, O]
55[Chung, một, chữ, \", Lương, \", ...][0, 0, 0, 0, 0, 0, 0][O, O, O, O, O, O, O]Chung một chữ \" Lương \" ...Chung một chữ \" Lương \" ...[0, 0, 0, 0, 0, 0, 0][O, O, O, O, O, O, O]
56[Bước, vào, đầu, ngõ, ,, nhà, cô, Cúc, phơi, đ...[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0][O, O, O, O, O, O, O, B-PER, O, O, O, O, O, O]Bước vào đầu ngõ , nhà cô Cúc phơi đầy bánh_tr...Bước vào đầu ngõ , nhà cô Cúc phơi đầy bánh tr...[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0][O, O, O, O, O, O, O, B-PER, O, O, O, O, O, O]
57[Nếu, có, trở_ngại, một_chút, thì, đúng, là, l...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 5, ...[O, O, O, O, O, O, O, O, O, O, O, B-LOC, I-LOC...Nếu có trở_ngại một_chút thì đúng là lượng khá...Nếu có trở ngại một chút thì đúng là lượng khá...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 5, ...[O, O, O, O, O, O, O, O, O, O, O, B-LOC, I-LOC...
58[Anh, Dĩa, kéo, chúng_tôi, lên, bờ, ,, khui, b...[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[O, B-PER, O, O, O, O, O, O, O, O, O, O, O, O,...Anh Dĩa kéo chúng_tôi lên bờ , khui bia , rượu...Anh Dĩa kéo chúng tôi lên bờ , khui bia , rượu...[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[O, B-PER, O, O, O, O, O, O, O, O, O, O, O, O,...
59[Có, đúng, chị, nằm, trên, một, dãy, núi, cao,...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0][O, O, O, O, O, O, O, O, O, O, O, O, B-LOC, O]Có đúng chị nằm trên một dãy núi cao ở miền tâ...Có đúng chị nằm trên một dãy núi cao ở miền tâ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0][O, O, O, O, O, O, O, O, O, O, O, O, B-LOC, O]
60[Bắt_đầu, từ, năm, 1961, ,, xã, Nhuận_Đức, phá...[0, 0, 0, 0, 0, 5, 6, 0, 0, 0, 0, 0][O, O, O, O, O, B-LOC, I-LOC, O, O, O, O, O]Bắt_đầu từ năm 1961 , xã Nhuận_Đức phát_động p...Bắt đầu từ năm 1961 , xã Nhuận Đức phát động p...[0, 0, 0, 0, 0, 5, 6, 0, 0, 0, 0, 0][O, O, O, O, O, B-LOC, I-LOC, O, O, O, O, O]
61[Năm, nay, đầu, trên, xóm, dưới, lắc_đầu, ngao...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0][O, O, O, O, O, O, O, O, O, O, O, O, O]Năm nay đầu trên xóm dưới lắc_đầu ngao_ngán bỏ...Năm nay đầu trên xóm dưới lắc đầu ngao ngán bỏ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0][O, O, O, O, O, O, O, O, O, O, O, O, O]
62[Và, đến, nay, những, mét, hầm, cuối_cùng, cũn...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0][O, O, O, O, O, O, O, O, O, O, O, O]Và đến nay những mét hầm cuối_cùng cũng đã về ...Và đến nay những mét hầm cuối cùng cũng đã về ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0][O, O, O, O, O, O, O, O, O, O, O, O]
63[Đèn, đường, loang_loáng, ,, hoà, chung, dòng,...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, ...[O, O, O, O, O, O, O, O, O, O, O, O, O, B-LOC,...Đèn đường loang_loáng , hoà chung dòng xe tấp_...Đèn đường loang loáng , hoà chung dòng xe tấp ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, ...[O, O, O, O, O, O, O, O, O, O, O, O, O, B-LOC,...
64[Cả, đám, bắt_đầu, lên_cơn, lắc, quậy, điên_cu...[0, 0, 0, 0, 0, 0, 0, 0][O, O, O, O, O, O, O, O]Cả đám bắt_đầu lên_cơn lắc quậy điên_cuồng ...Cả đám bắt đầu lên cơn lắc quậy điên cuồng ...[0, 0, 0, 0, 0, 0, 0, 0][O, O, O, O, O, O, O, O]
65[Ở, trạm_xá, xã, Hưng_Long, đối_diện, với, căn...[0, 0, 5, 6, 0, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0, ...[O, O, B-LOC, I-LOC, O, O, O, O, O, B-PER, I-P...Ở trạm_xá xã Hưng_Long đối_diện với căn nhà tử...Ở trạm xá xã Hưng Long đối diện với căn nhà tử...[0, 0, 5, 6, 0, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0, ...[O, O, B-LOC, I-LOC, O, O, O, O, O, B-PER, I-P...
66[Mới_đó, mà, ta, xa, nhau, ,, thật, là, kinh_k...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0][O, O, O, O, O, O, O, O, O, O]Mới_đó mà ta xa nhau , thật là kinh_khủng .Mới đó mà ta xa nhau , thật là kinh khủng .[0, 0, 0, 0, 0, 0, 0, 0, 0, 0][O, O, O, O, O, O, O, O, O, O]
67[Trong, cuộc, chiến_đấu, vì, nghĩa_vụ, quốc_tế...[0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 5, 0, ...[O, O, O, O, O, O, O, O, O, B-LOC, O, O, O, B-...Trong cuộc chiến_đấu vì nghĩa_vụ quốc_tế với n...Trong cuộc chiến đấu vì nghĩa vụ quốc tế với n...[0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 5, 0, ...[O, O, O, O, O, O, O, O, O, B-LOC, O, O, O, B-...
68[Mỗi, khi, sóng, dập, vào, và, đẩy, người, lên...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...Mỗi khi sóng dập vào và đẩy người lên theo thậ...Mỗi khi sóng dập vào và đẩy người lên theo thậ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
69[Đây, là, một, giai_đoạn, khó_khăn, ,, đau_đớn...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0][O, O, O, O, O, O, O, O, O, O, O, O]Đây là một giai_đoạn khó_khăn , đau_đớn nhất c...Đây là một giai đoạn khó khăn , đau đớn nhất c...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0][O, O, O, O, O, O, O, O, O, O, O, O]
70[Và, hôm_nay, ,, chúng_ta, cũng, cần, hỏi, câu...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0][O, O, O, O, O, O, O, O, O, O, O, O, O, O]Và hôm_nay , chúng_ta cũng cần hỏi câu hỏi này...Và hôm nay , chúng ta cũng cần hỏi câu hỏi này...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0][O, O, O, O, O, O, O, O, O, O, O, O, O, O]
71[Thạc_sĩ, thú_y, với, bầy, muông_thú, .][0, 0, 0, 0, 0, 0][O, O, O, O, O, O]Thạc_sĩ thú_y với bầy muông_thú .Thạc sĩ thú y với bầy muông thú .[0, 0, 0, 0, 0, 0][O, O, O, O, O, O]
72[Tôi, lại, điện, hỏi_thăm, ông, khi, về, tới, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...Tôi lại điện hỏi_thăm ông khi về tới nhà , ông...Tôi lại điện hỏi thăm ông khi về tới nhà , ông...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
73[Hương_Rừng, xuất_hiện, ở, nhiều, nơi, từ, nội...[5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, ...[B-LOC, O, O, O, O, O, O, O, O, O, O, O, O, O,...Hương_Rừng xuất_hiện ở nhiều nơi từ nội_thành ...Hương Rừng xuất hiện ở nhiều nơi từ nội thành ...[5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, ...[B-LOC, O, O, O, O, O, O, O, O, O, O, O, O, O,...
74[Nhìn, những, cảnh, đó, mình, cười, mà, nước_m...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0][O, O, O, O, O, O, O, O, O, O, O, O, O, O]Nhìn những cảnh đó mình cười mà nước_mắt chực ...Nhìn những cảnh đó mình cười mà nước mắt chực ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0][O, O, O, O, O, O, O, O, O, O, O, O, O, O]
75[Ông, nhớ, mãi, năm, cô, con, gái, út, học, cấ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...Ông nhớ mãi năm cô con gái út học cấp II , thư...Ông nhớ mãi năm cô con gái út học cấp II , thư...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
76[Đường, về, xã, Ia_Yeng, cắt, ngang, cánh, đồn...[0, 0, 5, 6, 0, 0, 0, 0, 5, 0][O, O, B-LOC, I-LOC, O, O, O, O, B-LOC, O]Đường về xã Ia_Yeng cắt ngang cánh đồng Ayun_Hạ .Đường về xã Ia Yeng cắt ngang cánh đồng Ayun Hạ .[0, 0, 5, 6, 0, 0, 0, 0, 5, 0][O, O, B-LOC, I-LOC, O, O, O, O, B-LOC, O]
77[Sang, đây, ,, đầu_tiên, tôi, làm, nghề, rửa, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...Sang đây , đầu_tiên tôi làm nghề rửa chén ở nh...Sang đây , đầu tiên tôi làm nghề rửa chén ở nh...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
78[Cộng_tác_viên, của, Thanh, ở, Berlin, tìm, đế...[0, 0, 1, 0, 5, 0, 0, 0, 0, 1, 0, 0][O, O, B-PER, O, B-LOC, O, O, O, O, B-PER, O, O]Cộng_tác_viên của Thanh ở Berlin tìm đến khu_v...Cộng tác viên của Thanh ở Berlin tìm đến khu v...[0, 0, 1, 0, 5, 0, 0, 0, 0, 1, 0, 0][O, O, B-PER, O, B-LOC, O, O, O, O, B-PER, O, O]
79[Trời, đang, mưa, lớn, ,, con, tàu, bị, chao, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...Trời đang mưa lớn , con tàu bị chao lắc rất mạ...Trời đang mưa lớn , con tàu bị chao lắc rất mạ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + " tokens \\\n", + "40 [Nguyên, phân_công, anh, bạn, đồng_nghiệp, ở, ... \n", + "41 [Theo, kế_hoạch, ,, những, ngày, đầu, cả, hai,... \n", + "42 [Thật, đáng, tiếc, biết_bao, ,, những, ngày, n... \n", + "43 [Hải, và, bố_mẹ, ngày, trước, ở, chung, với, ô... \n", + "44 [Cho_nên, phương_án, của, ông, Phong, là, “, b... \n", + "45 [Một, thời_gian, ngắn, sau, trận, tỉ_thí, lịch... \n", + "46 [Kao, chỉ, mới, được, gửi, lên, đây, hơn, một,... \n", + "47 [An_Lư, cũng, tích_cực, đào_tạo, các, thuyền_v... \n", + "48 [Anh, cười, tươi, :, \", Nếu, không, thắng, thì... \n", + "49 [Hắn, không, có, một, dữ_liệu, nào, nằm, trong... \n", + "50 [Và, cho_dù, xảy, ra, tình_huống, nào, thì, ôn... \n", + "51 [Để, em, tính, lại, .] \n", + "52 [Ông, đến, có, khi, mặc, sắc_phục, ,, có, khi,... \n", + "53 [Có, đứa, trả_lời, :, chưa, có, lúc, nào, thấy... \n", + "54 [Vậy, đó, ,, lửa, thử, vàng, gian_nan, thử, sứ... \n", + "55 [Chung, một, chữ, \", Lương, \", ...] \n", + "56 [Bước, vào, đầu, ngõ, ,, nhà, cô, Cúc, phơi, đ... \n", + "57 [Nếu, có, trở_ngại, một_chút, thì, đúng, là, l... \n", + "58 [Anh, Dĩa, kéo, chúng_tôi, lên, bờ, ,, khui, b... \n", + "59 [Có, đúng, chị, nằm, trên, một, dãy, núi, cao,... \n", + "60 [Bắt_đầu, từ, năm, 1961, ,, xã, Nhuận_Đức, phá... \n", + "61 [Năm, nay, đầu, trên, xóm, dưới, lắc_đầu, ngao... \n", + "62 [Và, đến, nay, những, mét, hầm, cuối_cùng, cũn... \n", + "63 [Đèn, đường, loang_loáng, ,, hoà, chung, dòng,... \n", + "64 [Cả, đám, bắt_đầu, lên_cơn, lắc, quậy, điên_cu... \n", + "65 [Ở, trạm_xá, xã, Hưng_Long, đối_diện, với, căn... \n", + "66 [Mới_đó, mà, ta, xa, nhau, ,, thật, là, kinh_k... \n", + "67 [Trong, cuộc, chiến_đấu, vì, nghĩa_vụ, quốc_tế... \n", + "68 [Mỗi, khi, sóng, dập, vào, và, đẩy, người, lên... \n", + "69 [Đây, là, một, giai_đoạn, khó_khăn, ,, đau_đớn... \n", + "70 [Và, hôm_nay, ,, chúng_ta, cũng, cần, hỏi, câu... \n", + "71 [Thạc_sĩ, thú_y, với, bầy, muông_thú, .] 
\n", + "72 [Tôi, lại, điện, hỏi_thăm, ông, khi, về, tới, ... \n", + "73 [Hương_Rừng, xuất_hiện, ở, nhiều, nơi, từ, nội... \n", + "74 [Nhìn, những, cảnh, đó, mình, cười, mà, nước_m... \n", + "75 [Ông, nhớ, mãi, năm, cô, con, gái, út, học, cấ... \n", + "76 [Đường, về, xã, Ia_Yeng, cắt, ngang, cánh, đồn... \n", + "77 [Sang, đây, ,, đầu_tiên, tôi, làm, nghề, rửa, ... \n", + "78 [Cộng_tác_viên, của, Thanh, ở, Berlin, tìm, đế... \n", + "79 [Trời, đang, mưa, lớn, ,, con, tàu, bị, chao, ... \n", + "\n", + " ner_tags \\\n", + "40 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "41 [0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 0, 0, 0, 0, ... \n", + "42 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "43 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 0, 0, ... \n", + "44 [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "45 [0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, ... \n", + "46 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "47 [5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "48 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] \n", + "49 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "50 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] \n", + "51 [0, 0, 0, 0, 0] \n", + "52 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "53 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] \n", + "54 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] \n", + "55 [0, 0, 0, 0, 0, 0, 0] \n", + "56 [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0] \n", + "57 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 5, ... \n", + "58 [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "59 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0] \n", + "60 [0, 0, 0, 0, 0, 5, 6, 0, 0, 0, 0, 0] \n", + "61 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] \n", + "62 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] \n", + "63 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, ... \n", + "64 [0, 0, 0, 0, 0, 0, 0, 0] \n", + "65 [0, 0, 5, 6, 0, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0, ... 
\n", + "66 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] \n", + "67 [0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 5, 0, ... \n", + "68 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "69 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] \n", + "70 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] \n", + "71 [0, 0, 0, 0, 0, 0] \n", + "72 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "73 [5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, ... \n", + "74 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] \n", + "75 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "76 [0, 0, 5, 6, 0, 0, 0, 0, 5, 0] \n", + "77 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "78 [0, 0, 1, 0, 5, 0, 0, 0, 0, 1, 0, 0] \n", + "79 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "\n", + " ner_labels \\\n", + "40 [B-PER, O, O, O, O, O, O, O, O, O, O, O, O, O,... \n", + "41 [O, O, O, O, O, O, O, O, O, B-NAT, I-NAT, O, O... \n", + "42 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ... \n", + "43 [B-PER, O, O, O, O, O, O, O, O, O, B-LOC, I-LO... \n", + "44 [O, O, O, O, B-PER, O, O, O, O, O, O, O, O, O,... \n", + "45 [O, O, O, O, O, O, O, O, B-PER, I-PER, O, O, O... \n", + "46 [B-PER, O, O, O, O, O, O, O, O, O, O, O, O, O,... \n", + "47 [B-LOC, O, O, O, O, O, O, O, O, O, O, O, O, O,... \n", + "48 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O] \n", + "49 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ... \n", + "50 [O, O, O, O, O, O, O, O, O, O, O, O] \n", + "51 [O, O, O, O, O] \n", + "52 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ... \n", + "53 [O, O, O, O, O, O, O, O, O, O, O, O, O] \n", + "54 [O, O, O, O, O, O, O, O, O, O] \n", + "55 [O, O, O, O, O, O, O] \n", + "56 [O, O, O, O, O, O, O, B-PER, O, O, O, O, O, O] \n", + "57 [O, O, O, O, O, O, O, O, O, O, O, B-LOC, I-LOC... \n", + "58 [O, B-PER, O, O, O, O, O, O, O, O, O, O, O, O,... 
\n", + "59 [O, O, O, O, O, O, O, O, O, O, O, O, B-LOC, O] \n", + "60 [O, O, O, O, O, B-LOC, I-LOC, O, O, O, O, O] \n", + "61 [O, O, O, O, O, O, O, O, O, O, O, O, O] \n", + "62 [O, O, O, O, O, O, O, O, O, O, O, O] \n", + "63 [O, O, O, O, O, O, O, O, O, O, O, O, O, B-LOC,... \n", + "64 [O, O, O, O, O, O, O, O] \n", + "65 [O, O, B-LOC, I-LOC, O, O, O, O, O, B-PER, I-P... \n", + "66 [O, O, O, O, O, O, O, O, O, O] \n", + "67 [O, O, O, O, O, O, O, O, O, B-LOC, O, O, O, B-... \n", + "68 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ... \n", + "69 [O, O, O, O, O, O, O, O, O, O, O, O] \n", + "70 [O, O, O, O, O, O, O, O, O, O, O, O, O, O] \n", + "71 [O, O, O, O, O, O] \n", + "72 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ... \n", + "73 [B-LOC, O, O, O, O, O, O, O, O, O, O, O, O, O,... \n", + "74 [O, O, O, O, O, O, O, O, O, O, O, O, O, O] \n", + "75 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ... \n", + "76 [O, O, B-LOC, I-LOC, O, O, O, O, B-LOC, O] \n", + "77 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ... \n", + "78 [O, O, B-PER, O, B-LOC, O, O, O, O, B-PER, O, O] \n", + "79 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ... \n", + "\n", + " text_withseg \\\n", + "40 Nguyên phân_công anh bạn đồng_nghiệp ở vòng ng... \n", + "41 Theo kế_hoạch , những ngày đầu cả hai luyện ti... \n", + "42 Thật đáng tiếc biết_bao , những ngày này trăng... \n", + "43 Hải và bố_mẹ ngày trước ở chung với ông_bà trê... \n", + "44 Cho_nên phương_án của ông Phong là “ bán cả co... \n", + "45 Một thời_gian ngắn sau trận tỉ_thí lịch_sử , M... \n", + "46 Kao chỉ mới được gửi lên đây hơn một tuần , nh... \n", + "47 An_Lư cũng tích_cực đào_tạo các thuyền_viên ng... \n", + "48 Anh cười tươi : \" Nếu không thắng thì đâu còn ... \n", + "49 Hắn không có một dữ_liệu nào nằm trong tay thá... \n", + "50 Và cho_dù xảy ra tình_huống nào thì ông giám_đ... \n", + "51 Để em tính lại . \n", + "52 Ông đến có khi mặc sắc_phục , có khi mặc thườn... \n", + "53 Có đứa trả_lời : chưa có lúc nào thấy hạnh_phú... 
\n", + "54 Vậy đó , lửa thử vàng gian_nan thử sức . \n", + "55 Chung một chữ \" Lương \" ... \n", + "56 Bước vào đầu ngõ , nhà cô Cúc phơi đầy bánh_tr... \n", + "57 Nếu có trở_ngại một_chút thì đúng là lượng khá... \n", + "58 Anh Dĩa kéo chúng_tôi lên bờ , khui bia , rượu... \n", + "59 Có đúng chị nằm trên một dãy núi cao ở miền tâ... \n", + "60 Bắt_đầu từ năm 1961 , xã Nhuận_Đức phát_động p... \n", + "61 Năm nay đầu trên xóm dưới lắc_đầu ngao_ngán bỏ... \n", + "62 Và đến nay những mét hầm cuối_cùng cũng đã về ... \n", + "63 Đèn đường loang_loáng , hoà chung dòng xe tấp_... \n", + "64 Cả đám bắt_đầu lên_cơn lắc quậy điên_cuồng ... \n", + "65 Ở trạm_xá xã Hưng_Long đối_diện với căn nhà tử... \n", + "66 Mới_đó mà ta xa nhau , thật là kinh_khủng . \n", + "67 Trong cuộc chiến_đấu vì nghĩa_vụ quốc_tế với n... \n", + "68 Mỗi khi sóng dập vào và đẩy người lên theo thậ... \n", + "69 Đây là một giai_đoạn khó_khăn , đau_đớn nhất c... \n", + "70 Và hôm_nay , chúng_ta cũng cần hỏi câu hỏi này... \n", + "71 Thạc_sĩ thú_y với bầy muông_thú . \n", + "72 Tôi lại điện hỏi_thăm ông khi về tới nhà , ông... \n", + "73 Hương_Rừng xuất_hiện ở nhiều nơi từ nội_thành ... \n", + "74 Nhìn những cảnh đó mình cười mà nước_mắt chực ... \n", + "75 Ông nhớ mãi năm cô con gái út học cấp II , thư... \n", + "76 Đường về xã Ia_Yeng cắt ngang cánh đồng Ayun_Hạ . \n", + "77 Sang đây , đầu_tiên tôi làm nghề rửa chén ở nh... \n", + "78 Cộng_tác_viên của Thanh ở Berlin tìm đến khu_v... \n", + "79 Trời đang mưa lớn , con tàu bị chao lắc rất mạ... \n", + "\n", + " text_raw \\\n", + "40 Nguyên phân công anh bạn đồng nghiệp ở vòng ng... \n", + "41 Theo kế hoạch , những ngày đầu cả hai luyện ti... \n", + "42 Thật đáng tiếc biết bao , những ngày này trăng... \n", + "43 Hải và bố mẹ ngày trước ở chung với ông bà trê... \n", + "44 Cho nên phương án của ông Phong là “ bán cả co... \n", + "45 Một thời gian ngắn sau trận tỉ thí lịch sử , M... \n", + "46 Kao chỉ mới được gửi lên đây hơn một tuần , nh... 
\n", + "47 An Lư cũng tích cực đào tạo các thuyền viên ng... \n", + "48 Anh cười tươi : \" Nếu không thắng thì đâu còn ... \n", + "49 Hắn không có một dữ liệu nào nằm trong tay thá... \n", + "50 Và cho dù xảy ra tình huống nào thì ông giám đ... \n", + "51 Để em tính lại . \n", + "52 Ông đến có khi mặc sắc phục , có khi mặc thườn... \n", + "53 Có đứa trả lời : chưa có lúc nào thấy hạnh phú... \n", + "54 Vậy đó , lửa thử vàng gian nan thử sức . \n", + "55 Chung một chữ \" Lương \" ... \n", + "56 Bước vào đầu ngõ , nhà cô Cúc phơi đầy bánh tr... \n", + "57 Nếu có trở ngại một chút thì đúng là lượng khá... \n", + "58 Anh Dĩa kéo chúng tôi lên bờ , khui bia , rượu... \n", + "59 Có đúng chị nằm trên một dãy núi cao ở miền tâ... \n", + "60 Bắt đầu từ năm 1961 , xã Nhuận Đức phát động p... \n", + "61 Năm nay đầu trên xóm dưới lắc đầu ngao ngán bỏ... \n", + "62 Và đến nay những mét hầm cuối cùng cũng đã về ... \n", + "63 Đèn đường loang loáng , hoà chung dòng xe tấp ... \n", + "64 Cả đám bắt đầu lên cơn lắc quậy điên cuồng ... \n", + "65 Ở trạm xá xã Hưng Long đối diện với căn nhà tử... \n", + "66 Mới đó mà ta xa nhau , thật là kinh khủng . \n", + "67 Trong cuộc chiến đấu vì nghĩa vụ quốc tế với n... \n", + "68 Mỗi khi sóng dập vào và đẩy người lên theo thậ... \n", + "69 Đây là một giai đoạn khó khăn , đau đớn nhất c... \n", + "70 Và hôm nay , chúng ta cũng cần hỏi câu hỏi này... \n", + "71 Thạc sĩ thú y với bầy muông thú . \n", + "72 Tôi lại điện hỏi thăm ông khi về tới nhà , ông... \n", + "73 Hương Rừng xuất hiện ở nhiều nơi từ nội thành ... \n", + "74 Nhìn những cảnh đó mình cười mà nước mắt chực ... \n", + "75 Ông nhớ mãi năm cô con gái út học cấp II , thư... \n", + "76 Đường về xã Ia Yeng cắt ngang cánh đồng Ayun Hạ . \n", + "77 Sang đây , đầu tiên tôi làm nghề rửa chén ở nh... \n", + "78 Cộng tác viên của Thanh ở Berlin tìm đến khu v... \n", + "79 Trời đang mưa lớn , con tàu bị chao lắc rất mạ... 
\n", + "\n", + " ner_tags_replaced_nat \\\n", + "40 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "41 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "42 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "43 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 0, 0, ... \n", + "44 [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "45 [0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, ... \n", + "46 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "47 [5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "48 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] \n", + "49 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "50 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] \n", + "51 [0, 0, 0, 0, 0] \n", + "52 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "53 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] \n", + "54 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] \n", + "55 [0, 0, 0, 0, 0, 0, 0] \n", + "56 [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0] \n", + "57 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 5, ... \n", + "58 [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "59 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0] \n", + "60 [0, 0, 0, 0, 0, 5, 6, 0, 0, 0, 0, 0] \n", + "61 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] \n", + "62 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] \n", + "63 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, ... \n", + "64 [0, 0, 0, 0, 0, 0, 0, 0] \n", + "65 [0, 0, 5, 6, 0, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0, ... \n", + "66 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] \n", + "67 [0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 5, 0, ... \n", + "68 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "69 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] \n", + "70 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] \n", + "71 [0, 0, 0, 0, 0, 0] \n", + "72 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "73 [5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, ... \n", + "74 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] \n", + "75 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 
\n", + "76 [0, 0, 5, 6, 0, 0, 0, 0, 5, 0] \n", + "77 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "78 [0, 0, 1, 0, 5, 0, 0, 0, 0, 1, 0, 0] \n", + "79 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "\n", + " ner_labels_replaced_nat \n", + "40 [B-PER, O, O, O, O, O, O, O, O, O, O, O, O, O,... \n", + "41 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ... \n", + "42 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ... \n", + "43 [B-PER, O, O, O, O, O, O, O, O, O, B-LOC, I-LO... \n", + "44 [O, O, O, O, B-PER, O, O, O, O, O, O, O, O, O,... \n", + "45 [O, O, O, O, O, O, O, O, B-PER, I-PER, O, O, O... \n", + "46 [B-PER, O, O, O, O, O, O, O, O, O, O, O, O, O,... \n", + "47 [B-LOC, O, O, O, O, O, O, O, O, O, O, O, O, O,... \n", + "48 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O] \n", + "49 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ... \n", + "50 [O, O, O, O, O, O, O, O, O, O, O, O] \n", + "51 [O, O, O, O, O] \n", + "52 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ... \n", + "53 [O, O, O, O, O, O, O, O, O, O, O, O, O] \n", + "54 [O, O, O, O, O, O, O, O, O, O] \n", + "55 [O, O, O, O, O, O, O] \n", + "56 [O, O, O, O, O, O, O, B-PER, O, O, O, O, O, O] \n", + "57 [O, O, O, O, O, O, O, O, O, O, O, B-LOC, I-LOC... \n", + "58 [O, B-PER, O, O, O, O, O, O, O, O, O, O, O, O,... \n", + "59 [O, O, O, O, O, O, O, O, O, O, O, O, B-LOC, O] \n", + "60 [O, O, O, O, O, B-LOC, I-LOC, O, O, O, O, O] \n", + "61 [O, O, O, O, O, O, O, O, O, O, O, O, O] \n", + "62 [O, O, O, O, O, O, O, O, O, O, O, O] \n", + "63 [O, O, O, O, O, O, O, O, O, O, O, O, O, B-LOC,... \n", + "64 [O, O, O, O, O, O, O, O] \n", + "65 [O, O, B-LOC, I-LOC, O, O, O, O, O, B-PER, I-P... \n", + "66 [O, O, O, O, O, O, O, O, O, O] \n", + "67 [O, O, O, O, O, O, O, O, O, B-LOC, O, O, O, B-... \n", + "68 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ... 
\n", + "69 [O, O, O, O, O, O, O, O, O, O, O, O] \n", + "70 [O, O, O, O, O, O, O, O, O, O, O, O, O, O] \n", + "71 [O, O, O, O, O, O] \n", + "72 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ... \n", + "73 [B-LOC, O, O, O, O, O, O, O, O, O, O, O, O, O,... \n", + "74 [O, O, O, O, O, O, O, O, O, O, O, O, O, O] \n", + "75 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ... \n", + "76 [O, O, B-LOC, I-LOC, O, O, O, O, B-LOC, O] \n", + "77 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ... \n", + "78 [O, O, B-PER, O, B-LOC, O, O, O, O, B-PER, O, O] \n", + "79 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ... " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[40:80]" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "id": "Ugoe49T1hHvF" + }, + "outputs": [], + "source": [ + "df = df.drop(columns=['ner_tags','ner_labels'])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "id": "yQ1v1VhIlUuF" + }, + "outputs": [], + "source": [ + "df.rename(columns={'ner_tags_replaced_nat': 'ner_tags', 'ner_labels_replaced_nat': 'ner_labels'}, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 487 + }, + "id": "0RfL3A9FlmVA", + "outputId": "b6dd057a-14a6-42a6-b309-48f7816eb5e7" + }, + "outputs": [ + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "summary": "{\n \"name\": \"df[41:50]\",\n \"rows\": 9,\n \"fields\": [\n {\n \"column\": \"tokens\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"text_withseg\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 9,\n \"samples\": [\n \"Anh c\\u01b0\\u1eddi t\\u01b0\\u01a1i : \\\" N\\u1ebfu kh\\u00f4ng th\\u1eafng th\\u00ec \\u0111\\u00e2u c\\u00f2n s\\u1ed1ng \\u0111\\u1ebfn b\\u00e2y_gi\\u1edd 
.\",\n \"Th\\u1eadt \\u0111\\u00e1ng ti\\u1ebfc bi\\u1ebft_bao , nh\\u1eefng ng\\u00e0y n\\u00e0y tr\\u0103ng r\\u1eafc b\\u1ee5i v\\u00e0ng tr\\u00ean c\\u00e1c n\\u1ebbo \\u0111\\u01b0\\u1eddng H\\u00e0_N\\u1ed9i .\",\n \"Kao ch\\u1ec9 m\\u1edbi \\u0111\\u01b0\\u1ee3c g\\u1eedi l\\u00ean \\u0111\\u00e2y h\\u01a1n m\\u1ed9t tu\\u1ea7n , nh\\u1eefng b\\u00e0i_t\\u1eadp \\u0111\\u1ea7u_ti\\u00ean m\\u1edbi_ch\\u1ec9 l\\u00e0 nh\\u1eefng \\u0111i\\u1ec7u m\\u00faa kh\\u1edfi_\\u0111\\u1ed9ng \\u201c Wai-kru \\u201d m\\u00e0 Kao r\\u1ea5t th\\u00edch , nh\\u01b0ng em \\u0111\\u00e2u th\\u1ec3 bi\\u1ebft ph\\u00eda tr\\u01b0\\u1edbc s\\u1ebd l\\u00e0 con \\u0111\\u01b0\\u1eddng d\\u00e0i \\u0111\\u1ea7y m\\u00e1u v\\u00e0 n\\u01b0\\u1edbc_m\\u1eaft \\u2026\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"text_raw\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 9,\n \"samples\": [\n \"Anh c\\u01b0\\u1eddi t\\u01b0\\u01a1i : \\\" N\\u1ebfu kh\\u00f4ng th\\u1eafng th\\u00ec \\u0111\\u00e2u c\\u00f2n s\\u1ed1ng \\u0111\\u1ebfn b\\u00e2y gi\\u1edd .\",\n \"Th\\u1eadt \\u0111\\u00e1ng ti\\u1ebfc bi\\u1ebft bao , nh\\u1eefng ng\\u00e0y n\\u00e0y tr\\u0103ng r\\u1eafc b\\u1ee5i v\\u00e0ng tr\\u00ean c\\u00e1c n\\u1ebbo \\u0111\\u01b0\\u1eddng H\\u00e0 N\\u1ed9i .\",\n \"Kao ch\\u1ec9 m\\u1edbi \\u0111\\u01b0\\u1ee3c g\\u1eedi l\\u00ean \\u0111\\u00e2y h\\u01a1n m\\u1ed9t tu\\u1ea7n , nh\\u1eefng b\\u00e0i t\\u1eadp \\u0111\\u1ea7u ti\\u00ean m\\u1edbi ch\\u1ec9 l\\u00e0 nh\\u1eefng \\u0111i\\u1ec7u m\\u00faa kh\\u1edfi \\u0111\\u1ed9ng \\u201c Wai-kru \\u201d m\\u00e0 Kao r\\u1ea5t th\\u00edch , nh\\u01b0ng em \\u0111\\u00e2u th\\u1ec3 bi\\u1ebft ph\\u00eda tr\\u01b0\\u1edbc s\\u1ebd l\\u00e0 con \\u0111\\u01b0\\u1eddng d\\u00e0i \\u0111\\u1ea7y m\\u00e1u v\\u00e0 n\\u01b0\\u1edbc m\\u1eaft \\u2026\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"ner_tags\",\n \"properties\": 
{\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"ner_labels\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", + "type": "dataframe" + }, + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
tokenstext_withsegtext_rawner_tagsner_labels
41[Theo, kế_hoạch, ,, những, ngày, đầu, cả, hai,...Theo kế_hoạch , những ngày đầu cả hai luyện ti...Theo kế hoạch , những ngày đầu cả hai luyện ti...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
42[Thật, đáng, tiếc, biết_bao, ,, những, ngày, n...Thật đáng tiếc biết_bao , những ngày này trăng...Thật đáng tiếc biết bao , những ngày này trăng...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
43[Hải, và, bố_mẹ, ngày, trước, ở, chung, với, ô...Hải và bố_mẹ ngày trước ở chung với ông_bà trê...Hải và bố mẹ ngày trước ở chung với ông bà trê...[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 0, 0, ...[B-PER, O, O, O, O, O, O, O, O, O, B-LOC, I-LO...
44[Cho_nên, phương_án, của, ông, Phong, là, “, b...Cho_nên phương_án của ông Phong là “ bán cả co...Cho nên phương án của ông Phong là “ bán cả co...[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[O, O, O, O, B-PER, O, O, O, O, O, O, O, O, O,...
45[Một, thời_gian, ngắn, sau, trận, tỉ_thí, lịch...Một thời_gian ngắn sau trận tỉ_thí lịch_sử , M...Một thời gian ngắn sau trận tỉ thí lịch sử , M...[0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, ...[O, O, O, O, O, O, O, O, B-PER, I-PER, O, O, O...
46[Kao, chỉ, mới, được, gửi, lên, đây, hơn, một,...Kao chỉ mới được gửi lên đây hơn một tuần , nh...Kao chỉ mới được gửi lên đây hơn một tuần , nh...[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[B-PER, O, O, O, O, O, O, O, O, O, O, O, O, O,...
47[An_Lư, cũng, tích_cực, đào_tạo, các, thuyền_v...An_Lư cũng tích_cực đào_tạo các thuyền_viên ng...An Lư cũng tích cực đào tạo các thuyền viên ng...[5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[B-LOC, O, O, O, O, O, O, O, O, O, O, O, O, O,...
48[Anh, cười, tươi, :, \", Nếu, không, thắng, thì...Anh cười tươi : \" Nếu không thắng thì đâu còn ...Anh cười tươi : \" Nếu không thắng thì đâu còn ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0][O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]
49[Hắn, không, có, một, dữ_liệu, nào, nằm, trong...Hắn không có một dữ_liệu nào nằm trong tay thá...Hắn không có một dữ liệu nào nằm trong tay thá...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + " tokens \\\n", + "41 [Theo, kế_hoạch, ,, những, ngày, đầu, cả, hai,... \n", + "42 [Thật, đáng, tiếc, biết_bao, ,, những, ngày, n... \n", + "43 [Hải, và, bố_mẹ, ngày, trước, ở, chung, với, ô... \n", + "44 [Cho_nên, phương_án, của, ông, Phong, là, “, b... \n", + "45 [Một, thời_gian, ngắn, sau, trận, tỉ_thí, lịch... \n", + "46 [Kao, chỉ, mới, được, gửi, lên, đây, hơn, một,... \n", + "47 [An_Lư, cũng, tích_cực, đào_tạo, các, thuyền_v... \n", + "48 [Anh, cười, tươi, :, \", Nếu, không, thắng, thì... \n", + "49 [Hắn, không, có, một, dữ_liệu, nào, nằm, trong... \n", + "\n", + " text_withseg \\\n", + "41 Theo kế_hoạch , những ngày đầu cả hai luyện ti... \n", + "42 Thật đáng tiếc biết_bao , những ngày này trăng... \n", + "43 Hải và bố_mẹ ngày trước ở chung với ông_bà trê... \n", + "44 Cho_nên phương_án của ông Phong là “ bán cả co... \n", + "45 Một thời_gian ngắn sau trận tỉ_thí lịch_sử , M... \n", + "46 Kao chỉ mới được gửi lên đây hơn một tuần , nh... \n", + "47 An_Lư cũng tích_cực đào_tạo các thuyền_viên ng... \n", + "48 Anh cười tươi : \" Nếu không thắng thì đâu còn ... \n", + "49 Hắn không có một dữ_liệu nào nằm trong tay thá... \n", + "\n", + " text_raw \\\n", + "41 Theo kế hoạch , những ngày đầu cả hai luyện ti... \n", + "42 Thật đáng tiếc biết bao , những ngày này trăng... \n", + "43 Hải và bố mẹ ngày trước ở chung với ông bà trê... \n", + "44 Cho nên phương án của ông Phong là “ bán cả co... \n", + "45 Một thời gian ngắn sau trận tỉ thí lịch sử , M... \n", + "46 Kao chỉ mới được gửi lên đây hơn một tuần , nh... \n", + "47 An Lư cũng tích cực đào tạo các thuyền viên ng... \n", + "48 Anh cười tươi : \" Nếu không thắng thì đâu còn ... \n", + "49 Hắn không có một dữ liệu nào nằm trong tay thá... \n", + "\n", + " ner_tags \\\n", + "41 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "42 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "43 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 0, 0, ... 
\n", + "44 [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "45 [0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, ... \n", + "46 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "47 [5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "48 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] \n", + "49 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "\n", + " ner_labels \n", + "41 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ... \n", + "42 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ... \n", + "43 [B-PER, O, O, O, O, O, O, O, O, O, B-LOC, I-LO... \n", + "44 [O, O, O, O, B-PER, O, O, O, O, O, O, O, O, O,... \n", + "45 [O, O, O, O, O, O, O, O, B-PER, I-PER, O, O, O... \n", + "46 [B-PER, O, O, O, O, O, O, O, O, O, O, O, O, O,... \n", + "47 [B-LOC, O, O, O, O, O, O, O, O, O, O, O, O, O,... \n", + "48 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O] \n", + "49 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ... " + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[41:50]" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "id": "XBvlZok5ntB7" + }, + "outputs": [], + "source": [ + "time = 0\n", + "org_idx = []\n", + "token = []\n", + "tag = []\n", + "for i in (df.index):\n", + " for a in range(len(df.ner_tags[i])):\n", + " # if df.ner_tags[i][a] == 6 or df.ner_tags[i][a] == 5:\n", + " if df.ner_labels[i][a] != 'O':\n", + " token.append(df.tokens[i][[a]])\n", + " tag.append(df.ner_labels[i][a])\n", + " org_idx.append(i)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 303 + }, + "id": "xcw4uS1lngk5", + "outputId": "a978a005-50aa-458e-e48a-3fda6e992b25" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
count
tag
B-PER7479
B-LOC6244
I-PER3522
I-LOC2783
I-ORG2055
B-ORG1212
\n", + "

" + ], + "text/plain": [ + "tag\n", + "B-PER 7479\n", + "B-LOC 6244\n", + "I-PER 3522\n", + "I-LOC 2783\n", + "I-ORG 2055\n", + "B-ORG 1212\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tag_data = pd.DataFrame({'org_idx': org_idx, 'token': token, 'tag': tag})\n", + "tag_data.tag.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 607 + }, + "id": "QnAnqvw6l5-l", + "outputId": "b4a7cd6c-c003-4cd7-ed74-6141d5336fd2" + }, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "tag_counts = tag_data.tag.value_counts()\n", + "\n", + "plt.figure(figsize=(10, 6))\n", + "tag_counts.plot(kind='bar')\n", + "plt.title('Distribution of NER Label Frequency')\n", + "plt.xlabel('NER Label')\n", + "plt.ylabel('Frequency')\n", + "plt.xticks(rotation=45, ha='right')\n", + "plt.tight_layout()\n", + "plt.show()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cZdD8L9fqsDV" + }, + "source": [ + "## NOTE\n", + "\n", + "\n", + "1. Phân bố độ dài câu\n", + "2. \n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 607 + }, + "id": "2mkmZj6xqqwp", + "outputId": "0836d9f9-099d-4f28-f402-9da1e444639f" + }, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df['token_lengths'] = df['tokens'].apply(lambda token_list: [len(token) for token in token_list])\n", + "\n", + "# Flatten the list of token lengths for all rows\n", + "all_token_lengths = [length for sublist in df['token_lengths'] for length in sublist]\n", + "\n", + "# Convert to a pandas Series to easily get the distribution\n", + "token_length_series = pd.Series(all_token_lengths)\n", + "\n", + "# Plot the distribution of token lengths\n", + "plt.figure(figsize=(12, 6))\n", + "token_length_series.hist(bins=range(min(token_length_series), max(token_length_series) + 1), edgecolor='black', align='left')\n", + "plt.title('Distribution of Token Lengths')\n", + "plt.xlabel('Token Length')\n", + "plt.ylabel('Frequency')\n", + "plt.xticks(range(min(token_length_series), max(token_length_series) + 1))\n", + "plt.tight_layout()\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 798 + }, + "id": "48udWdKjxUcw", + "outputId": "c593638c-b7c0-4464-c477-b1ec2bf41463" + }, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Descriptive statistics for sentence lengths:\n", + "count 16858.000000\n", + "mean 21.912445\n", + "std 12.940344\n", + "min 1.000000\n", + "25% 13.000000\n", + "50% 20.000000\n", + "75% 29.000000\n", + "max 118.000000\n", + "Name: sentence_length, dtype: float64\n" + ] + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "df['sentence_length'] = df['tokens'].apply(len)\n", + "\n", + "plt.figure(figsize=(12, 6))\n", + "df['sentence_length'].hist(bins=50, edgecolor='black')\n", + "plt.title('Distribution of Sentence Lengths')\n", + "plt.xlabel('Sentence Length (number of tokens)')\n", + "plt.ylabel('Frequency')\n", + "plt.tight_layout()\n", + "plt.show()\n", + "\n", + "print(\"\\nDescriptive statistics for sentence lengths:\")\n", + "print(df['sentence_length'].describe())" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 597 + }, + "id": "r7RqJtBDoIaC", + "outputId": "90317572-ab0f-45df-a6d9-8c026f45df5a" + }, + "outputs": [ + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "summary": "{\n \"name\": \"df\",\n \"rows\": 16858,\n \"fields\": [\n {\n \"column\": \"tokens\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"text_withseg\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 16787,\n \"samples\": [\n \"T\\u00ednh th\\u00f4ng_minh , l\\u1ea1i t\\u00f2_m\\u00f2 , anh Ki\\u1ec7m b\\u1eaft_\\u0111\\u1ea7u \\u0111i \\u0111\\u1ebfn c\\u00e1c x\\u01b0\\u1edfng c\\u01a1_kh\\u00ed \\u0111\\u1ec3 quan_s\\u00e1t c\\u00e1c lo\\u1ea1i m\\u00e1y_m\\u00f3c , r\\u1ed3i v\\u1ec1 nh\\u00e0 suy_ngh\\u0129 v\\u00e0 c\\u1ea7m b\\u00fat v\\u1ebd ph\\u00e1c_ho\\u1ea1 ra c\\u00e1i m\\u00e1y v\\u00fat g\\u1ea1o .\",\n 
\"V\\u1eady th\\u00ec , h\\u1ecd c\\u1ea7n ph\\u1ea3i \\u0111\\u01b0\\u1ee3c gi\\u00fap_\\u0111\\u1ee1 , ph\\u1ea3i \\u0111\\u01b0\\u1ee3c s\\u1ed1ng \\u0111\\u00e0ng_ho\\u00e0ng , ph\\u1ea3i \\u0111\\u01b0\\u1ee3c l\\u00e0m ng\\u01b0\\u1eddi d\\u00f9 ch\\u1ec9 l\\u00e0 nh\\u1eefng ng\\u00e0y cu\\u1ed1i_c\\u00f9ng .\",\n \"Nhi\\u1ec1u ng\\u01b0\\u1eddi th\\u00f4ng_d\\u1ecbch c\\u00f9ng th\\u1eddi v\\u1edbi Nguy\\u1ec5n Trung Hi\\u1ebfu c\\u0169ng \\u0111\\u00e3 ch\\u1ebft trong khi th\\u1ef1c_hi\\u1ec7n nhi\\u1ec7m_v\\u1ee5 t\\u1ea1i chi\\u1ebfn_tr\\u01b0\\u1eddng ho\\u1eb7c tr\\u00ean \\u0111\\u01b0\\u1eddng h\\u00e0nh_qu\\u00e2n .\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"text_raw\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 16785,\n \"samples\": [\n \"Trong kho\\u1ea3ng th\\u1eddi gian \\u0111\\u00f3 ch\\u1ecb c\\u1ed1 c\\u00f4ng t\\u1ef1 h\\u1ecdc ti\\u1ebfng Anh .\",\n \"Sau \\u0111\\u00f3 , ch\\u00ednh b\\u00e0 Susan \\u0111\\u00e3 \\u0111\\u01b0a Mai l\\u00ean h\\u1ecdc \\u0111\\u1ea1i h\\u1ecdc , m\\u1ed7i n\\u0103m chu c\\u1ea5p cho c\\u00f4 30.000 USD .\",\n \"T\\u1eeb r\\u1ea5t l\\u00e2u r\\u1ed3i t\\u00f4i v\\u1eabn ngh\\u0129 n\\u1ebfu nh\\u01b0 cu\\u1ed1n s\\u00e1ch \\u0111\\u01b0\\u1ee3c xu\\u1ea5t b\\u1ea3n , ho\\u1eb7c ng\\u01b0\\u1eddi ta l\\u00e0m phim v\\u1ec1 n\\u00f3 th\\u00ec t\\u00f4i s\\u1ebd d\\u00f9ng s\\u1ed1 ti\\u1ec1n b\\u00e1n s\\u00e1ch \\u0111\\u1ec3 thi\\u1ebft l\\u1eadp m\\u1ed9t s\\u1ed1 gi\\u01b0\\u1eddng b\\u1ec7nh t\\u1ea1i H\\u00e0 N\\u1ed9i .\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"ner_tags\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"ner_labels\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"token_lengths\",\n \"properties\": {\n \"dtype\": 
\"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"sentence_length\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 12,\n \"min\": 1,\n \"max\": 118,\n \"num_unique_values\": 99,\n \"samples\": [\n 68,\n 23,\n 96\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", + "type": "dataframe", + "variable_name": "df" + }, + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
tokenstext_withsegtext_rawner_tagsner_labelstoken_lengthssentence_length
0[Không_khí, thật, náo_nhiệt, .]Không_khí thật náo_nhiệt .Không khí thật náo nhiệt .[0, 0, 0, 0][O, O, O, O][9, 4, 9, 1]4
1[Chị, Lãnh, và, Xăng, ra, đi, ,, mình, đứng, n...Chị Lãnh và Xăng ra đi , mình đứng nhìn hai ch...Chị Lãnh và Xăng ra đi , mình đứng nhìn hai ch...[0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[O, B-PER, O, B-PER, O, O, O, O, O, O, O, O, O...[3, 4, 2, 4, 2, 2, 1, 4, 4, 4, 3, 3, 4, 3, 4, ...31
2[Suy_tính, mãi, ,, khóc, mãi, rồi, Phúc, lấy, ...Suy_tính mãi , khóc mãi rồi Phúc lấy ra tờ giấ...Suy tính mãi , khóc mãi rồi Phúc lấy ra tờ giấ...[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...[O, O, O, O, O, O, B-PER, O, O, O, O, O, O, O,...[8, 3, 1, 4, 3, 3, 4, 3, 2, 2, 4, 1, 3, 3, 1, ...33
3[Hoà, bảo, hồi, mới, qua, đâu, có, biết, nấu_n...Hoà bảo hồi mới qua đâu có biết nấu_nướng gì ,...Hoà bảo hồi mới qua đâu có biết nấu nướng gì ,...[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, ...[B-PER, O, O, O, O, O, O, O, O, O, O, O, O, B-...[3, 3, 3, 3, 3, 3, 2, 4, 9, 2, 1, 3, 3, 3, 3, ...43
4[Nhật_ký, của, thuyền_viên, .]Nhật_ký của thuyền_viên .Nhật ký của thuyền viên .[0, 0, 0, 0][O, O, O, O][7, 3, 11, 1]4
........................
16853[Nghe, thấy, đã, ghê_ghê, nhưng, Nhiêu, chưa, ...Nghe thấy đã ghê_ghê nhưng Nhiêu chưa được tườ...Nghe thấy đã ghê ghê nhưng Nhiêu chưa được tườ...[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...[O, O, O, O, O, B-PER, O, O, O, O, O, O, O, O,...[4, 4, 2, 7, 5, 5, 4, 4, 5, 1, 3, 3, 3, 4, 5, ...21
16854[Nhưng, mọi, chuyện, không, dừng, ở, đó, .]Nhưng mọi chuyện không dừng ở đó .Nhưng mọi chuyện không dừng ở đó .[0, 0, 0, 0, 0, 0, 0, 0][O, O, O, O, O, O, O, O][5, 3, 6, 5, 4, 1, 2, 1]8
16855[Hoà, bảo, thời_gian, đầu, mặc_cảm, lắm, ,, ở,...Hoà bảo thời_gian đầu mặc_cảm lắm , ở trong nh...Hoà bảo thời gian đầu mặc cảm lắm , ở trong nh...[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[B-PER, O, O, O, O, O, O, O, O, O, O, O, O, O,...[3, 3, 9, 3, 7, 3, 1, 1, 5, 3, 8, 2, 5, 4, 3, ...29
16856[Biết_bao, người, đã, tình_nguyện, hiến_dâng, ...Biết_bao người đã tình_nguyện hiến_dâng cả cuộ...Biết bao người đã tình nguyện hiến dâng cả cuộ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0][O, O, O, O, O, O, O, O, O, O, O, O, O, O, O][8, 5, 2, 11, 9, 2, 8, 4, 2, 3, 3, 7, 1, 5, 1]15
16857[Trên, đây, mới, là, “, thành_tích, ”, tiêu, t...Trên đây mới là “ thành_tích ” tiêu tiền của m...Trên đây mới là “ thành tích ” tiêu tiền của m...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...[4, 3, 3, 2, 1, 10, 1, 4, 4, 3, 7, 5, 2, 5, 9,...22
\n", + "

16858 rows × 7 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + " tokens \\\n", + "0 [Không_khí, thật, náo_nhiệt, .] \n", + "1 [Chị, Lãnh, và, Xăng, ra, đi, ,, mình, đứng, n... \n", + "2 [Suy_tính, mãi, ,, khóc, mãi, rồi, Phúc, lấy, ... \n", + "3 [Hoà, bảo, hồi, mới, qua, đâu, có, biết, nấu_n... \n", + "4 [Nhật_ký, của, thuyền_viên, .] \n", + "... ... \n", + "16853 [Nghe, thấy, đã, ghê_ghê, nhưng, Nhiêu, chưa, ... \n", + "16854 [Nhưng, mọi, chuyện, không, dừng, ở, đó, .] \n", + "16855 [Hoà, bảo, thời_gian, đầu, mặc_cảm, lắm, ,, ở,... \n", + "16856 [Biết_bao, người, đã, tình_nguyện, hiến_dâng, ... \n", + "16857 [Trên, đây, mới, là, “, thành_tích, ”, tiêu, t... \n", + "\n", + " text_withseg \\\n", + "0 Không_khí thật náo_nhiệt . \n", + "1 Chị Lãnh và Xăng ra đi , mình đứng nhìn hai ch... \n", + "2 Suy_tính mãi , khóc mãi rồi Phúc lấy ra tờ giấ... \n", + "3 Hoà bảo hồi mới qua đâu có biết nấu_nướng gì ,... \n", + "4 Nhật_ký của thuyền_viên . \n", + "... ... \n", + "16853 Nghe thấy đã ghê_ghê nhưng Nhiêu chưa được tườ... \n", + "16854 Nhưng mọi chuyện không dừng ở đó . \n", + "16855 Hoà bảo thời_gian đầu mặc_cảm lắm , ở trong nh... \n", + "16856 Biết_bao người đã tình_nguyện hiến_dâng cả cuộ... \n", + "16857 Trên đây mới là “ thành_tích ” tiêu tiền của m... \n", + "\n", + " text_raw \\\n", + "0 Không khí thật náo nhiệt . \n", + "1 Chị Lãnh và Xăng ra đi , mình đứng nhìn hai ch... \n", + "2 Suy tính mãi , khóc mãi rồi Phúc lấy ra tờ giấ... \n", + "3 Hoà bảo hồi mới qua đâu có biết nấu nướng gì ,... \n", + "4 Nhật ký của thuyền viên . \n", + "... ... \n", + "16853 Nghe thấy đã ghê ghê nhưng Nhiêu chưa được tườ... \n", + "16854 Nhưng mọi chuyện không dừng ở đó . \n", + "16855 Hoà bảo thời gian đầu mặc cảm lắm , ở trong nh... \n", + "16856 Biết bao người đã tình nguyện hiến dâng cả cuộ... \n", + "16857 Trên đây mới là “ thành tích ” tiêu tiền của m... \n", + "\n", + " ner_tags \\\n", + "0 [0, 0, 0, 0] \n", + "1 [0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 
\n", + "2 [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "3 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, ... \n", + "4 [0, 0, 0, 0] \n", + "... ... \n", + "16853 [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, ... \n", + "16854 [0, 0, 0, 0, 0, 0, 0, 0] \n", + "16855 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "16856 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] \n", + "16857 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "\n", + " ner_labels \\\n", + "0 [O, O, O, O] \n", + "1 [O, B-PER, O, B-PER, O, O, O, O, O, O, O, O, O... \n", + "2 [O, O, O, O, O, O, B-PER, O, O, O, O, O, O, O,... \n", + "3 [B-PER, O, O, O, O, O, O, O, O, O, O, O, O, B-... \n", + "4 [O, O, O, O] \n", + "... ... \n", + "16853 [O, O, O, O, O, B-PER, O, O, O, O, O, O, O, O,... \n", + "16854 [O, O, O, O, O, O, O, O] \n", + "16855 [B-PER, O, O, O, O, O, O, O, O, O, O, O, O, O,... \n", + "16856 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O] \n", + "16857 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ... \n", + "\n", + " token_lengths sentence_length \n", + "0 [9, 4, 9, 1] 4 \n", + "1 [3, 4, 2, 4, 2, 2, 1, 4, 4, 4, 3, 3, 4, 3, 4, ... 31 \n", + "2 [8, 3, 1, 4, 3, 3, 4, 3, 2, 2, 4, 1, 3, 3, 1, ... 33 \n", + "3 [3, 3, 3, 3, 3, 3, 2, 4, 9, 2, 1, 3, 3, 3, 3, ... 43 \n", + "4 [7, 3, 11, 1] 4 \n", + "... ... ... \n", + "16853 [4, 4, 2, 7, 5, 5, 4, 4, 5, 1, 3, 3, 3, 4, 5, ... 21 \n", + "16854 [5, 3, 6, 5, 4, 1, 2, 1] 8 \n", + "16855 [3, 3, 9, 3, 7, 3, 1, 1, 5, 3, 8, 2, 5, 4, 3, ... 29 \n", + "16856 [8, 5, 2, 11, 9, 2, 8, 4, 2, 3, 3, 7, 1, 5, 1] 15 \n", + "16857 [4, 3, 3, 2, 1, 10, 1, 4, 4, 3, 7, 5, 2, 5, 9,... 
22 \n", + "\n", + "[16858 rows x 7 columns]" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ddfgAoUkfSGa", + "outputId": "e3814e8e-7354-4bd7-b7bb-bd3bcf22310d" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Không_khí\n", + "thật\n", + "náo_nhiệt\n", + ".\n" + ] + } + ], + "source": [ + "times = 1\n", + "for i, row in df.iterrows():\n", + " token = row['tokens']\n", + " for j in token:\n", + " print(j)\n", + " times -= 1\n", + " if times == 0:\n", + " break" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5QAdc0pBzymb" + }, + "source": [ + "Bởi vì anh bạn của tôi đã thiếu hướng dẫn sử dụng nên tôi sẽ bổ sung" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jwKHJtvtz6ge" + }, + "source": [ + "### Cài đặt java (Bắt buộc)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "sIipHmP1zxlj", + "outputId": "1c3fe929-198e-48c2-8b4c-a31ebfac9749" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reading package lists... Done\n", + "Building dependency tree... Done\n", + "Reading state information... 
Done\n", + "openjdk-11-jdk is already the newest version (11.0.27+6~us1-0ubuntu1~22.04).\n", + "0 upgraded, 0 newly installed, 0 to remove and 1 not upgraded.\n" + ] + } + ], + "source": [ + "!apt-get install -y openjdk-11-jdk" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Y9S13MGD0DBu" + }, + "source": [ + "#### Cài đặt model" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "focvsL3lv6ZT", + "outputId": "e75c4e2e-d7cc-4126-ebc0-d966d84a5bcc" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: py_vncorenlp in /usr/local/lib/python3.11/dist-packages (0.1.4)\n", + "Requirement already satisfied: pyjnius in /usr/local/lib/python3.11/dist-packages (from py_vncorenlp) (1.6.1)\n" + ] + } + ], + "source": [ + "pip install py_vncorenlp" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9OFgp53U0HPq" + }, + "source": [ + "###### Tạo thư mục và tải xuống mô hình" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iiOcs0dwARjQ" + }, + "source": [] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 244, + "referenced_widgets": [ + "132e7ba656334ec59ac72679c126bf27", + "bb767d0faca34e619107546b439b607f", + "1b9c56a6995a40d59f64bccc7f679e07", + "3c6d843da9bf44d3b971c4443c44a255", + "220afa91ce9f4d67934243437259faa6", + "6b42472a431141e182c422381ec84106", + "c9e12698a123442093c2bd71d7c5775a", + "62cb290b834a4378b43fd8a5424d3557", + "265686ab8b4e4c66bf419f6391cad15e", + "02efe4dc0e34428f87d00571063a2342", + "3f65300df13246168e409be8852ac5ea", + "19382d4c6ac74aa190a891fa855a8fdd", + "84087d814eb9493693cbba7e84db4a62", + "32bcd44ab58e4349b66ccb144b7a5eb1", + "a9b4d1e29f7645fd90a86c407bb97eb9", + "858e486c77e94ef0a0720e4d204e10b9", + "09d63d9c6e3b4befb2d13ed56eb1fecf", + 
"d5d0d780b2464843a1d380f0a4c0fbbc", + "fc5c675a5c0a4058a683318491f6e6da", + "4958e837d8664a298f32b7e88825fbf7", + "82fe50de6b0945198fae9f532324e7fd", + "222c29ec297f400ea78c758563378f8a", + "8d018b3f1f2e4ac699f5051019b9af74", + "7d81164902f44d289148260c2584782e", + "5a809d0a194c4e04a3a3f99ca4d98b5c", + "35144d338b1249278eef607e9f84c57a", + "b283351b3c9f4b1fb9801e2bafcc3ac7", + "d32f4b409484471891cf68e295afc1e5", + "8b865a52c0c1492283a00e7fef9a2697", + "b73314b6559c492f851262eba512aa3c", + "53dbc1a7391648d0a033644c2a97b4c2", + "7a39143f21d2438e80e32a7f0ac407a4", + "3dcbe597c2af44ab8672a58304317ff9", + "e13525732c2f4d8cad00e52b80f7e2f4", + "cc286fe8ebb64f03acf68fc630cedfe0", + "8b0f3136fecc4965a76df8c8dee661ce", + "16a99b74cefa4f0e870e93776897feca", + "0b42f930dcd949ee9df45dc2cd860214", + "fec4bac1a3344aad9f168be02dba98f2", + "5958c45737064862858bf130b105ec08", + "f4f3815411b24674bfb51ab242832999", + "83930005bf7443b08835dae0ceb66457", + "539f3fb7d9764596b3496609617e9df4", + "70035b06e7c14e54ba0a06fb59c66546", + "6954c314e1e844e49407fb2236ea76a9", + "31566827dfab451db0ca2a06458efaa3", + "f8ef07a5c6fd4bca98efdf435d883f4c", + "c67c2df8fc294b9b8fbf0055dd2ded6c", + "f4308d3a141e454cb2c1602cba4a06aa", + "2b11610bd8d94a81863e99debe1e6905", + "71c48602225b41449f5b5cbc53d519dc", + "ed4e3e35f657476c82ff20a038812243", + "eb355ce836b34a9da087fee621132e94", + "6063b69b0af744d9b31dc9d20c029c99", + "3d808cb829044c01b8cbdca369a16e3a", + "60443f69792b4726b8cf2a038a241022", + "dc92fe258486439394f2841363d4286b", + "bc2cff635fb549549afb10cd81ec252a", + "583d9194818b452c8f8531005a5802ba", + "8fa670c708c34020b46f4db42341779b", + "a7a6d4796d0349b99abb511f3ff9823b", + "8ef031c9d4b84a62a20b47d338a3dc2d", + "e29a696703cc4f87a12fc6e98f2cbcf8", + "3116450c8884414ab14124212ba0b3ef", + "7a1d4efa4f394010b9e8f6b72f457f0c", + "aa5355a04ee34ba99567184381985f82" + ] + }, + "id": "8_ABJkA6uw3M", + "outputId": "a27de901-5886-41dd-df9e-7f5383d21b81" + }, + "outputs": [ + { + "data": { + 
"application/vnd.jupyter.widget-view+json": { + "model_id": "132e7ba656334ec59ac72679c126bf27", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "config.json: 0%| | 0.00/678 [00:00\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
tokenstext_withsegtext_rawner_tagsner_labelstoken_lengthssentence_lengthencoded
0[Không_khí, thật, náo_nhiệt, .]Không_khí thật náo_nhiệt .Không khí thật náo nhiệt .[0, 0, 0, 0][O, O, O, O][9, 4, 9, 1]4[10591, 520, 13648, 5]
1[Chị, Lãnh, và, Xăng, ra, đi, ,, mình, đứng, n...Chị Lãnh và Xăng ra đi , mình đứng nhìn hai ch...Chị Lãnh và Xăng ra đi , mình đứng nhìn hai ch...[0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[O, B-PER, O, B-PER, O, O, O, O, O, O, O, O, O...[3, 4, 2, 4, 2, 2, 1, 4, 4, 4, 3, 3, 4, 3, 4, ...31[1108, 19703, 6, 28163, 40, 57, 4, 68, 414, 36...
2[Suy_tính, mãi, ,, khóc, mãi, rồi, Phúc, lấy, ...Suy_tính mãi , khóc mãi rồi Phúc lấy ra tờ giấ...Suy tính mãi , khóc mãi rồi Phúc lấy ra tờ giấ...[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...[O, O, O, O, O, O, B-PER, O, O, O, O, O, O, O,...[8, 3, 1, 4, 3, 3, 4, 3, 2, 2, 4, 1, 3, 3, 1, ...33[39659, 30554, 1997, 4, 1521, 1997, 182, 2777,...
3[Hoà, bảo, hồi, mới, qua, đâu, có, biết, nấu_n...Hoà bảo hồi mới qua đâu có biết nấu_nướng gì ,...Hoà bảo hồi mới qua đâu có biết nấu nướng gì ,...[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, ...[B-PER, O, O, O, O, O, O, O, O, O, O, O, O, B-...[3, 3, 3, 3, 3, 3, 2, 4, 9, 2, 1, 3, 3, 3, 3, ...43[2042, 1218, 857, 60, 89, 602, 10, 55, 9880, 1...
4[Nhật_ký, của, thuyền_viên, .]Nhật_ký của thuyền_viên .Nhật ký của thuyền viên .[0, 0, 0, 0][O, O, O, O][7, 3, 11, 1]4[17188, 7, 6494, 5]
...........................
16853[Nghe, thấy, đã, ghê_ghê, nhưng, Nhiêu, chưa, ...Nghe thấy đã ghê_ghê nhưng Nhiêu chưa được tườ...Nghe thấy đã ghê ghê nhưng Nhiêu chưa được tườ...[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...[O, O, O, O, O, B-PER, O, O, O, O, O, O, O, O,...[4, 4, 2, 7, 5, 5, 4, 4, 5, 1, 3, 3, 3, 4, 5, ...21[3656, 108, 14, 3, 17143, 51, 24733, 102, 11, ...
16854[Nhưng, mọi, chuyện, không, dừng, ở, đó, .]Nhưng mọi chuyện không dừng ở đó .Nhưng mọi chuyện không dừng ở đó .[0, 0, 0, 0, 0, 0, 0, 0][O, O, O, O, O, O, O, O][5, 3, 6, 5, 4, 1, 2, 1]8[293, 207, 344, 17, 772, 25, 37, 5]
16855[Hoà, bảo, thời_gian, đầu, mặc_cảm, lắm, ,, ở,...Hoà bảo thời_gian đầu mặc_cảm lắm , ở trong nh...Hoà bảo thời gian đầu mặc cảm lắm , ở trong nh...[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[B-PER, O, O, O, O, O, O, O, O, O, O, O, O, O,...[3, 3, 9, 3, 7, 3, 1, 1, 5, 3, 8, 2, 5, 4, 3, ...29[2042, 1218, 130, 127, 11878, 957, 4, 25, 12, ...
16856[Biết_bao, người, đã, tình_nguyện, hiến_dâng, ...Biết_bao người đã tình_nguyện hiến_dâng cả cuộ...Biết bao người đã tình nguyện hiến dâng cả cuộ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0][O, O, O, O, O, O, O, O, O, O, O, O, O, O, O][8, 5, 2, 11, 9, 2, 8, 4, 2, 3, 3, 7, 1, 5, 1]15[53464, 18, 14, 4047, 46883, 94, 1679, 68, 90,...
16857[Trên, đây, mới, là, “, thành_tích, ”, tiêu, t...Trên đây mới là “ thành_tích ” tiêu tiền của m...Trên đây mới là “ thành tích ” tiêu tiền của m...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...[4, 3, 3, 2, 1, 10, 1, 4, 4, 3, 7, 5, 2, 5, 9,...22[880, 97, 60, 8, 556, 1417, 564, 2140, 123, 7,...
\n", + "

16858 rows × 8 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + "\n", + "
\n", + " \n" + ], + "text/plain": [ + " tokens \\\n", + "0 [Không_khí, thật, náo_nhiệt, .] \n", + "1 [Chị, Lãnh, và, Xăng, ra, đi, ,, mình, đứng, n... \n", + "2 [Suy_tính, mãi, ,, khóc, mãi, rồi, Phúc, lấy, ... \n", + "3 [Hoà, bảo, hồi, mới, qua, đâu, có, biết, nấu_n... \n", + "4 [Nhật_ký, của, thuyền_viên, .] \n", + "... ... \n", + "16853 [Nghe, thấy, đã, ghê_ghê, nhưng, Nhiêu, chưa, ... \n", + "16854 [Nhưng, mọi, chuyện, không, dừng, ở, đó, .] \n", + "16855 [Hoà, bảo, thời_gian, đầu, mặc_cảm, lắm, ,, ở,... \n", + "16856 [Biết_bao, người, đã, tình_nguyện, hiến_dâng, ... \n", + "16857 [Trên, đây, mới, là, “, thành_tích, ”, tiêu, t... \n", + "\n", + " text_withseg \\\n", + "0 Không_khí thật náo_nhiệt . \n", + "1 Chị Lãnh và Xăng ra đi , mình đứng nhìn hai ch... \n", + "2 Suy_tính mãi , khóc mãi rồi Phúc lấy ra tờ giấ... \n", + "3 Hoà bảo hồi mới qua đâu có biết nấu_nướng gì ,... \n", + "4 Nhật_ký của thuyền_viên . \n", + "... ... \n", + "16853 Nghe thấy đã ghê_ghê nhưng Nhiêu chưa được tườ... \n", + "16854 Nhưng mọi chuyện không dừng ở đó . \n", + "16855 Hoà bảo thời_gian đầu mặc_cảm lắm , ở trong nh... \n", + "16856 Biết_bao người đã tình_nguyện hiến_dâng cả cuộ... \n", + "16857 Trên đây mới là “ thành_tích ” tiêu tiền của m... \n", + "\n", + " text_raw \\\n", + "0 Không khí thật náo nhiệt . \n", + "1 Chị Lãnh và Xăng ra đi , mình đứng nhìn hai ch... \n", + "2 Suy tính mãi , khóc mãi rồi Phúc lấy ra tờ giấ... \n", + "3 Hoà bảo hồi mới qua đâu có biết nấu nướng gì ,... \n", + "4 Nhật ký của thuyền viên . \n", + "... ... \n", + "16853 Nghe thấy đã ghê ghê nhưng Nhiêu chưa được tườ... \n", + "16854 Nhưng mọi chuyện không dừng ở đó . \n", + "16855 Hoà bảo thời gian đầu mặc cảm lắm , ở trong nh... \n", + "16856 Biết bao người đã tình nguyện hiến dâng cả cuộ... \n", + "16857 Trên đây mới là “ thành tích ” tiêu tiền của m... \n", + "\n", + " ner_tags \\\n", + "0 [0, 0, 0, 0] \n", + "1 [0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 
\n", + "2 [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "3 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, ... \n", + "4 [0, 0, 0, 0] \n", + "... ... \n", + "16853 [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, ... \n", + "16854 [0, 0, 0, 0, 0, 0, 0, 0] \n", + "16855 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "16856 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] \n", + "16857 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "\n", + " ner_labels \\\n", + "0 [O, O, O, O] \n", + "1 [O, B-PER, O, B-PER, O, O, O, O, O, O, O, O, O... \n", + "2 [O, O, O, O, O, O, B-PER, O, O, O, O, O, O, O,... \n", + "3 [B-PER, O, O, O, O, O, O, O, O, O, O, O, O, B-... \n", + "4 [O, O, O, O] \n", + "... ... \n", + "16853 [O, O, O, O, O, B-PER, O, O, O, O, O, O, O, O,... \n", + "16854 [O, O, O, O, O, O, O, O] \n", + "16855 [B-PER, O, O, O, O, O, O, O, O, O, O, O, O, O,... \n", + "16856 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O] \n", + "16857 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ... \n", + "\n", + " token_lengths sentence_length \\\n", + "0 [9, 4, 9, 1] 4 \n", + "1 [3, 4, 2, 4, 2, 2, 1, 4, 4, 4, 3, 3, 4, 3, 4, ... 31 \n", + "2 [8, 3, 1, 4, 3, 3, 4, 3, 2, 2, 4, 1, 3, 3, 1, ... 33 \n", + "3 [3, 3, 3, 3, 3, 3, 2, 4, 9, 2, 1, 3, 3, 3, 3, ... 43 \n", + "4 [7, 3, 11, 1] 4 \n", + "... ... ... \n", + "16853 [4, 4, 2, 7, 5, 5, 4, 4, 5, 1, 3, 3, 3, 4, 5, ... 21 \n", + "16854 [5, 3, 6, 5, 4, 1, 2, 1] 8 \n", + "16855 [3, 3, 9, 3, 7, 3, 1, 1, 5, 3, 8, 2, 5, 4, 3, ... 29 \n", + "16856 [8, 5, 2, 11, 9, 2, 8, 4, 2, 3, 3, 7, 1, 5, 1] 15 \n", + "16857 [4, 3, 3, 2, 1, 10, 1, 4, 4, 3, 7, 5, 2, 5, 9,... 22 \n", + "\n", + " encoded \n", + "0 [10591, 520, 13648, 5] \n", + "1 [1108, 19703, 6, 28163, 40, 57, 4, 68, 414, 36... \n", + "2 [39659, 30554, 1997, 4, 1521, 1997, 182, 2777,... \n", + "3 [2042, 1218, 857, 60, 89, 602, 10, 55, 9880, 1... \n", + "4 [17188, 7, 6494, 5] \n", + "... ... \n", + "16853 [3656, 108, 14, 3, 17143, 51, 24733, 102, 11, ... 
\n", + "16854 [293, 207, 344, 17, 772, 25, 37, 5] \n", + "16855 [2042, 1218, 130, 127, 11878, 957, 4, 25, 12, ... \n", + "16856 [53464, 18, 14, 4047, 46883, 94, 1679, 68, 90,... \n", + "16857 [880, 97, 60, 8, 556, 1417, 564, 2140, 123, 7,... \n", + "\n", + "[16858 rows x 8 columns]" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "id": "knRBbpNcUk1Q" + }, + "outputs": [], + "source": [ + "# Machine Learning imports for Random Forest NER\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.model_selection import cross_val_predict, cross_val_score, train_test_split\n", + "from sklearn.metrics import classification_report, confusion_matrix\n", + "from sklearn.preprocessing import LabelEncoder\n", + "import warnings\n", + "import numpy as np\n", + "warnings.filterwarnings('ignore')" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "3BJj02AgCEtD", + "outputId": "15cf83aa-01e6-455a-a663-ded3a549ef33" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ PhoBERT feature extraction function created\n", + "📊 Creates 40-dimensional feature vectors per token\n", + "🎯 Combines linguistic patterns, semantics, and context\n", + "🇻🇳 Optimized for Vietnamese NER tasks\n" + ] + } + ], + "source": [ + "# CELL 2: PhoBERT Feature Extraction Function\n", + "# ============================================\n", + "\n", + "def extract_phobert_token_features(token, label, token_id, position, sentence_length, prev_token, next_token, tokenizer):\n", + " \"\"\"\n", + " CORE FEATURE EXTRACTION: Create rich feature vector for each token\n", + "\n", + " This function creates a 48-dimensional feature vector combining:\n", + " 1. Basic word characteristics (8 features)\n", + " 2.
PhoBERT tokenizer information (4 features)\n", + " 3. Vietnamese-specific patterns (18 features)\n", + " 4. Position and context information (4 features)\n", + " 5. Previous token context (4 features)\n", + " 6. Next token context (4 features)\n", + " 7. Character-level patterns (6 features)\n", + "\n", + " Total: 48 features per token\n", + " \"\"\"\n", + " features = []\n", + "\n", + " # ==========================================\n", + " # 1. BASIC WORD FEATURES (8 features)\n", + " # ==========================================\n", + " # These capture fundamental word characteristics\n", + " features.extend([\n", + " token.istitle(), # Feature 1: Is first letter capitalized? (Important for proper nouns)\n", + " token.islower(), # Feature 2: Is all lowercase? (Common words)\n", + " token.isupper(), # Feature 3: Is all uppercase? (Acronyms, emphasis)\n", + " token.isdigit(), # Feature 4: Is purely numeric? (Dates, quantities)\n", + " token.isalpha(), # Feature 5: Contains only letters? (Pure words)\n", + " len(token), # Feature 6: Word length (longer words often entities)\n", + " len(token) > 5, # Feature 7: Is long word? (Complex entities)\n", + " len(token) < 3, # Feature 8: Is short word? (Articles, prepositions)\n", + " ])\n", + "\n", + " # ==========================================\n", + " # 2. PHOBERT TOKENIZER FEATURES (4 features)\n", + " # ==========================================\n", + " # These leverage PhoBERT's semantic understanding\n", + " features.extend([\n", + " token_id, # Feature 9: PhoBERT token ID (semantic representation)\n", + " token_id % 1000, # Feature 10: Token ID modulo (clustering similar semantics)\n", + " token_id // 1000, # Feature 11: Token ID division (high-level semantic groups)\n", + " len(tokenizer.tokenize(token)), # Feature 12: Number of sub-tokens (complexity measure)\n", + " ])\n", + "\n", + " # ==========================================\n", + " # 3.
VIETNAMESE-SPECIFIC FEATURES (18 features)\n", + " # ==========================================\n", + " # These capture Vietnamese naming and geographic patterns\n", + " features.extend([\n", + " # Common Vietnamese surnames and name patterns\n", + " token.startswith('Ng'), # Feature 13: Nguyễn, Ng (most common Vietnamese surname)\n", + " token.startswith('Tr'), # Feature 14: Trần, Trương (common Vietnamese surnames)\n", + " token.startswith('Lê'), # Feature 15: Lê (common Vietnamese surname)\n", + " token.startswith('Phạm'), # Feature 16: Phạm (common Vietnamese surname)\n", + " token.startswith('Vũ'),\n", + " token.startswith('Phan'),\n", + " token.startswith('Trương'),\n", + " token.startswith('Bùi'),\n", + " token.startswith('Đặng'),\n", + "\n", + "\n", + " # Vietnamese geographic patterns\n", + " token.endswith('nh'), # Feature 17: Hành, Thành (place suffixes)\n", + " token.endswith('ại'), # Feature 18: Hải, Lại (place suffixes)\n", + " token.endswith('ương'), # Feature 19: Hương, Thương (place suffixes)\n", + "\n", + " # Vietnamese location indicators\n", + " 'Thành' in token, # Feature 20: City indicator (Thành phố)\n", + " 'phố' in token.lower(), # Feature 21: City/street indicator\n", + " 'tỉnh' in token.lower(), # Feature 22: Province indicator\n", + " 'quận' in token.lower(),\n", + " 'huyện' in token.lower(),\n", + " 'xã' in token.lower(),\n", + " ])\n", + "\n", + " # ==========================================\n", + " # 4. POSITION AND CONTEXT FEATURES (4 features)\n", + " # ==========================================\n", + " # These capture positional information in sentences\n", + " features.extend([\n", + " position, # Feature 23: Absolute position in sentence\n", + " position / sentence_length, # Feature 24: Relative position (0.0 to 1.0)\n", + " position == 0, # Feature 25: Is first word? (Often capitalized)\n", + " position == sentence_length - 1, # Feature 26: Is last word?
(Punctuation effects)\n", + " ])\n", + "\n", + " # ==========================================\n", + " # 5. PREVIOUS TOKEN CONTEXT (4 features)\n", + " # ==========================================\n", + " # These capture left context for entity boundary detection\n", + " if prev_token:\n", + " prev_token_ids = tokenizer.encode(prev_token, add_special_tokens=False)\n", + " prev_token_id = prev_token_ids[0] if prev_token_ids else 0\n", + " features.extend([\n", + " prev_token.istitle(), # Feature 27: Previous word capitalized?\n", + " prev_token.islower(), # Feature 28: Previous word lowercase?\n", + " prev_token_id % 1000, # Feature 29: Previous token semantic cluster\n", + " len(prev_token), # Feature 30: Previous word length\n", + " ])\n", + " else:\n", + " # Padding for beginning of sentence\n", + " features.extend([False, False, 0, 0])\n", + "\n", + " # ==========================================\n", + " # 6. NEXT TOKEN CONTEXT (4 features)\n", + " # ==========================================\n", + " # These capture right context for entity boundary detection\n", + " if next_token:\n", + " next_token_ids = tokenizer.encode(next_token, add_special_tokens=False)\n", + " next_token_id = next_token_ids[0] if next_token_ids else 0\n", + " features.extend([\n", + " next_token.istitle(), # Feature 31: Next word capitalized?\n", + " next_token.islower(), # Feature 32: Next word lowercase?\n", + " next_token_id % 1000, # Feature 33: Next token semantic cluster\n", + " len(next_token), # Feature 34: Next word length\n", + " ])\n", + " else:\n", + " # Padding for end of sentence\n", + " features.extend([False, False, 0, 0])\n", + "\n", + " # ==========================================\n", + " # 7. 
CHARACTER-LEVEL PATTERNS (6 features)\n", + " # ==========================================\n", + " # These capture fine-grained character patterns\n", + " features.extend([\n", + " any(char.isdigit() for char in token), # Feature 35: Contains any digits?\n", + " '-' in token, # Feature 36: Contains hyphen? (compound words)\n", + " '.' in token, # Feature 37: Contains period? (abbreviations)\n", + " ',' in token, # Feature 38: Contains comma? (lists)\n", + " token.count('_'), # Feature 39: Underscore count (technical terms)\n", + " token.isnumeric(), # Feature 40: Fully numeric?\n", + " ])\n", + "\n", + " # Convert to numpy array with float type for sklearn compatibility\n", + " return np.array(features, dtype=float)\n", + "\n", + "print(\"✅ PhoBERT feature extraction function created\")\n", + "print(\"📊 Creates 40-dimensional feature vectors per token\")\n", + "print(\"🎯 Combines linguistic patterns, semantics, and context\")\n", + "print(\"🇻🇳 Optimized for Vietnamese NER tasks\")\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "jxrKRbBQbBse", + "outputId": "3d7bedf0-44f4-4b32-8d07-bdf293df138f" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PhoBERT data preparation functions created\n" + ] + } + ], + "source": [ + "# Prepare data for PhoBERT-based Random Forest\n", + "def prepare_phobert_features(df, tokenizer):\n", + " \"\"\"\n", + " Extract PhoBERT-based features for Random Forest NER\n", + " \"\"\"\n", + " all_features = []\n", + " all_labels = []\n", + " all_tokens = []\n", + "\n", + " for idx, row in df.iterrows():\n", + " tokens = row['tokens']\n", + " ner_labels = row['ner_labels']\n", + "\n", + " # Join tokens for PhoBERT processing\n", + " sentence_text = \" \".join(tokens)\n", + "\n", + " # Tokenize with PhoBERT\n", + " phobert_tokens = tokenizer.tokenize(sentence_text)\n", + " phobert_ids = 
tokenizer.encode(sentence_text, add_special_tokens=False)\n", + "\n", + " # Align original tokens with PhoBERT tokens\n", + " aligned_features, aligned_labels = align_tokens_with_phobert(\n", + " tokens, ner_labels, phobert_tokens, phobert_ids, tokenizer\n", + " )\n", + "\n", + " all_features.extend(aligned_features)\n", + " all_labels.extend(aligned_labels)\n", + " all_tokens.extend(tokens)\n", + "\n", + " return np.array(all_features), np.array(all_labels), all_tokens\n", + "\n", + "def align_tokens_with_phobert(original_tokens, original_labels, phobert_tokens, phobert_ids, tokenizer):\n", + " \"\"\"\n", + " Align original tokens with PhoBERT sub-tokens and extract features\n", + " \"\"\"\n", + " features = []\n", + " labels = []\n", + "\n", + " # Simple alignment: map each original token to its PhoBERT representation\n", + " token_idx = 0\n", + " phobert_idx = 0\n", + "\n", + " for i, (token, label) in enumerate(zip(original_tokens, original_labels)):\n", + " # Find corresponding PhoBERT tokens for this original token\n", + " token_phobert_ids = tokenizer.encode(token, add_special_tokens=False)\n", + "\n", + " if len(token_phobert_ids) > 0:\n", + " # Use the first sub-token's ID as the main representation\n", + " main_token_id = token_phobert_ids[0]\n", + "\n", + " # Extract features for this token\n", + " token_features = extract_phobert_token_features(\n", + " token, label, main_token_id, i, len(original_tokens),\n", + " original_tokens[i-1] if i > 0 else None,\n", + " original_tokens[i+1] if i < len(original_tokens)-1 else None,\n", + " tokenizer\n", + " )\n", + "\n", + " features.append(token_features)\n", + " labels.append(label)\n", + "\n", + " return features, labels\n", + "\n", + "print(\"PhoBERT data preparation functions created\")" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "WkxDoT21bCBR", + "outputId": "2944f9b4-db37-4e83-e215-cfbcd92bda47" + }, + 
"outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Extracting PhoBERT-based features...\n", + "🚀 Starting PhoBERT feature extraction process...\n", + "📈 This will create rich feature representations for each token\n", + "\n", + "Phase 1: Extracting PhoBERT-based features...\n", + "\n", + "📊 FEATURE EXTRACTION RESULTS:\n", + " • Feature matrix shape: (369400, 48)\n", + " • Total tokens processed: 369,400\n", + " • Features per token: 48\n", + " • Labels shape: (369400,)\n", + "\n", + "📈 FEATURE STATISTICS:\n", + " • Feature matrix memory usage: 135.3 MB\n", + " • Average features per sentence: 21.9\n", + " • Feature range: [0.00, 63893.00]\n", + "\n", + "🏷️ LABEL DISTRIBUTION IN PHOBERT DATASET:\n", + " • B-LOC: 6,244 tokens (1.7%)\n", + " • B-ORG: 1,212 tokens (0.3%)\n", + " • B-PER: 7,479 tokens (2.0%)\n", + " • I-LOC: 2,783 tokens (0.8%)\n", + " • I-ORG: 2,055 tokens (0.6%)\n", + " • I-PER: 3,522 tokens (1.0%)\n", + " • O: 346,105 tokens (93.7%)\n", + "\n", + "Phase 2: Splitting data for training and testing...\n", + "✅ DATA SPLIT COMPLETED:\n", + " • Training set: 295,520 tokens (80.0%)\n", + " • Test set: 73,880 tokens (20.0%)\n", + " • Feature dimensions maintained: 48 features\n", + "\n", + "🎯 Ready for model training!\n" + ] + } + ], + "source": [ + "# Extract PhoBERT-based features\n", + "print(\"Extracting PhoBERT-based features...\")\n", + "X_phobert, y_phobert, tokens_phobert = prepare_phobert_features(df, tokenizer)\n", + "\n", + "# CELL 3: Extract PhoBERT-Based Features from Dataset\n", + "# ===================================================\n", + "\n", + "print(\"🚀 Starting PhoBERT feature extraction process...\")\n", + "print(\"📈 This will create rich feature representations for each token\")\n", + "print()\n", + "\n", + "# Extract PhoBERT-based features from the entire dataset\n", + "print(\"Phase 1: Extracting PhoBERT-based features...\")\n", + "\n", + "print()\n", + "print(\"📊 FEATURE EXTRACTION RESULTS:\")\n", + 
"print(f\" • Feature matrix shape: {X_phobert.shape}\")\n", + "print(f\" • Total tokens processed: {X_phobert.shape[0]:,}\")\n", + "print(f\" • Features per token: {X_phobert.shape[1]}\")\n", + "print(f\" • Labels shape: {y_phobert.shape}\")\n", + "print()\n", + "\n", + "# Analyze feature statistics\n", + "print(\"📈 FEATURE STATISTICS:\")\n", + "print(f\" • Feature matrix memory usage: {X_phobert.nbytes / 1024 / 1024:.1f} MB\")\n", + "print(f\" • Average features per sentence: {X_phobert.shape[0] / len(df):.1f}\")\n", + "print(f\" • Feature range: [{X_phobert.min():.2f}, {X_phobert.max():.2f}]\")\n", + "print()\n", + "\n", + "# Analyze label distribution in PhoBERT dataset\n", + "print(\"🏷️ LABEL DISTRIBUTION IN PHOBERT DATASET:\")\n", + "from collections import Counter\n", + "label_counts = Counter(y_phobert)\n", + "for label, count in sorted(label_counts.items()):\n", + " percentage = (count / len(y_phobert)) * 100\n", + " print(f\" • {label}: {count:,} tokens ({percentage:.1f}%)\")\n", + "print()\n", + "\n", + "# Split data for training and testing\n", + "print(\"Phase 2: Splitting data for training and testing...\")\n", + "X_pho_train, X_pho_test, y_pho_train, y_pho_test = train_test_split(\n", + " X_phobert, y_phobert,\n", + " test_size=0.2, # 80% train, 20% test\n", + " random_state=42, # Reproducible results\n", + " stratify=y_phobert # Maintain label distribution\n", + ")\n", + "\n", + "print(\"✅ DATA SPLIT COMPLETED:\")\n", + "print(f\" • Training set: {X_pho_train.shape[0]:,} tokens ({X_pho_train.shape[0]/X_phobert.shape[0]*100:.1f}%)\")\n", + "print(f\" • Test set: {X_pho_test.shape[0]:,} tokens ({X_pho_test.shape[0]/X_phobert.shape[0]*100:.1f}%)\")\n", + "print(f\" • Feature dimensions maintained: {X_pho_train.shape[1]} features\")\n", + "print()\n", + "print(\"🎯 Ready for model training!\")" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ZQm_nY-KbDbx", + 
"outputId": "04f65fcb-9a11-4dd7-f20a-e17fdfc5f847" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Training PhoBERT-enhanced Random Forest model...\n", + "\n", + "🔧 MODEL CONFIGURATION DETAILS:\n", + " • n_estimators=150: More trees than basic RF for better ensemble\n", + " • max_depth=25: Deeper trees to capture complex feature interactions\n", + " • min_samples_split=3: Lower threshold for more granular splits\n", + " • min_samples_leaf=1: Allow fine-grained leaf nodes\n", + " • max_features='sqrt': Sample √40 ≈ 6 features per split (prevents overfitting)\n", + " • class_weight='balanced': Handle NER tag imbalance automatically\n", + " • n_jobs=-1: Use all CPU cores for parallel training\n", + "\n", + "🚀 Starting model training...\n", + " • Training on 295,520 tokens\n", + " • Using 48 PhoBERT-enhanced features\n", + " • Training 150 decision trees in parallel...\n", + "\n", + "✅ PhoBERT Random Forest model trained successfully!\n", + " • Training time: 2.6 seconds\n", + " • Model memory usage: ~26 MB (estimated)\n", + " • Trees trained: 150\n", + "\n", + "🎯 Model ready for prediction and evaluation!\n" + ] + } + ], + "source": [ + "# Train PhoBERT-enhanced Random Forest model\n", + "print(\"\\nTraining PhoBERT-enhanced Random Forest model...\")\n", + "print()\n", + "\n", + "# Model configuration explanation\n", + "print(\"🔧 MODEL CONFIGURATION DETAILS:\")\n", + "print(\" • n_estimators=150: More trees than basic RF for better ensemble\")\n", + "print(\" • max_depth=25: Deeper trees to capture complex feature interactions\")\n", + "print(\" • min_samples_split=3: Lower threshold for more granular splits\")\n", + "print(\" • min_samples_leaf=1: Allow fine-grained leaf nodes\")\n", + "print(\" • max_features='sqrt': Sample √40 ≈ 6 features per split (prevents overfitting)\")\n", + "print(\" • class_weight='balanced': Handle NER tag imbalance automatically\")\n", + "print(\" • n_jobs=-1: Use all CPU cores for parallel 
training\")\n", + "print()\n", + "\n", + "# Initialize the PhoBERT-enhanced Random Forest model\n", + "phobert_rf_model = RandomForestClassifier(\n", + " n_estimators=150, # More trees for better ensemble performance\n", + " max_depth=25, # Deeper trees for complex PhoBERT feature interactions\n", + " min_samples_split=3, # Lower split threshold for fine-grained decisions\n", + " min_samples_leaf=1, # Allow detailed leaf nodes\n", + " max_features='sqrt', # Feature sampling: √40 ≈ 6 features per split\n", + " random_state=42, # Reproducible results\n", + " n_jobs=-1, # Parallel processing\n", + " class_weight='balanced' # Automatically handle class imbalance\n", + ")\n", + "\n", + "print(\"🚀 Starting model training...\")\n", + "print(f\" • Training on {X_pho_train.shape[0]:,} tokens\")\n", + "print(f\" • Using {X_pho_train.shape[1]} PhoBERT-enhanced features\")\n", + "print(f\" • Training 150 decision trees in parallel...\")\n", + "print()\n", + "\n", + "# Train the model\n", + "import time\n", + "start_time = time.time()\n", + "\n", + "phobert_rf_model.fit(X_pho_train, y_pho_train)\n", + "\n", + "training_time = time.time() - start_time\n", + "\n", + "print(\"✅ PhoBERT Random Forest model trained successfully!\")\n", + "print(f\" • Training time: {training_time:.1f} seconds\")\n", + "print(f\" • Model memory usage: ~{training_time * 10:.0f} MB (estimated)\")\n", + "print(f\" • Trees trained: {phobert_rf_model.n_estimators}\")\n", + "print()\n", + "print(\"🎯 Model ready for prediction and evaluation!\")" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "LtkI-suibEjs", + "outputId": "f392a4f5-5313-434b-92fb-3d8b6f9cbf4b" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "📊 Starting comprehensive model evaluation...\n", + "\n", + "Phase 1: Making predictions on test set...\n", + " • Predicted 73,880 token labels\n", + " • Prediction 
completed in milliseconds\n", + "\n", + "🎯 Overall Accuracy: 0.9825 (98.25%)\n", + "\n", + "📈 DETAILED CLASSIFICATION REPORT:\n", + "============================================================\n", + " precision recall f1-score support\n", + "\n", + " B-LOC 0.85 0.77 0.81 1249\n", + " B-ORG 0.78 0.58 0.66 242\n", + " B-PER 0.79 0.94 0.86 1496\n", + " I-LOC 0.73 0.65 0.69 557\n", + " I-ORG 0.73 0.46 0.57 411\n", + " I-PER 0.87 0.95 0.91 704\n", + " O 0.99 0.99 0.99 69221\n", + "\n", + " accuracy 0.98 73880\n", + " macro avg 0.82 0.76 0.78 73880\n", + "weighted avg 0.98 0.98 0.98 73880\n", + "\n", + "============================================================\n", + "\n", + "📚 METRICS EXPLANATION:\n", + " • Precision: Of predicted entities, how many were correct?\n", + " • Recall: Of actual entities, how many were found?\n", + " • F1-score: Harmonic mean of precision and recall\n", + " • Support: Number of actual instances of each class\n", + "\n", + "Phase 2: Cross-validation for robust performance estimate...\n", + " • Using 5-fold cross-validation\n", + " • This trains 5 different models to avoid overfitting\n", + "\n", + "✅ CROSS-VALIDATION RESULTS:\n", + " • Individual fold scores: ['0.9817', '0.9817', '0.9814', '0.9814', '0.9814']\n", + " • Mean CV F1 score: 0.9815\n", + " • Standard deviation: ±0.0001\n", + " • 95% confidence interval: 0.9815 ± 0.0003\n", + "\n", + "🏆 PhoBERT Random Forest shows consistent performance across folds!\n" + ] + } + ], + "source": [ + "# CELL 5: Evaluate PhoBERT Random Forest Model Performance\n", + "# ========================================================\n", + "\n", + "print(\"📊 Starting comprehensive model evaluation...\")\n", + "print()\n", + "\n", + "# Make predictions on test set\n", + "print(\"Phase 1: Making predictions on test set...\")\n", + "y_pho_pred = phobert_rf_model.predict(X_pho_test)\n", + "\n", + "print(f\" • Predicted {len(y_pho_pred):,} token labels\")\n", + "print(f\" • Prediction completed in 
milliseconds\")\n", + "print()\n", + "\n", + "# Calculate basic accuracy\n", + "accuracy = (y_pho_pred == y_pho_test).mean()\n", + "print(f\"🎯 Overall Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)\")\n", + "print()\n", + "\n", + "# Generate detailed classification report\n", + "print(\"📈 DETAILED CLASSIFICATION REPORT:\")\n", + "print(\"=\" * 60)\n", + "classification_rep = classification_report(y_pho_test, y_pho_pred)\n", + "print(classification_rep)\n", + "print(\"=\" * 60)\n", + "print()\n", + "\n", + "# Explain classification metrics\n", + "print(\"📚 METRICS EXPLANATION:\")\n", + "print(\" • Precision: Of predicted entities, how many were correct?\")\n", + "print(\" • Recall: Of actual entities, how many were found?\")\n", + "print(\" • F1-score: Harmonic mean of precision and recall\")\n", + "print(\" • Support: Number of actual instances of each class\")\n", + "print()\n", + "\n", + "# Cross-validation for robust evaluation\n", + "print(\"Phase 2: Cross-validation for robust performance estimate...\")\n", + "print(\" • Using 5-fold cross-validation\")\n", + "print(\" • This trains 5 different models to avoid overfitting\")\n", + "print()\n", + "\n", + "phobert_cv_scores = cross_val_score(\n", + " phobert_rf_model, X_phobert, y_phobert,\n", + " cv=5, # 5-fold cross-validation\n", + " scoring='f1_weighted', # Weighted F1 score (accounts for class imbalance)\n", + " n_jobs=-1 # Parallel processing\n", + ")\n", + "\n", + "print(\"✅ CROSS-VALIDATION RESULTS:\")\n", + "print(f\" • Individual fold scores: {[f'{score:.4f}' for score in phobert_cv_scores]}\")\n", + "print(f\" • Mean CV F1 score: {phobert_cv_scores.mean():.4f}\")\n", + "print(f\" • Standard deviation: ±{phobert_cv_scores.std():.4f}\")\n", + "print(f\" • 95% confidence interval: {phobert_cv_scores.mean():.4f} ± {phobert_cv_scores.std() * 2:.4f}\")\n", + "print()\n", + "print(\"🏆 PhoBERT Random Forest shows consistent performance across folds!\")" + ] + }, + { + "cell_type": "code", + "execution_count": 
44, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "trr6-fss-aOM", + "outputId": "caf50264-d922-435d-973e-8194c082d431" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(47,)" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "phobert_rf_model.feature_importances_[0:47].shape" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "_9m3ZcifbGeu", + "outputId": "fda2c826-5dc5-4d4a-9be1-10bfe32f95e2" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🔍 Analyzing PhoBERT feature importance...\n", + "This shows which features are most valuable for NER decisions\n", + "\n", + "🏆 TOP 15 MOST IMPORTANT PHOBERT FEATURES:\n", + "======================================================================\n", + " 1. prev_length | 0.0984 | 9.8%\n", + " 2. token_id | 0.0962 | 9.6%\n", + " 3. token_id_mod | 0.0697 | 7.0%\n", + " 4. next_is_title | 0.0621 | 6.2%\n", + " 5. prev_is_lower | 0.0575 | 5.8%\n", + " 6. is_title | 0.0553 | 5.5%\n", + " 7. next_length | 0.0551 | 5.5%\n", + " 8. rel_position | 0.0501 | 5.0%\n", + " 9. token_id_div | 0.0492 | 4.9%\n", + "10. is_beginning | 0.0483 | 4.8%\n", + "11. is_lower | 0.0482 | 4.8%\n", + "12. word_length | 0.0475 | 4.8%\n", + "13. prev_token_id_mod | 0.0440 | 4.4%\n", + "14. has_digits | 0.0428 | 4.3%\n", + "15. 
next_token_id_mod | 0.0243 | 2.4%\n", + "======================================================================\n", + "\n", + "📊 FEATURE CATEGORY ANALYSIS:\n", + " • Character Patterns : 0.3093 (30.9%)\n", + " • PhoBERT Features : 0.2263 (22.6%)\n", + " • Basic Word Features : 0.2116 (21.2%)\n", + " • Next Token : 0.1017 (10.2%)\n", + " • Vietnamese Patterns : 0.0123 (1.2%)\n", + " • Position/Context : 0.0036 (0.4%)\n", + " • Previous Token : 0.0034 (0.3%)\n", + "\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "💡 INSIGHTS FROM FEATURE IMPORTANCE:\n", + " • Most important feature: prev_length\n", + " • PhoBERT token features rank: [np.int64(2), np.int64(3), np.int64(9), np.int64(13), np.int64(15), np.int64(21)]\n", + " • Vietnamese patterns contribute: 1.2% of total importance\n", + " • Context features (prev/next) contribute: 10.5%\n" + ] + } + ], + "source": [ + "# PhoBERT Feature Importance Analysis\n", + "# ===========================================\n", + "\n", + "print(\"🔍 Analyzing PhoBERT feature importance...\")\n", + "print(\"This shows which features are most valuable for NER decisions\")\n", + "print()\n", + "\n", + "# Feature names, in the exact order extract_phobert_token_features appends them (48 total)\n", + "phobert_feature_names = [\n", + " # Basic word features (1-8)\n", + " 'is_title', 'is_lower', 'is_upper', 'is_digit', 'is_alpha', 'word_length',\n", + " 'is_long_word', 'is_short_word',\n", + "\n", + " # PhoBERT tokenizer features (9-12)\n", + " 'token_id', 'token_id_mod', 'token_id_div', 'subtoken_count',\n", + "\n", + " # Vietnamese-specific features (13-30)\n", + " 'starts_Ng', 'starts_Tr', 'starts_Lê', 'starts_Phạm', 'starts_Vũ', 'starts_Phan', 'starts_Trương', 'starts_Bùi', 'starts_Đặng',\n", + " 'ends_nh', 'ends_ại', 'ends_ương', 'has_Thành', 'has_phố', 'has_tỉnh', 'has_quận', 'has_huyện', 'has_xã',\n", + "\n", + " # Position and context features (31-34)\n", + " 'abs_position', 'rel_position', 'is_beginning', 'is_end',\n", + "\n", + " # Previous token context (35-38)\n", + " 'prev_is_title', 'prev_is_lower', 'prev_token_id_mod', 'prev_length',\n", + "\n", + " # Next token context (39-42)\n", + " 'next_is_title', 'next_is_lower', 'next_token_id_mod', 'next_length',\n", + "\n", + " # Character-level features (43-48)\n", + " 'has_digits', 'has_hyphen', 'has_period', 'has_comma', 'underscore_count', 'is_numeric'\n", + "]\n", + "\n", + "# Get
feature importance from trained model\n", + "phobert_importances = phobert_rf_model.feature_importances_ # one value per entry in phobert_feature_names (48)\n", + "\n", + "# Create feature importance DataFrame\n", + "phobert_feature_importance_df = pd.DataFrame({\n", + " 'feature': phobert_feature_names,\n", + " 'importance': phobert_importances,\n", + " 'rank': range(1, len(phobert_feature_names) + 1)\n", + "}).sort_values('importance', ascending=False)\n", + "\n", + "# Reset rank after sorting\n", + "phobert_feature_importance_df['rank'] = range(1, len(phobert_feature_importance_df) + 1)\n", + "\n", + "print(\"🏆 TOP 15 MOST IMPORTANT PHOBERT FEATURES:\")\n", + "print(\"=\" * 70)\n", + "for i, row in phobert_feature_importance_df.head(15).iterrows():\n", + " print(f\"{row['rank']:2d}. {row['feature']:20s} | {row['importance']:.4f} | {row['importance']*100:.1f}%\")\n", + "print(\"=\" * 70)\n", + "print()\n", + "\n", + "# Analyze feature categories (slice bounds follow the group sizes 8/4/18/4/4/4/6)\n", + "print(\"📊 FEATURE CATEGORY ANALYSIS:\")\n", + "category_importance = {\n", + " 'Basic Word Features': phobert_importances[0:8].sum(),\n", + " 'PhoBERT Features': phobert_importances[8:12].sum(),\n", + " 'Vietnamese Patterns': phobert_importances[12:30].sum(),\n", + " 'Position/Context': phobert_importances[30:34].sum(),\n", + " 'Previous Token': phobert_importances[34:38].sum(),\n", + " 'Next Token': phobert_importances[38:42].sum(),\n", + " 'Character Patterns': phobert_importances[42:48].sum()\n", + "}\n", + "\n", + "for category, importance in sorted(category_importance.items(), key=lambda x: x[1], reverse=True):\n", + " print(f\" • {category:20s}: {importance:.4f} ({importance*100:.1f}%)\")\n", + "print()\n", + "\n", + "# Create visualization\n", + "plt.figure(figsize=(14, 10))\n", + "top_features = phobert_feature_importance_df.head(20)\n", + "plt.barh(range(len(top_features)), top_features['importance'], color='skyblue')\n", + "plt.yticks(range(len(top_features)), top_features['feature'])\n", +
"plt.xlabel('Feature Importance')\n", + "plt.title('Top 20 PhoBERT Feature Importance for Random Forest NER\\n(Higher values = More important for NER decisions)')\n", + "plt.gca().invert_yaxis() # Most important at top\n", + "plt.tight_layout()\n", + "plt.grid(axis='x', alpha=0.3)\n", + "plt.show()\n", + "\n", + "print(\"💡 INSIGHTS FROM FEATURE IMPORTANCE:\")\n", + "print(f\" • Most important feature: {phobert_feature_importance_df.iloc[0]['feature']}\")\n", + "print(f\" • PhoBERT token features rank: {list(phobert_feature_importance_df[phobert_feature_importance_df['feature'].str.contains('token')]['rank'].values)}\")\n", + "print(f\" • Vietnamese patterns contribute: {category_importance['Vietnamese Patterns']*100:.1f}% of total importance\")\n", + "print(f\" • Context features (prev/next) contribute: {(category_importance['Previous Token'] + category_importance['Next Token'])*100:.1f}%\")" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "khRcnxRrbHWK", + "outputId": "46df401a-5457-42b6-e3b6-5d86f1060e5e" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ PhoBERT prediction function created\n", + "\n", + "🧪 TESTING PHOBERT PREDICTION FUNCTION:\n", + "============================================================\n", + "\n", + "📝 Test Case 1 (Sentence 5):\n", + "🔮 Predicting NER tags for 28 tokens...\n", + " Tokens: ['Cách' 'đây' 'hai' 'tháng' 'những' 'người' 'thợ' 'phía' 'nam' 'Hải_Vân'\n", + " 'đã' 'về' '\"' 'đích' '\"' 'với' 'chiều' 'dài' 'hầm' 'hơn' '4.800' 'm' '('\n", + " 'chính' 'và' 'phụ' ')' '.']\n", + " True tags: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']\n", + " Predicted: [np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('B-LOC'), 
np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O')]\n", + " Accuracy: 100.00%\n", + " Match: ['✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓']\n", + "\n", + "📝 Test Case 2 (Sentence 10):\n", + "🔮 Predicting NER tags for 36 tokens...\n", + " Tokens: ['Người' 'môi_giới' 'Malaysia' 'đưa' 'chúng_tôi' 'đến' 'thăm' 'chỗ' 'ở'\n", + " 'của' 'LĐ' ',' 'tình_cờ' 'chứng_kiến' 'một' 'sự_việc' 'đau_lòng' 'khi'\n", + " 'hai' 'nhân_viên' 'bảo_vệ' 'khu' 'nhà' 'đang' 'chuẩn_bị' 'đạp' 'tung'\n", + " 'một' 'cánh' 'cửa' 'phòng' 'của' 'nữ' 'LĐ' 'VN' '.']\n", + " True tags: ['O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O']\n", + " Predicted: [np.str_('O'), np.str_('O'), np.str_('B-LOC'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('B-LOC'), np.str_('O')]\n", + " Accuracy: 100.00%\n", + " Match: ['✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓']\n", + "\n", + "📝 Test Case 3 (Sentence 15):\n", + "🔮 Predicting NER tags for 25 tokens...\n", + " Tokens: ['Đằng' 'sau' 'những' 'khát_vọng' 'đổi_đời' 'ấy' 'là' 'những' 'người'\n", + " 'cha' ',' 'người' 'mẹ' 
'và' 'cả' 'những' 'người' 'bạn' 'cùng' 'lớp'\n", + " 'âm_thầm' 'thắp' 'lửa' 'yêu_thương' '.']\n", + " True tags: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']\n", + " Predicted: [np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O')]\n", + " Accuracy: 100.00%\n", + " Match: ['✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓']\n", + "\n", + "📝 Test Case 4 (Sentence 25):\n", + "🔮 Predicting NER tags for 38 tokens...\n", + " Tokens: ['Trong' 'khi' 'đó' 'một' 'nhóm' 'ngư_dân' 'Indonesia' 'đang' 'túm_tụm'\n", + " 'trên' 'phà' 'xem' 'báo' ',' 'họ' 'bàn_tán' 'ghê' 'lắm' 'về' 'thông_tin'\n", + " 'Bộ' 'Quốc_phòng' 'Indonesia' 'đang' 'đặt' 'mua' '22' 'tàu_chiến' 'loại'\n", + " 'hiện_đại' 'nhất' 'để' 'tăng_cường' 'tuần_tra' 'trên' 'vùng' 'eo_biển'\n", + " '.']\n", + " True tags: ['O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']\n", + " Predicted: [np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('B-LOC'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('B-ORG'), np.str_('I-ORG'), np.str_('I-ORG'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), np.str_('O'), 
np.str_('O')]\n", + " Accuracy: 100.00%\n", + " Match: ['✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓', '✓']\n", + "\n", + "============================================================\n", + "💡 PREDICTION INSIGHTS:\n", + " • Each token gets 40 PhoBERT-enhanced features\n", + " • Model considers context (previous/next tokens)\n", + " • Vietnamese-specific patterns help with person/place names\n", + " • PhoBERT tokenization provides semantic understanding\n" + ] + } + ], + "source": [ + "# CELL 7: PhoBERT Random Forest Prediction Function\n", + "# ================================================\n", + "\n", + "def predict_ner_with_phobert_rf(sentence_tokens, model, tokenizer):\n", + " \"\"\"\n", + " PREDICTION FUNCTION: Predict NER tags for new sentences\n", + "\n", + " Process:\n", + " 1. For each token in the sentence, extract PhoBERT-enhanced features\n", + " 2. Use the same feature extraction as training (40 features)\n", + " 3. Apply the trained Random Forest model\n", + " 4. 
Return predicted NER tags\n", + "\n", + " Input:\n", + " - sentence_tokens: List of tokens [\"Nguyễn\", \"Văn\", \"A\", \"sống\", \"ở\", \"Hà\", \"Nội\"]\n", + " - model: Trained PhoBERT Random Forest model\n", + " - tokenizer: PhoBERT tokenizer\n", + "\n", + " Output:\n", + " - List of predicted NER tags [\"B-PER\", \"I-PER\", \"I-PER\", \"O\", \"O\", \"B-LOC\", \"I-LOC\"]\n", + " \"\"\"\n", + " sentence_length = len(sentence_tokens)\n", + " predictions = []\n", + "\n", + " print(f\"🔮 Predicting NER tags for {sentence_length} tokens...\")\n", + "\n", + " for pos, token in enumerate(sentence_tokens):\n", + " # Get context tokens (previous and next)\n", + " prev_token = sentence_tokens[pos-1] if pos > 0 else None\n", + " next_token = sentence_tokens[pos+1] if pos < len(sentence_tokens)-1 else None\n", + "\n", + " # Get PhoBERT token ID for this token\n", + " token_ids = tokenizer.encode(token, add_special_tokens=False)\n", + " main_token_id = token_ids[0] if token_ids else 0\n", + "\n", + " # Extract the same 40 features used in training\n", + " word_features = extract_phobert_token_features(\n", + " token=token,\n", + " label=None, # We don't know the true label when predicting\n", + " token_id=main_token_id,\n", + " position=pos,\n", + " sentence_length=sentence_length,\n", + " prev_token=prev_token,\n", + " next_token=next_token,\n", + " tokenizer=tokenizer\n", + " )\n", + "\n", + " # Make prediction using trained model\n", + " # model.predict expects 2D array, so we wrap in list: [word_features]\n", + " pred_tag = model.predict([word_features])[0]\n", + " predictions.append(pred_tag)\n", + "\n", + " return predictions\n", + "\n", + "print(\"✅ PhoBERT prediction function created\")\n", + "print()\n", + "\n", + "# Test the prediction function with multiple samples\n", + "print(\"🧪 TESTING PHOBERT PREDICTION FUNCTION:\")\n", + "print(\"=\" * 60)\n", + "\n", + "# Test with different sample indices to show variety\n", + "test_indices = [5, 10, 15, 25]\n", + "\n", + 
"for i, sample_idx in enumerate(test_indices):\n", + " print(f\"\\n📝 Test Case {i+1} (Sentence {sample_idx}):\")\n", + "\n", + " sample_tokens = df.iloc[sample_idx]['tokens']\n", + " sample_true_tags = df.iloc[sample_idx]['ner_labels']\n", + "\n", + " # Make prediction\n", + " sample_phobert_pred = predict_ner_with_phobert_rf(sample_tokens, phobert_rf_model, tokenizer)\n", + "\n", + " print(f\" Tokens: {sample_tokens}\")\n", + " print(f\" True tags: {sample_true_tags}\")\n", + " print(f\" Predicted: {sample_phobert_pred}\")\n", + "\n", + " # Calculate accuracy for this sample\n", + " sample_accuracy = sum(t == p for t, p in zip(sample_true_tags, sample_phobert_pred)) / len(sample_true_tags)\n", + " print(f\" Accuracy: {sample_accuracy:.2%}\")\n", + "\n", + " # Show which predictions were correct/incorrect\n", + " correct_predictions = [\"✓\" if t == p else \"✗\" for t, p in zip(sample_true_tags, sample_phobert_pred)]\n", + " print(f\" Match: {correct_predictions}\")\n", + "\n", + "print(\"\\n\" + \"=\" * 60)\n", + "print(\"💡 PREDICTION INSIGHTS:\")\n", + "print(\" • Each token gets 40 PhoBERT-enhanced features\")\n", + "print(\" • Model considers context (previous/next tokens)\")\n", + "print(\" • Vietnamese-specific patterns help with person/place names\")\n", + "print(\" • PhoBERT tokenization provides semantic understanding\")" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": { + "id": "KeVUuXHkbIg1" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "accelerator": "TPU", + "colab": { + "gpuType": "V28", + "machine_shape": "hm", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "02efe4dc0e34428f87d00571063a2342": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": 
"@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "09d63d9c6e3b4befb2d13ed56eb1fecf": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + 
"max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0b42f930dcd949ee9df45dc2cd860214": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "132e7ba656334ec59ac72679c126bf27": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + 
"box_style": "", + "children": [ + "IPY_MODEL_bb767d0faca34e619107546b439b607f", + "IPY_MODEL_1b9c56a6995a40d59f64bccc7f679e07", + "IPY_MODEL_3c6d843da9bf44d3b971c4443c44a255" + ], + "layout": "IPY_MODEL_220afa91ce9f4d67934243437259faa6" + } + }, + "16a99b74cefa4f0e870e93776897feca": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_539f3fb7d9764596b3496609617e9df4", + "placeholder": "​", + "style": "IPY_MODEL_70035b06e7c14e54ba0a06fb59c66546", + "value": " 895k/895k [00:00<00:00, 14.4MB/s]" + } + }, + "19382d4c6ac74aa190a891fa855a8fdd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_84087d814eb9493693cbba7e84db4a62", + "IPY_MODEL_32bcd44ab58e4349b66ccb144b7a5eb1", + "IPY_MODEL_a9b4d1e29f7645fd90a86c407bb97eb9" + ], + "layout": "IPY_MODEL_858e486c77e94ef0a0720e4d204e10b9" + } + }, + "1b9c56a6995a40d59f64bccc7f679e07": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + 
"_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_62cb290b834a4378b43fd8a5424d3557", + "max": 678, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_265686ab8b4e4c66bf419f6391cad15e", + "value": 678 + } + }, + "220afa91ce9f4d67934243437259faa6": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "222c29ec297f400ea78c758563378f8a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + 
"_view_name": "StyleView", + "description_width": "" + } + }, + "265686ab8b4e4c66bf419f6391cad15e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "2b11610bd8d94a81863e99debe1e6905": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3116450c8884414ab14124212ba0b3ef": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": 
"@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "31566827dfab451db0ca2a06458efaa3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_2b11610bd8d94a81863e99debe1e6905", + "placeholder": "​", + "style": "IPY_MODEL_71c48602225b41449f5b5cbc53d519dc", + "value": "bpe.codes: 100%" + } + }, + "32bcd44ab58e4349b66ccb144b7a5eb1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_fc5c675a5c0a4058a683318491f6e6da", + "max": 540322347, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_4958e837d8664a298f32b7e88825fbf7", + "value": 540322347 + } + }, + "35144d338b1249278eef607e9f84c57a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + 
"_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7a39143f21d2438e80e32a7f0ac407a4", + "placeholder": "​", + "style": "IPY_MODEL_3dcbe597c2af44ab8672a58304317ff9", + "value": " 540M/540M [00:13<00:00, 49.9MB/s]" + } + }, + "3c6d843da9bf44d3b971c4443c44a255": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_02efe4dc0e34428f87d00571063a2342", + "placeholder": "​", + "style": "IPY_MODEL_3f65300df13246168e409be8852ac5ea", + "value": " 678/678 [00:00<00:00, 74.7kB/s]" + } + }, + "3d808cb829044c01b8cbdca369a16e3a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "3dcbe597c2af44ab8672a58304317ff9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + 
"3f65300df13246168e409be8852ac5ea": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "4958e837d8664a298f32b7e88825fbf7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "539f3fb7d9764596b3496609617e9df4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + 
"object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "53dbc1a7391648d0a033644c2a97b4c2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "583d9194818b452c8f8531005a5802ba": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7a1d4efa4f394010b9e8f6b72f457f0c", + "placeholder": "​", + "style": "IPY_MODEL_aa5355a04ee34ba99567184381985f82", + "value": " 3.13M/3.13M [00:00<00:00, 38.1MB/s]" + } + }, + "5958c45737064862858bf130b105ec08": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "5a809d0a194c4e04a3a3f99ca4d98b5c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + 
"state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b73314b6559c492f851262eba512aa3c", + "max": 540281612, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_53dbc1a7391648d0a033644c2a97b4c2", + "value": 540281612 + } + }, + "60443f69792b4726b8cf2a038a241022": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_dc92fe258486439394f2841363d4286b", + "IPY_MODEL_bc2cff635fb549549afb10cd81ec252a", + "IPY_MODEL_583d9194818b452c8f8531005a5802ba" + ], + "layout": "IPY_MODEL_8fa670c708c34020b46f4db42341779b" + } + }, + "6063b69b0af744d9b31dc9d20c029c99": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + 
"grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "62cb290b834a4378b43fd8a5424d3557": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6954c314e1e844e49407fb2236ea76a9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + 
"_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_31566827dfab451db0ca2a06458efaa3", + "IPY_MODEL_f8ef07a5c6fd4bca98efdf435d883f4c", + "IPY_MODEL_c67c2df8fc294b9b8fbf0055dd2ded6c" + ], + "layout": "IPY_MODEL_f4308d3a141e454cb2c1602cba4a06aa" + } + }, + "6b42472a431141e182c422381ec84106": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "70035b06e7c14e54ba0a06fb59c66546": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + 
"_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "71c48602225b41449f5b5cbc53d519dc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "7a1d4efa4f394010b9e8f6b72f457f0c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7a39143f21d2438e80e32a7f0ac407a4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + 
"model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7d81164902f44d289148260c2584782e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d32f4b409484471891cf68e295afc1e5", + "placeholder": "​", + "style": "IPY_MODEL_8b865a52c0c1492283a00e7fef9a2697", + "value": "model.safetensors: 100%" + } + }, + "82fe50de6b0945198fae9f532324e7fd": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + 
"_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "83930005bf7443b08835dae0ceb66457": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "84087d814eb9493693cbba7e84db4a62": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": 
null, + "layout": "IPY_MODEL_09d63d9c6e3b4befb2d13ed56eb1fecf", + "placeholder": "​", + "style": "IPY_MODEL_d5d0d780b2464843a1d380f0a4c0fbbc", + "value": "pytorch_model.bin: 100%" + } + }, + "858e486c77e94ef0a0720e4d204e10b9": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8b0f3136fecc4965a76df8c8dee661ce": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": 
"IPY_MODEL_f4f3815411b24674bfb51ab242832999", + "max": 895321, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_83930005bf7443b08835dae0ceb66457", + "value": 895321 + } + }, + "8b865a52c0c1492283a00e7fef9a2697": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "8d018b3f1f2e4ac699f5051019b9af74": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_7d81164902f44d289148260c2584782e", + "IPY_MODEL_5a809d0a194c4e04a3a3f99ca4d98b5c", + "IPY_MODEL_35144d338b1249278eef607e9f84c57a" + ], + "layout": "IPY_MODEL_b283351b3c9f4b1fb9801e2bafcc3ac7" + } + }, + "8ef031c9d4b84a62a20b47d338a3dc2d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "8fa670c708c34020b46f4db42341779b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": 
"@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a7a6d4796d0349b99abb511f3ff9823b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + 
"max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a9b4d1e29f7645fd90a86c407bb97eb9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_82fe50de6b0945198fae9f532324e7fd", + "placeholder": "​", + "style": "IPY_MODEL_222c29ec297f400ea78c758563378f8a", + "value": " 540M/540M [00:01<00:00, 290MB/s]" + } + }, + "aa5355a04ee34ba99567184381985f82": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "b283351b3c9f4b1fb9801e2bafcc3ac7": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": 
null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b73314b6559c492f851262eba512aa3c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "bb767d0faca34e619107546b439b607f": { + 
"model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6b42472a431141e182c422381ec84106", + "placeholder": "​", + "style": "IPY_MODEL_c9e12698a123442093c2bd71d7c5775a", + "value": "config.json: 100%" + } + }, + "bc2cff635fb549549afb10cd81ec252a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e29a696703cc4f87a12fc6e98f2cbcf8", + "max": 3132320, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_3116450c8884414ab14124212ba0b3ef", + "value": 3132320 + } + }, + "c67c2df8fc294b9b8fbf0055dd2ded6c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6063b69b0af744d9b31dc9d20c029c99", + "placeholder": "​", + "style": "IPY_MODEL_3d808cb829044c01b8cbdca369a16e3a", + "value": " 
1.14M/1.14M [00:00<00:00, 20.9MB/s]" + } + }, + "c9e12698a123442093c2bd71d7c5775a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "cc286fe8ebb64f03acf68fc630cedfe0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_fec4bac1a3344aad9f168be02dba98f2", + "placeholder": "​", + "style": "IPY_MODEL_5958c45737064862858bf130b105ec08", + "value": "vocab.txt: 100%" + } + }, + "d32f4b409484471891cf68e295afc1e5": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, 
+ "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d5d0d780b2464843a1d380f0a4c0fbbc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "dc92fe258486439394f2841363d4286b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a7a6d4796d0349b99abb511f3ff9823b", + "placeholder": "​", + "style": "IPY_MODEL_8ef031c9d4b84a62a20b47d338a3dc2d", + "value": "tokenizer.json: 100%" + } + }, + "e13525732c2f4d8cad00e52b80f7e2f4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": 
"", + "children": [ + "IPY_MODEL_cc286fe8ebb64f03acf68fc630cedfe0", + "IPY_MODEL_8b0f3136fecc4965a76df8c8dee661ce", + "IPY_MODEL_16a99b74cefa4f0e870e93776897feca" + ], + "layout": "IPY_MODEL_0b42f930dcd949ee9df45dc2cd860214" + } + }, + "e29a696703cc4f87a12fc6e98f2cbcf8": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "eb355ce836b34a9da087fee621132e94": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "ed4e3e35f657476c82ff20a038812243": { 
+ "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f4308d3a141e454cb2c1602cba4a06aa": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, 
+ "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f4f3815411b24674bfb51ab242832999": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f8ef07a5c6fd4bca98efdf435d883f4c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": 
"FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ed4e3e35f657476c82ff20a038812243", + "max": 1135173, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_eb355ce836b34a9da087fee621132e94", + "value": 1135173 + } + }, + "fc5c675a5c0a4058a683318491f6e6da": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "fec4bac1a3344aad9f168be02dba98f2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/notebooks/Softmax_PhoBERT.ipynb b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/notebooks/Softmax_PhoBERT.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..cb628b986a3ff0c5e805b834cceb3a3124524091 --- /dev/null +++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/notebooks/Softmax_PhoBERT.ipynb @@ -0,0 +1,5063 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 191 + }, + "id": "yRrmkevlCjXr", + "outputId": "b0abb114-925d-4ebf-f9ab-1abe0ce61723" + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "application/javascript": [ + "\n", + " window._wandbApiKey = 
new Promise((resolve, reject) => {\n", + " function loadScript(url) {\n", + " return new Promise(function(resolve, reject) {\n", + " let newScript = document.createElement(\"script\");\n", + " newScript.onerror = reject;\n", + " newScript.onload = resolve;\n", + " document.body.appendChild(newScript);\n", + " newScript.src = url;\n", + " });\n", + " }\n", + " loadScript(\"https://cdn.jsdelivr.net/npm/postmate/build/postmate.min.js\").then(() => {\n", + " const iframe = document.createElement('iframe')\n", + " iframe.style.cssText = \"width:0;height:0;border:none\"\n", + " document.body.appendChild(iframe)\n", + " const handshake = new Postmate({\n", + " container: iframe,\n", + " url: 'https://wandb.ai/authorize'\n", + " });\n", + " const timeout = setTimeout(() => reject(\"Couldn't auto authenticate\"), 5000)\n", + " handshake.then(function(child) {\n", + " child.on('authorize', data => {\n", + " clearTimeout(timeout)\n", + " resolve(data)\n", + " });\n", + " });\n", + " })\n", + " });\n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\u001b[34m\u001b[1mwandb\u001b[0m: Logging into wandb.ai. 
(Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: You can find your API key in your browser here: https://wandb.ai/authorize\n", + "wandb: Paste an API key from your profile and hit enter:" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " ··········\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m If you're specifying your api key in code, ensure this code is not shared publicly.\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m Consider setting the WANDB_API_KEY environment variable, or running `wandb login` from the command line.\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: No netrc file found, creating one.\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mlenguyenquocanh-vn-fptu\u001b[0m (\u001b[33mlenguyenquocanh-vn-fptu-fpt-university\u001b[0m) to \u001b[32mhttps://api.wandb.ai\u001b[0m. 
Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "True" + ] + }, + "metadata": {}, + "execution_count": 1 + } + ], + "source": [ + "import wandb\n", + "wandb.login()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YY74yDYXID_a" + }, + "source": [ + "# Data Preparation" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "66m2J73nGXEV", + "outputId": "24173ca4-38fe-4f9e-f9a8-b39bdfdabe72" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning: \n", + "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", + "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", + "You will be able to reuse this secret in all of your notebooks.\n", + "Please note that authentication is recommended but still optional to access public models or datasets.\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "splits = {'train': 'data/train-00000-of-00001-b0417886a268b83a.parquet', 'valid': 'data/valid-00000-of-00001-846411c236133ba3.parquet'}\n", + "df_train = pd.read_parquet(\"hf://datasets/datnth1709/VLSP2016-NER-data/\" + splits[\"train\"])\n", + "df_valid = pd.read_parquet(\"hf://datasets/datnth1709/VLSP2016-NER-data/\" + splits[\"valid\"])\n", + "df = pd.concat([df_train, df_valid]).reset_index(drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 424 + }, + "id": "U81OmhBeGmMM", + "outputId": "f6c51bb0-3b7b-4029-e1fc-ae8d9a75ef87" + }, + "outputs": [ + { + "output_type": "execute_result", 
+ "data": { + "text/plain": [ + " tokens \\\n", + "0 [Không_khí, thật, náo_nhiệt, .] \n", + "1 [Chị, Lãnh, và, Xăng, ra, đi, ,, mình, đứng, n... \n", + "2 [Suy_tính, mãi, ,, khóc, mãi, rồi, Phúc, lấy, ... \n", + "3 [Hoà, bảo, hồi, mới, qua, đâu, có, biết, nấu_n... \n", + "4 [Nhật_ký, của, thuyền_viên, .] \n", + "... ... \n", + "16853 [Nghe, thấy, đã, ghê_ghê, nhưng, Nhiêu, chưa, ... \n", + "16854 [Nhưng, mọi, chuyện, không, dừng, ở, đó, .] \n", + "16855 [Hoà, bảo, thời_gian, đầu, mặc_cảm, lắm, ,, ở,... \n", + "16856 [Biết_bao, người, đã, tình_nguyện, hiến_dâng, ... \n", + "16857 [Trên, đây, mới, là, “, thành_tích, ”, tiêu, t... \n", + "\n", + " id \\\n", + "0 [0, 0, 0, 0] \n", + "1 [0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "2 [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "3 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, ... \n", + "4 [0, 0, 0, 0] \n", + "... ... \n", + "16853 [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, ... \n", + "16854 [0, 0, 0, 0, 0, 0, 0, 0] \n", + "16855 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "16856 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] \n", + "16857 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", + "\n", + " seg_text \\\n", + "0 Không_khí thật náo_nhiệt . \n", + "1 Chị Lãnh và Xăng ra đi , mình đứng nhìn hai ch... \n", + "2 Suy_tính mãi , khóc mãi rồi Phúc lấy ra tờ giấ... \n", + "3 Hoà bảo hồi mới qua đâu có biết nấu_nướng gì ,... \n", + "4 Nhật_ký của thuyền_viên . \n", + "... ... \n", + "16853 Nghe thấy đã ghê_ghê nhưng Nhiêu chưa được tườ... \n", + "16854 Nhưng mọi chuyện không dừng ở đó . \n", + "16855 Hoà bảo thời_gian đầu mặc_cảm lắm , ở trong nh... \n", + "16856 Biết_bao người đã tình_nguyện hiến_dâng cả cuộ... \n", + "16857 Trên đây mới là “ thành_tích ” tiêu tiền của m... \n", + "\n", + " raw_text \\\n", + "0 Không khí thật náo nhiệt . \n", + "1 Chị Lãnh và Xăng ra đi , mình đứng nhìn hai ch... \n", + "2 Suy tính mãi , khóc mãi rồi Phúc lấy ra tờ giấ... 
\n", + "3 Hoà bảo hồi mới qua đâu có biết nấu nướng gì ,... \n", + "4 Nhật ký của thuyền viên . \n", + "... ... \n", + "16853 Nghe thấy đã ghê ghê nhưng Nhiêu chưa được tườ... \n", + "16854 Nhưng mọi chuyện không dừng ở đó . \n", + "16855 Hoà bảo thời gian đầu mặc cảm lắm , ở trong nh... \n", + "16856 Biết bao người đã tình nguyện hiến dâng cả cuộ... \n", + "16857 Trên đây mới là “ thành tích ” tiêu tiền của m... \n", + "\n", + " labels \n", + "0 [O, O, O, O] \n", + "1 [O, B-PER, O, B-PER, O, O, O, O, O, O, O, O, O... \n", + "2 [O, O, O, O, O, O, B-PER, O, O, O, O, O, O, O,... \n", + "3 [B-PER, O, O, O, O, O, O, O, O, O, O, O, O, B-... \n", + "4 [O, O, O, O] \n", + "... ... \n", + "16853 [O, O, O, O, O, B-PER, O, O, O, O, O, O, O, O,... \n", + "16854 [O, O, O, O, O, O, O, O] \n", + "16855 [B-PER, O, O, O, O, O, O, O, O, O, O, O, O, O,... \n", + "16856 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O] \n", + "16857 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ... \n", + "\n", + "[16858 rows x 5 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
tokensidseg_textraw_textlabels
0[Không_khí, thật, náo_nhiệt, .][0, 0, 0, 0]Không_khí thật náo_nhiệt .Không khí thật náo nhiệt .[O, O, O, O]
1[Chị, Lãnh, và, Xăng, ra, đi, ,, mình, đứng, n...[0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...Chị Lãnh và Xăng ra đi , mình đứng nhìn hai ch...Chị Lãnh và Xăng ra đi , mình đứng nhìn hai ch...[O, B-PER, O, B-PER, O, O, O, O, O, O, O, O, O...
2[Suy_tính, mãi, ,, khóc, mãi, rồi, Phúc, lấy, ...[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...Suy_tính mãi , khóc mãi rồi Phúc lấy ra tờ giấ...Suy tính mãi , khóc mãi rồi Phúc lấy ra tờ giấ...[O, O, O, O, O, O, B-PER, O, O, O, O, O, O, O,...
3[Hoà, bảo, hồi, mới, qua, đâu, có, biết, nấu_n...[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, ...Hoà bảo hồi mới qua đâu có biết nấu_nướng gì ,...Hoà bảo hồi mới qua đâu có biết nấu nướng gì ,...[B-PER, O, O, O, O, O, O, O, O, O, O, O, O, B-...
4[Nhật_ký, của, thuyền_viên, .][0, 0, 0, 0]Nhật_ký của thuyền_viên .Nhật ký của thuyền viên .[O, O, O, O]
..................
16853[Nghe, thấy, đã, ghê_ghê, nhưng, Nhiêu, chưa, ...[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...Nghe thấy đã ghê_ghê nhưng Nhiêu chưa được tườ...Nghe thấy đã ghê ghê nhưng Nhiêu chưa được tườ...[O, O, O, O, O, B-PER, O, O, O, O, O, O, O, O,...
16854[Nhưng, mọi, chuyện, không, dừng, ở, đó, .][0, 0, 0, 0, 0, 0, 0, 0]Nhưng mọi chuyện không dừng ở đó .Nhưng mọi chuyện không dừng ở đó .[O, O, O, O, O, O, O, O]
16855[Hoà, bảo, thời_gian, đầu, mặc_cảm, lắm, ,, ở,...[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...Hoà bảo thời_gian đầu mặc_cảm lắm , ở trong nh...Hoà bảo thời gian đầu mặc cảm lắm , ở trong nh...[B-PER, O, O, O, O, O, O, O, O, O, O, O, O, O,...
16856[Biết_bao, người, đã, tình_nguyện, hiến_dâng, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]Biết_bao người đã tình_nguyện hiến_dâng cả cuộ...Biết bao người đã tình nguyện hiến dâng cả cuộ...[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]
16857[Trên, đây, mới, là, “, thành_tích, ”, tiêu, t...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...Trên đây mới là “ thành_tích ” tiêu tiền của m...Trên đây mới là “ thành tích ” tiêu tiền của m...[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
\n", + "

16858 rows × 5 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "df", + "summary": "{\n \"name\": \"df\",\n \"rows\": 16858,\n \"fields\": [\n {\n \"column\": \"tokens\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"id\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"seg_text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 16787,\n \"samples\": [\n \"T\\u00ednh th\\u00f4ng_minh , l\\u1ea1i t\\u00f2_m\\u00f2 , anh Ki\\u1ec7m b\\u1eaft_\\u0111\\u1ea7u \\u0111i \\u0111\\u1ebfn c\\u00e1c x\\u01b0\\u1edfng c\\u01a1_kh\\u00ed \\u0111\\u1ec3 quan_s\\u00e1t c\\u00e1c lo\\u1ea1i m\\u00e1y_m\\u00f3c , r\\u1ed3i v\\u1ec1 nh\\u00e0 suy_ngh\\u0129 v\\u00e0 c\\u1ea7m b\\u00fat v\\u1ebd ph\\u00e1c_ho\\u1ea1 ra c\\u00e1i m\\u00e1y v\\u00fat g\\u1ea1o .\",\n \"V\\u1eady th\\u00ec , h\\u1ecd c\\u1ea7n ph\\u1ea3i \\u0111\\u01b0\\u1ee3c gi\\u00fap_\\u0111\\u1ee1 , ph\\u1ea3i \\u0111\\u01b0\\u1ee3c s\\u1ed1ng \\u0111\\u00e0ng_ho\\u00e0ng , ph\\u1ea3i \\u0111\\u01b0\\u1ee3c l\\u00e0m ng\\u01b0\\u1eddi d\\u00f9 ch\\u1ec9 l\\u00e0 nh\\u1eefng ng\\u00e0y cu\\u1ed1i_c\\u00f9ng .\",\n \"Nhi\\u1ec1u ng\\u01b0\\u1eddi th\\u00f4ng_d\\u1ecbch c\\u00f9ng th\\u1eddi v\\u1edbi Nguy\\u1ec5n Trung Hi\\u1ebfu c\\u0169ng \\u0111\\u00e3 ch\\u1ebft trong khi th\\u1ef1c_hi\\u1ec7n nhi\\u1ec7m_v\\u1ee5 t\\u1ea1i chi\\u1ebfn_tr\\u01b0\\u1eddng ho\\u1eb7c tr\\u00ean \\u0111\\u01b0\\u1eddng h\\u00e0nh_qu\\u00e2n .\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"raw_text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 16785,\n \"samples\": [\n \"Trong kho\\u1ea3ng th\\u1eddi gian \\u0111\\u00f3 ch\\u1ecb c\\u1ed1 c\\u00f4ng t\\u1ef1 h\\u1ecdc ti\\u1ebfng Anh .\",\n \"Sau \\u0111\\u00f3 , ch\\u00ednh b\\u00e0 Susan 
\\u0111\\u00e3 \\u0111\\u01b0a Mai l\\u00ean h\\u1ecdc \\u0111\\u1ea1i h\\u1ecdc , m\\u1ed7i n\\u0103m chu c\\u1ea5p cho c\\u00f4 30.000 USD .\",\n \"T\\u1eeb r\\u1ea5t l\\u00e2u r\\u1ed3i t\\u00f4i v\\u1eabn ngh\\u0129 n\\u1ebfu nh\\u01b0 cu\\u1ed1n s\\u00e1ch \\u0111\\u01b0\\u1ee3c xu\\u1ea5t b\\u1ea3n , ho\\u1eb7c ng\\u01b0\\u1eddi ta l\\u00e0m phim v\\u1ec1 n\\u00f3 th\\u00ec t\\u00f4i s\\u1ebd d\\u00f9ng s\\u1ed1 ti\\u1ec1n b\\u00e1n s\\u00e1ch \\u0111\\u1ec3 thi\\u1ebft l\\u1eadp m\\u1ed9t s\\u1ed1 gi\\u01b0\\u1eddng b\\u1ec7nh t\\u1ea1i H\\u00e0 N\\u1ed9i .\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"labels\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 3 + } + ], + "source": [ + "# Tạo thêm các cột khác\n", + "def join_tokens(tokens):\n", + " text = ' '.join(tokens)\n", + " return text\n", + "\n", + "def reform_raw_text(tokens):\n", + " text = ' '.join(tokens)\n", + " return text.replace(\"_\", \" \")\n", + "\n", + "def label(x):\n", + " return [id_tag[int(i)] for i in x]\n", + "\n", + "def replace_7_8(lst):\n", + " return [0 if x in (7, 8) else x for x in lst]\n", + "\n", + "\n", + "tag_id = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6}\n", + "id_tag = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}\n", + "\n", + "\n", + "df['ner_tags'] = df['ner_tags'].apply(replace_7_8)\n", + "df['text_withseg'] = df['tokens'].apply(join_tokens)\n", + "df['text_raw'] = df['tokens'].apply(reform_raw_text)\n", + "df[\"ner_labels\"] = df.ner_tags.apply(label)\n", + "df.columns = ['tokens', 'id', 'seg_text', 'raw_text', 'labels']\n", + "df\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Sgpm-btsXxzt", + "outputId": "4275e090-0bb5-47a2-9b51-682d13bd7e45" + 
}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Chị Lãnh và Xăng ra đi , mình đứng nhìn hai chị quần xắn tròn trên vế , lặn_lội qua dòng suối nước chảy rần_rần , tự_nhiên nước_mắt mình rưng_rưng ...\n" + ] + } + ], + "source": [ + "print(df['seg_text'][1])" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "I_VaVTLfXxzu", + "outputId": "f5a568e0-0235-40b5-9a87-9df21b39af44" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Chị Lãnh và Xăng ra đi , mình đứng nhìn hai chị quần xắn tròn trên vế , lặn lội qua dòng suối nước chảy rần rần , tự nhiên nước mắt mình rưng rưng ...\n" + ] + } + ], + "source": [ + "print(df['raw_text'][1])" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Zum7uCLSXxzu", + "outputId": "44b2d50f-fec1-42c5-fd52-854d510ba13d" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[np.int64(0), np.int64(1), np.int64(0), np.int64(1), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0)]\n" + ] + } + ], + "source": [ + "print(df['id'][1])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ooewb479FdqS" + }, + "source": [ + "# Get Embedding Vectors" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 889, + "referenced_widgets": [ + "1d9aaa035056485c959f66b60cf41714", + "71054a4397e344a2a66e32892a37b59b", + "6c582399be1d42c3b5a4ef21743d1a26", + 
"26a6ef7f967a4504a698ff3152ccb24e", + "82669dd27686486588fdd7d11f49edd3", + "461c136b3eac4a9dadb8a3af7c11c98a", + "4fb68dcea1ac4e54b82c92ad64e9be95", + "df397db853874f6db911acb667785ec0", + "2de5befe0ab24de9a62ba076e5abf78e", + "d7da38e7c5e1484597bb1faae3c2d7f5", + "2ec3573cb04143a8ba5e555bfaf60165", + "67c557f2651b4e5c81e9af82531898fe", + "5a5a04069ebd41fdba7835e1b5da585e", + "6c5b30dc67d94071af4d4b14cf4be7e4", + "e9200c8269fa4ebd9c1157cc1b871005", + "50ab44e431a54c27b409dc74c068c392", + "9929425b4df94d3792dc454afe59b3fd", + "03efbd25cd4341cea6714ffc3585632a", + "ae22d9a4de574ce3905a6f6c82fac1aa", + "71fe669e2f68444ba4b81ffc14a39c03", + "4d4c131206f448c7ba6d5c4e41126d41", + "ec260d8279bd4a40ad2bbc3c1d7dadaa", + "08611ca37f8c462db079dc2883f06002", + "47eca14208cc4e5085d44cce42872a35", + "49d3e88f60e744e7b37bcbcca5bbb087", + "81b9bb3315e4402ea5b6768d0d189591", + "4b88329de5ed48738ca0da054a1f0131", + "1507e771a5ca4056b0605cd453d89c60", + "44946e0a5d31408a851b0e8ab5217c43", + "ee8c999b66e84cbda17702e916d48a3f", + "6c6228d416944599b110ffa97b20bd8c", + "c519c27334b742ee8f14e29da2ebdf9b", + "7d422740d70546559703cd0304be663f", + "371ab2b9d7c84402b3c4b934e89eca4b", + "7ead683f167c408f88ba72b2ee1599d1", + "737d3d1e5a3146de96c17ca8ec72d75e", + "7ec9beb535c9428a954367613fa7f4cd", + "3c51b3c0b7ba4a2eacb1b8b2be8e024c", + "50dc5fc6a7354394ace536241fa01714", + "78ba04a44d9e42dba9fb7617d28c91e9", + "ca87704af3bc4c7590eea8f8f0f50d94", + "f3870bce67da4affa8925d9d898638da", + "e6efac23366643dd861caf121a8a220b", + "1044cf40d7e54337859bab0057aa0b54", + "3a6bb86ac3db4f82ae139507f94607e0", + "04a96e9ef4774bd1a3cd3a1dd20fe194", + "6158b1f05f6c4851b492df312f0312ca", + "b81c4146511045e280ba4fa226074679", + "3676c8217c654790972189be8c1f4627", + "762b2c8d23824070b2eb115e151f0c73", + "d9d57b09b01846a4805882b4adf64e55", + "e804cb9279dd458aa8b661d28c4427ff", + "3169169dbabb4b1aa7906a0415eacdcb", + "408130d71bb74141906cbc1d2123bb63", + "4be265bd67e1470cbd856dd268908c00" + ] + }, + "id": 
"b04c2Xq7IBac", + "outputId": "b09c7e51-baad-4dde-fea2-9d380a5988d9" + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "config.json: 0%| | 0.00/557 [00:00\", \"\"]:\n", + " continue\n", + " #Token là subword (có đuôi \"@@\")\n", + " if token.endswith(\"@@\"):\n", + " current_vecs.append(emb)\n", + " else: #Token là phần cuối của một từ (không có \"@@\")\n", + " current_vecs.append(emb)\n", + " word_emb = torch.mean(torch.stack(current_vecs), dim=0)\n", + " word_embeddings.append(word_emb)\n", + " current_vecs = []\n", + "\n", + " if current_vecs: # Trong trường hợp sót lại cuối câu\n", + " word_emb = torch.mean(torch.stack(current_vecs), dim=0)\n", + " word_embeddings.append(word_emb)\n", + "\n", + " return word_embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "FSAhQKN1Xxzw", + "outputId": "ddd0edd3-145e-4966-b78c-2f66f83bfd14" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Không khí thật náo nhiệt .\n" + ] + } + ], + "source": [ + "raw_e = df['raw_text'][0]\n", + "print(raw_e)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "OJ7ifS6wXxzw", + "outputId": "4908dbef-b495-4a17-e2f6-19a6b2b85eb3" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Số lượng từ trong câu là: 31\n", + "[0, 1108, 19703, 6, 28163, 40, 57, 4, 68, 414, 364, 82, 213, 2747, 20899, 2533, 34, 23798, 4, 13468, 89, 532, 3364, 58, 2181, 33151, 4, 1124, 2396, 68, 17865, 135, 2]\n", + "31\n", + "độ dài của tokens 33\n", + "\n" + ] + } + ], + "source": [ + "sentence_e = 'Chị Lãnh và Xăng ra đi , mình đứng nhìn hai chị quần xắn tròn trên vế , lặn_lội qua dòng suối nước chảy rần_rần , tự_nhiên nước_mắt mình rưng_rưng ...'\n", + "id_e = [0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n", + "def dem_so_tu(cau):\n", + " # Tách câu thành các từ bằng khoảng trắng\n", + " tu_danh_sach = cau.split()\n", + " # Đếm số lượng từ\n", + " return len(tu_danh_sach)\n", + "\n", + "# Ví dụ sử dụng\n", + "sentence_e = 'Chị Lãnh và Xăng ra đi , mình đứng nhìn hai chị quần xắn tròn trên vế , lặn_lội qua dòng suối nước chảy rần_rần , tự_nhiên nước_mắt mình rưng_rưng ...'\n", + "so_tu = dem_so_tu(sentence_e)\n", + "print(\"Số lượng từ trong câu là:\", so_tu)\n", + "input_e = tokenizer.encode(sentence_e)\n", + "tokens_e = tokenizer.convert_ids_to_tokens(input_e[0])\n", + "print(input_e)\n", + "print(len(id_e))\n", + "print('độ dài của tokens',len(input_e))\n", + "print(tokens_e)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 830, + "referenced_widgets": [ + "fabfacd2a2964d23994070bcb6bc4b3c", + "09db5366452347dfa40bbd192d22d489", + "706046f4266a4ade9f5f1718fc0bbf4f", + "d39e9db3bbf84736b090e12ecf9bd1fd", + "654a5b527d0c4c51afce2e65fb3b36aa", + "fa1045a3003d4496a2a5c3055355120d", + "f00793fc0f1948fca2fda701eb461505", + "9994f44df6a0451792559446361557e9", + "cc2a9437c039472f8447f9d0194459dc", + "2c64cf42d35a4722a3b32b366d1dcd1b", + "a9886da1334d4df0b4c6536255df6420" + ] + }, + "id": "3wpjBGK3JuwS", + "outputId": "ec11cd7f-84ca-402d-c7c0-b86db3ea555c" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + " 0%| | 18/16858 [00:00<08:06, 34.64it/s]" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "model.safetensors: 0%| | 0.00/543M [00:00 Train Loss: {total_train_loss/len(train_loader):.4f} | \"\n", + "# f\"Test Loss: {avg_test_loss:.4f} | test_f1: {f1:.4f} | test_acc: {accuracy:.4f}\")\n", + "\n", + "\n", + "# print(\"\\n--- Training Finished ---\")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "id": "xnPOKnLBJF7A" + }, + "outputs": [], + "source": 
[ + "# # --- 5. Final Evaluation Report ---\n", + "# print(\"\\nFinal Test Set Performance:\")\n", + "# model.eval()\n", + "# all_preds_final, all_true_final = [], []\n", + "# with torch.no_grad():\n", + "# for x, y, lengths in tqdm(test_loader, desc=\"Generating Final Report\"):\n", + "# x, y = x.to(device), y.to(device)\n", + "# preds = torch.argmax(model(x), dim=2)\n", + "# for i in range(len(lengths)):\n", + "# true_len = lengths[i]\n", + "# all_true_final.extend(y[i, :true_len].cpu().numpy())\n", + "# all_preds_final.extend(preds[i, :true_len].cpu().numpy())\n", + "\n", + "# # Generate and print the classification report\n", + "# target_names = [id_tag[i] for i in range(NUM_TAGS)]\n", + "# report = classification_report(all_true_final, all_preds_final, target_names=target_names, digits=4)\n", + "# print(\"\\nClassification Report:\\n\", report)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "CM4FYvo4IL3e", + "outputId": "a86b87f7-0f74-4c96-dd9a-9450ca3b905b" + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "Tracking run with wandb version 0.19.11" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "Run data is saved locally in /content/wandb/run-20250610_121142-lmnb07kv" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "Syncing run Softmax_VLSP2016 to Weights & Biases (docs)
" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + " View project at https://wandb.ai/lenguyenquocanh-vn-fptu-fpt-university/NER" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + " View run at https://wandb.ai/lenguyenquocanh-vn-fptu-fpt-university/NER/runs/lmnb07kv" + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Starting Softmax Model Training...\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Epoch 1/20: 100%|██████████| 841/841 [00:05<00:00, 149.09it/s, loss=0.122]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Saved improved model to checkpoints/best_epoch_1.pt\n", + "Epoch 1/20 -> Train Loss: 0.1224 | Val Loss: 0.0489 | Val F1: 0.7949 | Val Acc: 0.9847\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Epoch 2/20: 100%|██████████| 841/841 [00:04<00:00, 173.24it/s, loss=0.0419]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Saved improved model to checkpoints/best_epoch_2.pt\n", + "Epoch 2/20 -> Train Loss: 0.0419 | Val Loss: 0.0359 | Val F1: 0.8518 | Val Acc: 0.9885\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Epoch 3/20: 100%|██████████| 841/841 [00:05<00:00, 159.28it/s, loss=0.0338]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Saved improved model to checkpoints/best_epoch_3.pt\n", + "Epoch 3/20 -> Train Loss: 0.0338 | Val Loss: 0.0314 | Val F1: 0.8666 | Val Acc: 0.9898\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Epoch 4/20: 100%|██████████| 841/841 [00:04<00:00, 174.77it/s, loss=0.03]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Saved improved model to checkpoints/best_epoch_4.pt\n", + "Epoch 4/20 -> Train 
Loss: 0.0300 | Val Loss: 0.0295 | Val F1: 0.8674 | Val Acc: 0.9899\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Epoch 5/20: 100%|██████████| 841/841 [00:05<00:00, 157.59it/s, loss=0.0279]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Saved improved model to checkpoints/best_epoch_5.pt\n", + "Epoch 5/20 -> Train Loss: 0.0279 | Val Loss: 0.0281 | Val F1: 0.8838 | Val Acc: 0.9908\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Epoch 6/20: 100%|██████████| 841/841 [00:04<00:00, 172.50it/s, loss=0.026]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Saved improved model to checkpoints/best_epoch_6.pt\n", + "Epoch 6/20 -> Train Loss: 0.0260 | Val Loss: 0.0268 | Val F1: 0.8838 | Val Acc: 0.9909\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Epoch 7/20: 100%|██████████| 841/841 [00:05<00:00, 156.51it/s, loss=0.0249]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Saved improved model to checkpoints/best_epoch_7.pt\n", + "Epoch 7/20 -> Train Loss: 0.0249 | Val Loss: 0.0262 | Val F1: 0.8855 | Val Acc: 0.9910\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Epoch 8/20: 100%|██████████| 841/841 [00:04<00:00, 173.05it/s, loss=0.0238]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Saved improved model to checkpoints/best_epoch_8.pt\n", + "Epoch 8/20 -> Train Loss: 0.0238 | Val Loss: 0.0258 | Val F1: 0.8849 | Val Acc: 0.9912\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Epoch 9/20: 100%|██████████| 841/841 [00:05<00:00, 158.86it/s, loss=0.0228]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 9/20 -> Train Loss: 0.0228 | Val Loss: 0.0256 | Val F1: 0.8850 | Val Acc: 0.9912\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Epoch 10/20: 
100%|██████████| 841/841 [00:04<00:00, 170.77it/s, loss=0.0224]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Saved improved model to checkpoints/best_epoch_10.pt\n", + "Epoch 10/20 -> Train Loss: 0.0224 | Val Loss: 0.0254 | Val F1: 0.8866 | Val Acc: 0.9914\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Epoch 11/20: 100%|██████████| 841/841 [00:05<00:00, 163.16it/s, loss=0.0218]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Saved improved model to checkpoints/best_epoch_11.pt\n", + "Epoch 11/20 -> Train Loss: 0.0218 | Val Loss: 0.0249 | Val F1: 0.8908 | Val Acc: 0.9916\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Epoch 12/20: 100%|██████████| 841/841 [00:04<00:00, 170.64it/s, loss=0.021]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 12/20 -> Train Loss: 0.0210 | Val Loss: 0.0252 | Val F1: 0.8885 | Val Acc: 0.9914\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Epoch 13/20: 100%|██████████| 841/841 [00:05<00:00, 161.40it/s, loss=0.0209]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 13/20 -> Train Loss: 0.0209 | Val Loss: 0.0250 | Val F1: 0.8902 | Val Acc: 0.9915\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Epoch 14/20: 100%|██████████| 841/841 [00:04<00:00, 170.76it/s, loss=0.0203]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 14/20 -> Train Loss: 0.0203 | Val Loss: 0.0251 | Val F1: 0.8895 | Val Acc: 0.9915\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Epoch 15/20: 100%|██████████| 841/841 [00:05<00:00, 162.77it/s, loss=0.0199]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 15/20 -> Train Loss: 0.0199 | Val Loss: 0.0250 | Val F1: 0.8868 | Val Acc: 0.9913\n" + ] + }, + { + "output_type": "stream", + 
"name": "stderr", + "text": [ + "Epoch 16/20: 100%|██████████| 841/841 [00:04<00:00, 171.25it/s, loss=0.0197]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 16/20 -> Train Loss: 0.0197 | Val Loss: 0.0253 | Val F1: 0.8888 | Val Acc: 0.9912\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Epoch 17/20: 100%|██████████| 841/841 [00:05<00:00, 160.48it/s, loss=0.0195]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 17/20 -> Train Loss: 0.0195 | Val Loss: 0.0250 | Val F1: 0.8900 | Val Acc: 0.9915\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Epoch 18/20: 100%|██████████| 841/841 [00:04<00:00, 168.69it/s, loss=0.0192]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Epoch 18/20 -> Train Loss: 0.0192 | Val Loss: 0.0250 | Val F1: 0.8893 | Val Acc: 0.9914\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Epoch 19/20: 100%|██████████| 841/841 [00:05<00:00, 163.39it/s, loss=0.0188]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Saved improved model to checkpoints/best_epoch_19.pt\n", + "Epoch 19/20 -> Train Loss: 0.0188 | Val Loss: 0.0253 | Val F1: 0.8926 | Val Acc: 0.9915\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Epoch 20/20: 100%|██████████| 841/841 [00:04<00:00, 168.43it/s, loss=0.0188]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Saved improved model to checkpoints/best_epoch_20.pt\n", + "Epoch 20/20 -> Train Loss: 0.0188 | Val Loss: 0.0249 | Val F1: 0.8936 | Val Acc: 0.9918\n", + "\n", + "--- Training Finished ---\n", + "\n", + "Final Test Set Performance:\n", + "\n", + "Classification Report:\n", + " precision recall f1-score support\n", + "\n", + " O 0.9973 0.9973 0.9973 68476\n", + " B-PER 0.9869 0.9768 0.9818 1464\n", + " I-PER 0.9810 0.9767 0.9788 686\n", + " B-ORG 0.7709 0.8249 
0.7970 257\n", + " I-ORG 0.7981 0.7721 0.7849 430\n", + " B-LOC 0.8809 0.9001 0.8904 1241\n", + " I-LOC 0.8339 0.8159 0.8248 554\n", + "\n", + " accuracy 0.9918 73108\n", + " macro avg 0.8927 0.8948 0.8936 73108\n", + "weighted avg 0.9918 0.9918 0.9918 73108\n", + "\n" + ] + } + ], + "source": [ + "import os\n", + "import torch\n", + "import torch.nn as nn\n", + "from torch.utils.data import Dataset, DataLoader\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import precision_recall_fscore_support, classification_report, accuracy_score\n", + "from tqdm import tqdm\n", + "import wandb\n", + "\n", + "# Create checkpoint directory\n", + "os.makedirs(\"checkpoints\", exist_ok=True)\n", + "\n", + "# Initialize Weights & Biases\n", + "wandb.init(\n", + " project=\"NER\",\n", + " name=\"Softmax_VLSP2016\",\n", + " config={\n", + " \"epochs\": 20,\n", + " \"batch_size\": 16,\n", + " \"learning_rate\": 1e-3,\n", + " \"input_dim\": 768,\n", + " \"test_size\": 0.2\n", + " }\n", + ")\n", + "\n", + "# --- Dataset ---\n", + "class NERDataset(Dataset):\n", + " def __init__(self, embeddings, labels):\n", + " self.embeddings = embeddings\n", + " self.labels = labels\n", + "\n", + " def __len__(self):\n", + " return len(self.embeddings)\n", + "\n", + " def __getitem__(self, idx):\n", + " return self.embeddings[idx], self.labels[idx]\n", + "\n", + "def collate_fn(batch):\n", + " embeddings, labels = zip(*batch)\n", + " lengths = [len(x) for x in embeddings]\n", + " max_len = max(lengths)\n", + "\n", + " padded_embs = torch.stack([\n", + " torch.cat([e, torch.zeros(max_len - len(e), e.size(1))]) for e in embeddings\n", + " ])\n", + " padded_labels = torch.stack([\n", + " torch.cat([l, torch.full((max_len - len(l),), -1)]) for l in labels\n", + " ])\n", + " return padded_embs, padded_labels, lengths\n", + "\n", + "# --- Model ---\n", + "class SoftmaxTagger(nn.Module):\n", + " def __init__(self, input_dim, num_tags):\n", + " super().__init__()\n", 
+ " self.hidden2tag = nn.Linear(input_dim, num_tags)\n", + "\n", + " def forward(self, x):\n", + " return self.hidden2tag(x)\n", + "\n", + " def save_model(self, path):\n", + " torch.save(self.state_dict(), path)\n", + "\n", + " def load_model(self, path):\n", + " self.load_state_dict(torch.load(path))\n", + " self.eval()\n", + "\n", + "# --- Evaluation ---\n", + "def evaluate_softmax(model, dataloader, loss_fn, device):\n", + " model.eval()\n", + " total_loss = 0\n", + " all_preds, all_true = [], []\n", + "\n", + " with torch.no_grad():\n", + " for x, y, lengths in dataloader:\n", + " x, y = x.to(device), y.to(device)\n", + " emissions = model(x)\n", + " loss = loss_fn(emissions.view(-1, model.hidden2tag.out_features), y.view(-1))\n", + " total_loss += loss.item()\n", + " preds = torch.argmax(emissions, dim=2)\n", + " for i in range(len(lengths)):\n", + " true_len = lengths[i]\n", + " all_true.extend(y[i, :true_len].cpu().numpy())\n", + " all_preds.extend(preds[i, :true_len].cpu().numpy())\n", + "\n", + " precision, recall, f1, _ = precision_recall_fscore_support(all_true, all_preds, average='macro', zero_division=0)\n", + " accuracy = accuracy_score(all_true, all_preds)\n", + "\n", + " return total_loss / len(dataloader), precision, recall, f1, accuracy, all_preds, all_true\n", + "\n", + "# Train/test split\n", + "train_embs, test_embs, train_labels, test_labels = train_test_split(\n", + " all_embeddings, all_labels, test_size=0.2, random_state=42\n", + ")\n", + "\n", + "train_dataset = NERDataset(train_embs, train_labels)\n", + "test_dataset = NERDataset(test_embs, test_labels)\n", + "\n", + "BATCH_SIZE = wandb.config.batch_size\n", + "train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)\n", + "test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)\n", + "\n", + "INPUT_DIM = wandb.config.input_dim\n", + "NUM_TAGS = max(label.max().item() for label in all_labels) + 1\n", 
+ "LEARNING_RATE = wandb.config.learning_rate\n", + "EPOCHS = wandb.config.epochs\n", + "\n", + "model = SoftmaxTagger(INPUT_DIM, NUM_TAGS).to(device)\n", + "loss_fn = nn.CrossEntropyLoss(ignore_index=-1)\n", + "optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)\n", + "\n", + "best_f1 = 0\n", + "best_acc = 0\n", + "\n", + "# --- Training Loop ---\n", + "print(\"Starting Softmax Model Training...\")\n", + "for epoch in range(1, EPOCHS + 1):\n", + " model.train()\n", + " total_train_loss = 0\n", + " all_train_preds, all_train_true = [], []\n", + "\n", + " train_bar = tqdm(train_loader, desc=f\"Epoch {epoch}/{EPOCHS}\")\n", + " for x, y, lengths in train_bar:\n", + " x, y = x.to(device), y.to(device)\n", + " emissions = model(x)\n", + " loss = loss_fn(emissions.view(-1, NUM_TAGS), y.view(-1))\n", + " optimizer.zero_grad()\n", + " loss.backward()\n", + " optimizer.step()\n", + " total_train_loss += loss.item()\n", + " train_bar.set_postfix(loss=total_train_loss / len(train_bar))\n", + "\n", + " preds = torch.argmax(emissions, dim=2)\n", + " for i in range(len(lengths)):\n", + " true_len = lengths[i]\n", + " all_train_true.extend(y[i, :true_len].cpu().numpy())\n", + " all_train_preds.extend(preds[i, :true_len].cpu().numpy())\n", + "\n", + " train_precision, train_recall, train_f1, _ = precision_recall_fscore_support(\n", + " all_train_true, all_train_preds, average='macro', zero_division=0\n", + " )\n", + " train_acc = accuracy_score(all_train_true, all_train_preds)\n", + "\n", + " # Validation\n", + " val_loss, val_precision, val_recall, val_f1, val_acc, _, _ = evaluate_softmax(model, test_loader, loss_fn, device)\n", + "\n", + " # Logging to wandb\n", + " wandb.log({\n", + " \"epoch\": epoch,\n", + " \"avg_train_loss\": total_train_loss / len(train_loader),\n", + " \"train_precision\": train_precision,\n", + " \"train_recall\": train_recall,\n", + " \"train_f1\": train_f1,\n", + " \"train_acc\": train_acc,\n", + " \"val_loss\": val_loss,\n", + " 
\"val_precision\": val_precision,\n", + " \"val_recall\": val_recall,\n", + " \"val_f1\": val_f1,\n", + " \"val_acc\": val_acc,\n", + " })\n", + "\n", + " # Save best model\n", + " if val_f1 > best_f1 or val_acc > best_acc:\n", + " best_f1 = max(val_f1, best_f1)\n", + " best_acc = max(val_acc, best_acc)\n", + " ckpt_path = f\"checkpoints/best_epoch_{epoch}.pt\"\n", + " model.save_model(ckpt_path)\n", + " wandb.save(ckpt_path)\n", + " print(f\"Saved improved model to {ckpt_path}\")\n", + "\n", + " print(f\"Epoch {epoch}/{EPOCHS} -> Train Loss: {total_train_loss/len(train_loader):.4f} | \"\n", + " f\"Val Loss: {val_loss:.4f} | Val F1: {val_f1:.4f} | Val Acc: {val_acc:.4f}\")\n", + "\n", + "print(\"\\n--- Training Finished ---\")\n", + "\n", + "# --- Final Evaluation Report ---\n", + "print(\"\\nFinal Test Set Performance:\")\n", + "model.eval()\n", + "_, _, _, _, _, all_preds_final, all_true_final = evaluate_softmax(model, test_loader, loss_fn, device)\n", + "\n", + "# Classification report table\n", + "target_names = [id_tag[i] for i in range(NUM_TAGS)]\n", + "report = classification_report(all_true_final, all_preds_final, target_names=target_names, digits=4, output_dict=True)\n", + "\n", + "# Log report as wandb table\n", + "table = wandb.Table(columns=[\"Label\", \"Precision\", \"Recall\", \"F1-score\", \"Support\"])\n", + "for label in target_names:\n", + " row = report[label]\n", + " table.add_data(label, row[\"precision\"], row[\"recall\"], row[\"f1-score\"], row[\"support\"])\n", + "\n", + "wandb.log({\"Test Classification Report\": table})\n", + "print(\"\\nClassification Report:\\n\", classification_report(all_true_final, all_preds_final, target_names=target_names, digits=4))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4Ppa-bdT8r2v" + }, + "source": [ + "# Lưu data" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "id": "s9GulKoGqx6d" + }, + "outputs": [], + "source": [ + "def save_tensors(all_embeddings, 
all_labels, embed_path='embeddings.pt', label_path='labels.pt'):\n", + " torch.save(all_embeddings, embed_path)\n", + " torch.save(all_labels, label_path)\n", + " print(f\"Saved embeddings to {embed_path} and labels to {label_path}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 70 + }, + "id": "AGAJZH_h8ve6", + "outputId": "1893fbfa-dbcc-48f9-b6e3-ef17f9eef51c" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Saved embeddings to embeddings.pt and labels to labels.pt\n", + "Mounted at /content/drive\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'/content/drive/My Drive/labels.pt'" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + } + }, + "metadata": {}, + "execution_count": 22 + } + ], + "source": [ + "from google.colab import drive\n", + "import shutil\n", + "\n", + "# Gọi hàm đã viết\n", + "save_tensors(all_embeddings, all_labels)\n", + "\n", + "# Mount và tải lên Drive\n", + "drive.mount('/content/drive')\n", + "shutil.copy('embeddings.pt', '/content/drive/My Drive')\n", + "shutil.copy('labels.pt', '/content/drive/My Drive')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "id": "ESWu8QI59dwl" + }, + "outputs": [], + "source": [ + "torch.save(model.state_dict(), \"softmax_tagger.pth\")" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "p_iixQwcVuum", + "outputId": "9d4875b8-f67a-46aa-c5c4-a001836e9cb6" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "SoftmaxTagger(\n", + " (hidden2tag): Linear(in_features=768, out_features=7, bias=True)\n", + ")" + ] + }, + "metadata": {}, + "execution_count": 24 + } + ], + "source": [ + "model = SoftmaxTagger(INPUT_DIM, NUM_TAGS)\n", + 
"model.load_state_dict(torch.load(\"softmax_tagger.pth\"))\n", + "model.eval() # chuyển sang chế độ đánh giá nếu cần\n" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "id": "vM2lbEBkXxzy" + }, + "outputs": [], + "source": [ + "from transformers import AutoModel, AutoTokenizer\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(\"vinai/phobert-base\")\n", + "model_bert = AutoModel.from_pretrained(\"vinai/phobert-base\").to(device) # PhoBERT để lấy embedding\n", + "\n", + "# model là SoftmaxTagger đã train xong\n" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "id": "InnYf_SyXxzy" + }, + "outputs": [], + "source": [ + "def predict_ner(text):\n", + " device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", + " model.eval()\n", + "\n", + " # Tokenize văn bản đầu vào\n", + " input_ids = tokenizer.encode(text, return_tensors=\"pt\").to(device)\n", + " tokens = tokenizer.convert_ids_to_tokens(input_ids[0].cpu())\n", + "\n", + " # Lấy embedding đầu ra từ PhoBERT\n", + " with torch.no_grad():\n", + " outputs = model_bert(input_ids)\n", + " last_hidden_state = outputs.last_hidden_state.squeeze(0).cpu()\n", + "\n", + " # Gộp embedding của từ bị tách (sentencepiece)\n", + " word_embeds = group_embeddings(tokens, last_hidden_state)\n", + "\n", + " # Chuyển sang tensor\n", + " x_tensor = torch.stack(word_embeds).unsqueeze(0).to(device) # (1, seq_len, 768)\n", + "\n", + " # Dự đoán\n", + " with torch.no_grad():\n", + " emissions = model(x_tensor)\n", + " preds = torch.argmax(emissions, dim=2).squeeze(0).cpu().tolist()\n", + "\n", + " # Trích xuất token gốc không bị tách '@@'\n", + " final_tokens = []\n", + " current_token = \"\"\n", + " for tok in tokens:\n", + " if tok in [\"\", \"\"]:\n", + " continue\n", + " if tok.endswith(\"@@\"):\n", + " current_token += tok[:-2]\n", + " else:\n", + " current_token += tok\n", + " final_tokens.append(current_token)\n", + " current_token = \"\"\n", + 
"\n", + " # Ánh xạ sang tên nhãn\n", + " label_names = [id_tag[i] for i in preds]\n", + "\n", + " return preds, label_names, final_tokens\n" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "collapsed": true, + "id": "aSj0-dl1Xxzy", + "outputId": "26bbf3fe-b011-4a59-bb3e-03d9511be21a" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Requirement already satisfied: gradio in /usr/local/lib/python3.11/dist-packages (5.31.0)\n", + "Requirement already satisfied: aiofiles<25.0,>=22.0 in /usr/local/lib/python3.11/dist-packages (from gradio) (24.1.0)\n", + "Requirement already satisfied: anyio<5.0,>=3.0 in /usr/local/lib/python3.11/dist-packages (from gradio) (4.9.0)\n", + "Requirement already satisfied: fastapi<1.0,>=0.115.2 in /usr/local/lib/python3.11/dist-packages (from gradio) (0.115.12)\n", + "Requirement already satisfied: ffmpy in /usr/local/lib/python3.11/dist-packages (from gradio) (0.6.0)\n", + "Requirement already satisfied: gradio-client==1.10.1 in /usr/local/lib/python3.11/dist-packages (from gradio) (1.10.1)\n", + "Requirement already satisfied: groovy~=0.1 in /usr/local/lib/python3.11/dist-packages (from gradio) (0.1.2)\n", + "Requirement already satisfied: httpx>=0.24.1 in /usr/local/lib/python3.11/dist-packages (from gradio) (0.28.1)\n", + "Requirement already satisfied: huggingface-hub>=0.28.1 in /usr/local/lib/python3.11/dist-packages (from gradio) (0.32.4)\n", + "Requirement already satisfied: jinja2<4.0 in /usr/local/lib/python3.11/dist-packages (from gradio) (3.1.6)\n", + "Requirement already satisfied: markupsafe<4.0,>=2.0 in /usr/local/lib/python3.11/dist-packages (from gradio) (3.0.2)\n", + "Requirement already satisfied: numpy<3.0,>=1.0 in /usr/local/lib/python3.11/dist-packages (from gradio) (2.0.2)\n", + "Requirement already satisfied: orjson~=3.0 in /usr/local/lib/python3.11/dist-packages (from gradio) (3.10.18)\n", + 
"Requirement already satisfied: packaging in /usr/local/lib/python3.11/dist-packages (from gradio) (24.2)\n", + "Requirement already satisfied: pandas<3.0,>=1.0 in /usr/local/lib/python3.11/dist-packages (from gradio) (2.2.2)\n", + "Requirement already satisfied: pillow<12.0,>=8.0 in /usr/local/lib/python3.11/dist-packages (from gradio) (11.2.1)\n", + "Requirement already satisfied: pydantic<2.12,>=2.0 in /usr/local/lib/python3.11/dist-packages (from gradio) (2.11.5)\n", + "Requirement already satisfied: pydub in /usr/local/lib/python3.11/dist-packages (from gradio) (0.25.1)\n", + "Requirement already satisfied: python-multipart>=0.0.18 in /usr/local/lib/python3.11/dist-packages (from gradio) (0.0.20)\n", + "Requirement already satisfied: pyyaml<7.0,>=5.0 in /usr/local/lib/python3.11/dist-packages (from gradio) (6.0.2)\n", + "Requirement already satisfied: ruff>=0.9.3 in /usr/local/lib/python3.11/dist-packages (from gradio) (0.11.12)\n", + "Requirement already satisfied: safehttpx<0.2.0,>=0.1.6 in /usr/local/lib/python3.11/dist-packages (from gradio) (0.1.6)\n", + "Requirement already satisfied: semantic-version~=2.0 in /usr/local/lib/python3.11/dist-packages (from gradio) (2.10.0)\n", + "Requirement already satisfied: starlette<1.0,>=0.40.0 in /usr/local/lib/python3.11/dist-packages (from gradio) (0.46.2)\n", + "Requirement already satisfied: tomlkit<0.14.0,>=0.12.0 in /usr/local/lib/python3.11/dist-packages (from gradio) (0.13.2)\n", + "Requirement already satisfied: typer<1.0,>=0.12 in /usr/local/lib/python3.11/dist-packages (from gradio) (0.16.0)\n", + "Requirement already satisfied: typing-extensions~=4.0 in /usr/local/lib/python3.11/dist-packages (from gradio) (4.14.0)\n", + "Requirement already satisfied: uvicorn>=0.14.0 in /usr/local/lib/python3.11/dist-packages (from gradio) (0.34.3)\n", + "Requirement already satisfied: fsspec in /usr/local/lib/python3.11/dist-packages (from gradio-client==1.10.1->gradio) (2025.3.2)\n", + "Requirement already satisfied: 
websockets<16.0,>=10.0 in /usr/local/lib/python3.11/dist-packages (from gradio-client==1.10.1->gradio) (15.0.1)\n", + "Requirement already satisfied: idna>=2.8 in /usr/local/lib/python3.11/dist-packages (from anyio<5.0,>=3.0->gradio) (3.10)\n", + "Requirement already satisfied: sniffio>=1.1 in /usr/local/lib/python3.11/dist-packages (from anyio<5.0,>=3.0->gradio) (1.3.1)\n", + "Requirement already satisfied: certifi in /usr/local/lib/python3.11/dist-packages (from httpx>=0.24.1->gradio) (2025.4.26)\n", + "Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.11/dist-packages (from httpx>=0.24.1->gradio) (1.0.9)\n", + "Requirement already satisfied: h11>=0.16 in /usr/local/lib/python3.11/dist-packages (from httpcore==1.*->httpx>=0.24.1->gradio) (0.16.0)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.11/dist-packages (from huggingface-hub>=0.28.1->gradio) (3.18.0)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.11/dist-packages (from huggingface-hub>=0.28.1->gradio) (2.32.3)\n", + "Requirement already satisfied: tqdm>=4.42.1 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub>=0.28.1->gradio) (4.67.1)\n", + "Requirement already satisfied: hf-xet<2.0.0,>=1.1.2 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub>=0.28.1->gradio) (1.1.2)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.11/dist-packages (from pandas<3.0,>=1.0->gradio) (2.9.0.post0)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.11/dist-packages (from pandas<3.0,>=1.0->gradio) (2025.2)\n", + "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.11/dist-packages (from pandas<3.0,>=1.0->gradio) (2025.2)\n", + "Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.11/dist-packages (from pydantic<2.12,>=2.0->gradio) (0.7.0)\n", + "Requirement already satisfied: pydantic-core==2.33.2 in 
/usr/local/lib/python3.11/dist-packages (from pydantic<2.12,>=2.0->gradio) (2.33.2)\n", + "Requirement already satisfied: typing-inspection>=0.4.0 in /usr/local/lib/python3.11/dist-packages (from pydantic<2.12,>=2.0->gradio) (0.4.1)\n", + "Requirement already satisfied: click>=8.0.0 in /usr/local/lib/python3.11/dist-packages (from typer<1.0,>=0.12->gradio) (8.2.1)\n", + "Requirement already satisfied: shellingham>=1.3.0 in /usr/local/lib/python3.11/dist-packages (from typer<1.0,>=0.12->gradio) (1.5.4)\n", + "Requirement already satisfied: rich>=10.11.0 in /usr/local/lib/python3.11/dist-packages (from typer<1.0,>=0.12->gradio) (13.9.4)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.11/dist-packages (from python-dateutil>=2.8.2->pandas<3.0,>=1.0->gradio) (1.17.0)\n", + "Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.11/dist-packages (from rich>=10.11.0->typer<1.0,>=0.12->gradio) (3.0.0)\n", + "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.11/dist-packages (from rich>=10.11.0->typer<1.0,>=0.12->gradio) (2.19.1)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests->huggingface-hub>=0.28.1->gradio) (3.4.2)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.11/dist-packages (from requests->huggingface-hub>=0.28.1->gradio) (2.4.0)\n", + "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.11/dist-packages (from markdown-it-py>=2.2.0->rich>=10.11.0->typer<1.0,>=0.12->gradio) (0.1.2)\n" + ] + } + ], + "source": [ + "pip install gradio" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "SshnFWQzj6aS", + "outputId": "f25a7aa5-f179-472f-a79c-11df166497be" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "SoftmaxTagger(\n", + " 
(hidden2tag): Linear(in_features=768, out_features=7, bias=True)\n", + ")" + ] + }, + "metadata": {}, + "execution_count": 28 + } + ], + "source": [ + "model = SoftmaxTagger(INPUT_DIM, NUM_TAGS) # Make sure INPUT_DIM and NUM_TAGS are defined or accessible here\n", + "model.load_state_dict(torch.load(\"softmax_tagger.pth\"))\n", + "model.eval() # chuyển sang chế độ đánh giá nếu cần\n", + "model.to(device) # Add this line to move the model to the device" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 625 + }, + "id": "wIK-QRWmXxzz", + "outputId": "2547cdda-a687-46a9-9243-cea7a8916de6" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).\n", + "\n", + "Colab notebook detected. To show errors in colab notebook, set debug=True in launch()\n", + "* Running on public URL: https://c3f739dbf40a0a0681.gradio.live\n", + "\n", + "This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "
" + ] + }, + "metadata": {} + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [] + }, + "metadata": {}, + "execution_count": 29 + } + ], + "source": [ + "import gradio as gr\n", + "import json\n", + "import tempfile\n", + "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", + "\n", + "\n", + "def ner_interface(text):\n", + " try:\n", + " ids, labels, tokens = predict_ner(text)\n", + " data = [[token, label, _id] for token, label, _id in zip(tokens, labels, ids)]\n", + " json_result = {\n", + " \"tokens\": tokens,\n", + " \"labels\": labels,\n", + " \"label_ids\": ids\n", + " }\n", + " return data, json_result\n", + " except Exception as e:\n", + " print(\"Error:\", e)\n", + " return [[\"Lỗi\", str(e), \"\"]], {\"error\": str(e)}\n", + "\n", + "def json_to_file(json_data):\n", + " # Tạo file tạm thời để trả về cho gr.File tải về\n", + " tmp = tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False, encoding='utf-8')\n", + " json.dump(json_data, tmp, ensure_ascii=False, indent=2)\n", + " tmp.close()\n", + " return tmp.name\n", + "\n", + "\n", + "with gr.Blocks(title=\"Nhận dạng Thực thể (NER) với PhoBERT\") as demo:\n", + " gr.Markdown(\"## 📌 Hệ thống Nhận dạng Thực thể Tên (NER) sử dụng PhoBERT + Softmax\")\n", + "\n", + " with gr.Row():\n", + " with gr.Column(scale=3):\n", + " input_text = gr.Textbox(\n", + " lines=4,\n", + " label=\"✍️ Nhập văn bản đầu vào\",\n", + " placeholder=\"Ví dụ: Nguyễn Văn A sinh ra ở Hà Nội.\"\n", + " )\n", + " btn = gr.Button(\"🚀 Nhận dạng Thực thể\")\n", + "\n", + " with gr.Column(scale=5):\n", + " output_table = gr.Dataframe(\n", + " headers=[\"Token\", \"Label\", \"ID\"],\n", + " label=\"📄 Kết quả nhận dạng thực thể\",\n", + " wrap=True\n", + " )\n", + " output_json = gr.JSON(visible=False) # Có thể bật nếu muốn hiển thị JSON\n", + "\n", + " with gr.Row():\n", + " download_trigger = gr.Button(\"💾 Tải kết quả dưới dạng JSON\")\n", + " download_file = gr.File(label=\"📥 File 
JSON đã xử lý\")\n", + "\n", + " # Hành động xử lý NER\n", + " btn.click(fn=ner_interface, inputs=input_text, outputs=[output_table, output_json])\n", + " download_trigger.click(fn=json_to_file, inputs=output_json, outputs=download_file)\n", + "\n", + "demo.launch()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "id": "mvQgpNetjo02" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [ + "DiCxlUcHQ9NJ" + ], + "gpuType": "T4", + "provenance": [] + }, + "kaggle": { + "accelerator": "nvidiaTeslaT4", + "dataSources": [], + "dockerImageVersionId": 31040, + "isGpuEnabled": true, + "isInternetEnabled": true, + "language": "python", + "sourceType": "notebook" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.11" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "1d9aaa035056485c959f66b60cf41714": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_71054a4397e344a2a66e32892a37b59b", + "IPY_MODEL_6c582399be1d42c3b5a4ef21743d1a26", + "IPY_MODEL_26a6ef7f967a4504a698ff3152ccb24e" + ], + "layout": "IPY_MODEL_82669dd27686486588fdd7d11f49edd3" + } + }, + "71054a4397e344a2a66e32892a37b59b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + 
"model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_461c136b3eac4a9dadb8a3af7c11c98a", + "placeholder": "​", + "style": "IPY_MODEL_4fb68dcea1ac4e54b82c92ad64e9be95", + "value": "config.json: 100%" + } + }, + "6c582399be1d42c3b5a4ef21743d1a26": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_df397db853874f6db911acb667785ec0", + "max": 557, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_2de5befe0ab24de9a62ba076e5abf78e", + "value": 557 + } + }, + "26a6ef7f967a4504a698ff3152ccb24e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d7da38e7c5e1484597bb1faae3c2d7f5", + "placeholder": "​", + "style": "IPY_MODEL_2ec3573cb04143a8ba5e555bfaf60165", + "value": " 557/557 [00:00<00:00, 52.7kB/s]" + } + }, + "82669dd27686486588fdd7d11f49edd3": { + 
"model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "461c136b3eac4a9dadb8a3af7c11c98a": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + 
"grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4fb68dcea1ac4e54b82c92ad64e9be95": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "df397db853874f6db911acb667785ec0": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + 
"overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2de5befe0ab24de9a62ba076e5abf78e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "d7da38e7c5e1484597bb1faae3c2d7f5": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2ec3573cb04143a8ba5e555bfaf60165": { + "model_module": "@jupyter-widgets/controls", + "model_name": 
"DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "67c557f2651b4e5c81e9af82531898fe": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_5a5a04069ebd41fdba7835e1b5da585e", + "IPY_MODEL_6c5b30dc67d94071af4d4b14cf4be7e4", + "IPY_MODEL_e9200c8269fa4ebd9c1157cc1b871005" + ], + "layout": "IPY_MODEL_50ab44e431a54c27b409dc74c068c392" + } + }, + "5a5a04069ebd41fdba7835e1b5da585e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9929425b4df94d3792dc454afe59b3fd", + "placeholder": "​", + "style": "IPY_MODEL_03efbd25cd4341cea6714ffc3585632a", + "value": "vocab.txt: 100%" + } + }, + "6c5b30dc67d94071af4d4b14cf4be7e4": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + 
"_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ae22d9a4de574ce3905a6f6c82fac1aa", + "max": 895321, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_71fe669e2f68444ba4b81ffc14a39c03", + "value": 895321 + } + }, + "e9200c8269fa4ebd9c1157cc1b871005": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4d4c131206f448c7ba6d5c4e41126d41", + "placeholder": "​", + "style": "IPY_MODEL_ec260d8279bd4a40ad2bbc3c1d7dadaa", + "value": " 895k/895k [00:00<00:00, 1.91MB/s]" + } + }, + "50ab44e431a54c27b409dc74c068c392": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": 
null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9929425b4df94d3792dc454afe59b3fd": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "03efbd25cd4341cea6714ffc3585632a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + 
"_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ae22d9a4de574ce3905a6f6c82fac1aa": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "71fe669e2f68444ba4b81ffc14a39c03": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "4d4c131206f448c7ba6d5c4e41126d41": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + 
"_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ec260d8279bd4a40ad2bbc3c1d7dadaa": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "08611ca37f8c462db079dc2883f06002": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", 
+ "children": [ + "IPY_MODEL_47eca14208cc4e5085d44cce42872a35", + "IPY_MODEL_49d3e88f60e744e7b37bcbcca5bbb087", + "IPY_MODEL_81b9bb3315e4402ea5b6768d0d189591" + ], + "layout": "IPY_MODEL_4b88329de5ed48738ca0da054a1f0131" + } + }, + "47eca14208cc4e5085d44cce42872a35": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1507e771a5ca4056b0605cd453d89c60", + "placeholder": "​", + "style": "IPY_MODEL_44946e0a5d31408a851b0e8ab5217c43", + "value": "bpe.codes: 100%" + } + }, + "49d3e88f60e744e7b37bcbcca5bbb087": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ee8c999b66e84cbda17702e916d48a3f", + "max": 1135173, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_6c6228d416944599b110ffa97b20bd8c", + "value": 1135173 + } + }, + "81b9bb3315e4402ea5b6768d0d189591": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + 
"_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_c519c27334b742ee8f14e29da2ebdf9b", + "placeholder": "​", + "style": "IPY_MODEL_7d422740d70546559703cd0304be663f", + "value": " 1.14M/1.14M [00:00<00:00, 1.74MB/s]" + } + }, + "4b88329de5ed48738ca0da054a1f0131": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1507e771a5ca4056b0605cd453d89c60": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": 
null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "44946e0a5d31408a851b0e8ab5217c43": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ee8c999b66e84cbda17702e916d48a3f": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + 
"grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6c6228d416944599b110ffa97b20bd8c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "c519c27334b742ee8f14e29da2ebdf9b": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + 
"min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7d422740d70546559703cd0304be663f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "371ab2b9d7c84402b3c4b934e89eca4b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_7ead683f167c408f88ba72b2ee1599d1", + "IPY_MODEL_737d3d1e5a3146de96c17ca8ec72d75e", + "IPY_MODEL_7ec9beb535c9428a954367613fa7f4cd" + ], + "layout": "IPY_MODEL_3c51b3c0b7ba4a2eacb1b8b2be8e024c" + } + }, + "7ead683f167c408f88ba72b2ee1599d1": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_50dc5fc6a7354394ace536241fa01714", + "placeholder": "​", + "style": 
"IPY_MODEL_78ba04a44d9e42dba9fb7617d28c91e9", + "value": "tokenizer.json: 100%" + } + }, + "737d3d1e5a3146de96c17ca8ec72d75e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ca87704af3bc4c7590eea8f8f0f50d94", + "max": 3132320, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_f3870bce67da4affa8925d9d898638da", + "value": 3132320 + } + }, + "7ec9beb535c9428a954367613fa7f4cd": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e6efac23366643dd861caf121a8a220b", + "placeholder": "​", + "style": "IPY_MODEL_1044cf40d7e54337859bab0057aa0b54", + "value": " 3.13M/3.13M [00:00<00:00, 6.78MB/s]" + } + }, + "3c51b3c0b7ba4a2eacb1b8b2be8e024c": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, 
+ "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "50dc5fc6a7354394ace536241fa01714": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + 
"78ba04a44d9e42dba9fb7617d28c91e9": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ca87704af3bc4c7590eea8f8f0f50d94": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f3870bce67da4affa8925d9d898638da": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": 
"ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "e6efac23366643dd861caf121a8a220b": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1044cf40d7e54337859bab0057aa0b54": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "3a6bb86ac3db4f82ae139507f94607e0": { + "model_module": 
"@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_04a96e9ef4774bd1a3cd3a1dd20fe194", + "IPY_MODEL_6158b1f05f6c4851b492df312f0312ca", + "IPY_MODEL_b81c4146511045e280ba4fa226074679" + ], + "layout": "IPY_MODEL_3676c8217c654790972189be8c1f4627" + } + }, + "04a96e9ef4774bd1a3cd3a1dd20fe194": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_762b2c8d23824070b2eb115e151f0c73", + "placeholder": "​", + "style": "IPY_MODEL_d9d57b09b01846a4805882b4adf64e55", + "value": "pytorch_model.bin: 100%" + } + }, + "6158b1f05f6c4851b492df312f0312ca": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e804cb9279dd458aa8b661d28c4427ff", + "max": 542923308, + "min": 0, + "orientation": "horizontal", + "style": 
"IPY_MODEL_3169169dbabb4b1aa7906a0415eacdcb", + "value": 542923308 + } + }, + "b81c4146511045e280ba4fa226074679": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_408130d71bb74141906cbc1d2123bb63", + "placeholder": "​", + "style": "IPY_MODEL_4be265bd67e1470cbd856dd268908c00", + "value": " 543M/543M [00:01<00:00, 366MB/s]" + } + }, + "3676c8217c654790972189be8c1f4627": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + 
"762b2c8d23824070b2eb115e151f0c73": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d9d57b09b01846a4805882b4adf64e55": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e804cb9279dd458aa8b661d28c4427ff": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": 
null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3169169dbabb4b1aa7906a0415eacdcb": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "408130d71bb74141906cbc1d2123bb63": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": 
null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4be265bd67e1470cbd856dd268908c00": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "fabfacd2a2964d23994070bcb6bc4b3c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_09db5366452347dfa40bbd192d22d489", + "IPY_MODEL_706046f4266a4ade9f5f1718fc0bbf4f", + "IPY_MODEL_d39e9db3bbf84736b090e12ecf9bd1fd" + ], + "layout": "IPY_MODEL_654a5b527d0c4c51afce2e65fb3b36aa" + } + }, + "09db5366452347dfa40bbd192d22d489": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + 
"_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_fa1045a3003d4496a2a5c3055355120d", + "placeholder": "​", + "style": "IPY_MODEL_f00793fc0f1948fca2fda701eb461505", + "value": "model.safetensors: 100%" + } + }, + "706046f4266a4ade9f5f1718fc0bbf4f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9994f44df6a0451792559446361557e9", + "max": 542900336, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_cc2a9437c039472f8447f9d0194459dc", + "value": 542900336 + } + }, + "d39e9db3bbf84736b090e12ecf9bd1fd": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_2c64cf42d35a4722a3b32b366d1dcd1b", + "placeholder": "​", + "style": "IPY_MODEL_a9886da1334d4df0b4c6536255df6420", + "value": " 543M/543M [00:02<00:00, 213MB/s]" + } + }, + "654a5b527d0c4c51afce2e65fb3b36aa": { + "model_module": 
"@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "fa1045a3003d4496a2a5c3055355120d": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + 
"grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f00793fc0f1948fca2fda701eb461505": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "9994f44df6a0451792559446361557e9": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + 
"overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "cc2a9437c039472f8447f9d0194459dc": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "2c64cf42d35a4722a3b32b366d1dcd1b": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a9886da1334d4df0b4c6536255df6420": { + "model_module": "@jupyter-widgets/controls", + "model_name": 
"DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/requirements.txt b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..8823d2c82071e62bde7663c8e04f180d019119a0 Binary files /dev/null and b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/requirements.txt differ diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/run.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/run.py new file mode 100644 index 0000000000000000000000000000000000000000..1031ac8d5ec0a4a37475f1eb9f7d0158d0111c34 --- /dev/null +++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/run.py @@ -0,0 +1,73 @@ +from src.preprocessing import download_raw_data, preprocess_data_for_EDA, load_phoBERT_model_and_tokenizer, create_embeddings, split_dataset +from src.data_set import NerDataset, collate_fn +from src.configs import configs +from src.model import CRF_Tagger +from src.train import train_model + +import torch +from torch.utils.data import DataLoader + +import warnings +warnings.filterwarnings("ignore") + + +def main(): + + # Download 
VLSP2016 from hgface + print("Download raw data ...") + df = download_raw_data() + + # Save raw data + df.to_csv(r".\data\raw_data.csv", index=False) + print("Save at data\raw_data.csv \n") + + # Process data for EDA + print("Process data for EDA ...") + df = preprocess_data_for_EDA(df) + df.to_csv(r".\data\processed_data_EDA.csv", index=False) + print("Save at data\processed_data_EDA.csv \n") + + # Init PhoBERT Tokenizer and PhoBERT Model + print("Embedding data ...") + model, tokenizer = load_phoBERT_model_and_tokenizer() + + # Embeddings data + processed_data = create_embeddings(df, model, tokenizer) + torch.save(processed_data, r".\data\processed_data_full.pt") + print("Save at data\processed_data_full.pt \n") + + # Split data into train/valid/test + print("Train/Valid/Test Split ...") + X_train, Y_train, X_val, Y_val, X_test, Y_test = split_dataset(processed_data) + print("Done \n") + + # Data Agumentation for training set + # Pass + + # Init DataLoader + print("Init DataLoader ...") + datasets = { + 'train': NerDataset(X_train, Y_train), + 'val': NerDataset(X_val, Y_val), + 'test': NerDataset(X_test, Y_test) + } + + loaders = { + split: DataLoader(dataset, batch_size=configs["batch_size"], shuffle=(split=='train'), collate_fn=collate_fn) + for split, dataset in datasets.items() + } + print("Done \n") + + # Init sequence label model + print("Init Model ...") + NUM_TAGS = 7 + model = CRF_Tagger(input_dim=X_train[0].size(1), num_tags=NUM_TAGS) + optimizer = torch.optim.Adam(model.parameters(), lr=configs["learning_rate"]) + print("Done \n") + + # Training Model + print("Start training ...") + train_model(model, optimizer, configs, loaders) + +if __name__ == "__main__": + main() diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/.gitattributes 
b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git 
a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/README.md b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ca121044777ca34994a9cc1568ddc703dfb631bb --- /dev/null +++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/README.md @@ -0,0 +1,87 @@ +--- +title: Vietnamese NER Demo +emoji: 🧠 +colorFrom: indigo +colorTo: yellow +sdk: streamlit +sdk_version: 1.46.1 +app_file: src/app.py +pinned: false +--- +# Vietnamese Named Entity Recognition + +## 🛠️ Set Up Your Environment With Conda + +### Option 1: Using `requirements.txt` + +```bash +conda create --name vnner python=3.10 +conda activate vnner +pip install -r requirements.txt +``` + +### Option 2: Using `environment.yml` + +```bash +conda env create -f environment.yml +conda activate vnner +``` + +## Run +```bash +python run.py +``` +--- + +## 📂 Project Structure + +``` +my_ai_project/ +│ +├── data/ +│ ├── raw_data.csv # Dữ liệu gốc +│ ├── processed_data_EDA.csv # Dữ liệu sau khi tiền xử lý +│ └── processed_data_full.csv # Dữ liệu sẵn sàng training +│ +├── notebooks/ # Thử nghiệm và khám phá dữ liệu +│ ├── Duc_Notebook.ipynb # CRF + RandomForest +│ ├── Softmax_PhoBERT.ipynb # Softmax +│ +├── src/ # Mã nguồn chính của dự án +│ ├── __init__.py +│ ├── data_loader.py # Nạp và xử lý dữ liệu +│ ├── preprocessing.py # Hàm tiền xử lý dữ liệu +│ ├── model.py # Định nghĩa kiến trúc mô hình +│ ├── train.py # Huấn luyện mô hình +│ ├── evaluate.py # Đánh giá mô hình +│ └── predict.py # Dự đoán với mô hình đã huấn luyện +│ +├── models/ # Mô hình đã lưu sau khi huấn luyện +│ └── best_model.pth # File trọng số mô hình +│ +├── outputs/ # Kết quả, biểu đồ, log, metrics +│ ├── logs/ # Nhật ký huấn luyện (tensorboard/logging) 
+│ └── figures/ # Biểu đồ trực quan hóa +│ +├── configs/ # File cấu hình cho mô hình, huấn luyện +│ └── config.yaml +│ +├── tests/ # Unit test cho các hàm chính +│ +├── requirements.txt # Thư viện cần cài đặt +├── environment.yml # Môi trường Conda +├── README.md # Giới thiệu dự án +└── run.py # Script chính để chạy toàn bộ pipeline +``` + +--- + +## 📚 Additional Resources (Optional) + +If you have any questions about the project structure, consider reading these helpful articles first: + +* [Understanding `__init__.py`](https://zetcode.com/python/init-file/) +* [Markdown Basic Syntax](https://www.markdownguide.org/basic-syntax/#escaping-characters) +* [Difference Between `requirements.txt` and `environment.yml`](https://www.reddit.com/r/learnpython/comments/xvlpdz/why_do_people_provide_a_requirementstxt_or/) + +These resources could be useful for you! diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/__init__.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3b40a81bb18807c24cf7e3f6b74cccb7a202ce4c --- /dev/null +++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/__init__.py @@ -0,0 +1,4 @@ +"""Marks the directory as a Python package.""" +__version__ = "1.0.0" +__author__ = "Duc Lai" +PACKAGE_NAME = "src" diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/app.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/app.py new file mode 100644 index 0000000000000000000000000000000000000000..a5aac4ac6062689b85ee31ac6f2d3f6af0ab74d4 --- /dev/null +++ 
b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/app.py @@ -0,0 +1,64 @@ +import streamlit as st +import pandas as pd +from src.predict import predict_demo +from src.front import render_html + +st.set_page_config(page_title="Vietnamese NER", layout="wide") + +# ===== Tiêu đề chính ===== +st.title("🔍 Ứng dụng nhận diện thực thể có tên (NER) cho tiếng Việt") + +# Tabs +tab1, tab2, tab3 = st.tabs(["📊 Phân tích dữ liệu", "📈 Kết quả huấn luyện", "🧪 Demo mô hình"]) + +# --- Tab 1: PHÂN TÍCH DỮ LIỆU --- +with tab1: + st.header("📊 Phân tích dữ liệu") + + df = pd.DataFrame({ + "Loại thực thể": ["PER", "LOC", "ORG", "MISC"], + "Số lượng": [3200, 2500, 1800, 900] + }) + + st.bar_chart(df.set_index("Loại thực thể")) + +# --- Tab 2: KẾT QUẢ HUẤN LUYỆN --- +with tab2: + st.header("📈 Kết quả huấn luyện") + + loss = [0.9, 0.7, 0.5, 0.35, 0.28] + epoch = [1, 2, 3, 4, 5] + df_loss = pd.DataFrame({"Epoch": epoch, "Loss": loss}) + st.line_chart(df_loss.set_index("Epoch")) + + st.subheader("Đánh giá mô hình") + df_eval = pd.DataFrame({ + "Phiên bản": ["v1", "v2", "v3"], + "F1-score": [0.78, 0.83, 0.86], + "Accuracy": [0.81, 0.85, 0.88] + }) + st.dataframe(df_eval) + +# --- Tab 3: DEMO MÔ HÌNH --- +with tab3: + st.header("🧪 Vietnamese Named Entity Recognition") + + text = st.text_input("Nhập văn bản tiếng Việt:", "Nguyễn Văn A đang làm việc tại Hà Nội") + + if st.button("Phân tích"): + if not text.strip(): + st.warning("Vui lòng nhập văn bản!") + else: + tokens, labels = predict_demo(text) + + st.subheader("Thực thể được phát hiện") + entities = [(tok, lab) for tok, lab in zip(tokens, labels) if lab != "O"] + + if entities: + for tok, lab in entities: + st.markdown(f"🔹 **{tok}** — *{lab}*") + else: + st.info("Không phát hiện thực thể.") + + st.subheader("Highlight trong văn bản:") + st.markdown(render_html(tokens, labels), unsafe_allow_html=True) diff --git 
a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/configs.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/configs.py new file mode 100644 index 0000000000000000000000000000000000000000..95bff5fc5f07392fb8e402fe97dda4d8277a439e --- /dev/null +++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/configs.py @@ -0,0 +1,15 @@ +configs = { + # Init + "project": "NER", + "name": "CRF_VLSP2016_Ultra", + "model": "Linear/CRF", + + # Hyperparameters + "optim": "Adam", + "learning_rate": 1e-3, + "batch_size": 16, + "epochs": 20, + "train_ratio": 0.7, + "val_ratio": 0.15, + "test_ratio": 0.15 +} \ No newline at end of file diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/data_set.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/data_set.py new file mode 100644 index 0000000000000000000000000000000000000000..0f0f2b30c2f67b012f96315d6c2996cd1770ebb8 --- /dev/null +++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/data_set.py @@ -0,0 +1,31 @@ +from torch.utils.data import Dataset +import torch + +class NerDataset(Dataset): + def __init__(self, embeddings, labels): + super().__init__() + self.embeddings = embeddings + self.labels = labels + + def __len__(self): + return len(self.embeddings) + + def __getitem__(self, idx): + return self.embeddings[idx], self.labels[idx] + +def collate_fn(batch): # Batch_size x Seq_length x 768 + embeddings, labels = zip(*batch) + lengths = [e.size(0) for e in embeddings] + max_len = max(lengths) + + padded_embs = torch.stack([ + torch.cat([e, torch.zeros(max_len - e.size(0), 
e.size(1))]) for e in embeddings + ]) + + padded_labels = torch.stack([ + torch.cat([l, torch.full((max_len - l.size(0),), -1, dtype=torch.long)]) for l in labels + ]) + + return padded_embs, padded_labels, lengths + + \ No newline at end of file diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/evaluate.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/evaluate.py new file mode 100644 index 0000000000000000000000000000000000000000..b6d8710c9e0a76664f5ee231b88e6b39c5eadb2f --- /dev/null +++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/evaluate.py @@ -0,0 +1,21 @@ +from src.predict import predict +from sklearn.metrics import precision_recall_fscore_support, accuracy_score, classification_report + +def evaluate(model, loader, count_loss=True, report=False): + + # Model Preidction (Inference) + all_preds, all_true, loss = predict(model, loader, count_loss) + class_report = None + + # Get evaluation metric + precision, recall, f1, _ = precision_recall_fscore_support(all_true, all_preds, average='macro', zero_division=0) + acc = accuracy_score(all_true, all_preds) + + # Get classification report + if report: + class_report = classification_report(all_true, all_preds) + + return precision, recall, f1, acc, loss, class_report + +def evaluate_ignore_O(model, loader): + pass \ No newline at end of file diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/front.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/front.py new file mode 100644 index 0000000000000000000000000000000000000000..19e5496cd17e4d51bbda87236f2112a49a6029b9 --- /dev/null +++ 
b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/front.py @@ -0,0 +1,32 @@ +def render_html(tokens, labels): + """ + Tô màu highlight theo nhãn IOB, với màu khác nhau cho PER, ORG, LOC + """ + label_colors = { + "PER": "lightcoral", # đỏ nhạt + "ORG": "lightblue", # xanh nhạt + "LOC": "lightgreen", # xanh lá nhạt + } + + html = "" + current_label = None + + for tok, label in zip(tokens, labels): + if label.startswith("B-"): + if current_label: + html += " " + current_label = label[2:] + color = label_colors.get(current_label, "lightgray") + html += f"{tok}" + elif label.startswith("I-") and current_label: + html += f" {tok}" + else: + if current_label: + html += " " + current_label = None + html += f"{tok} " + + if current_label: + html += "" + + return f"
{html.strip()}
" diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/model.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/model.py new file mode 100644 index 0000000000000000000000000000000000000000..32ae3ebe5a0cfd7ed2e6306b0c4b2b4d981d3ebf --- /dev/null +++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/model.py @@ -0,0 +1,16 @@ +from torchcrf import CRF +import torch.nn as nn + +class CRF_Tagger(nn.Module): + def __init__(self, input_dim, num_tags): + super().__init__() + self.embed2tag = nn.Linear(input_dim, num_tags) + self.crf = CRF(num_tags, batch_first=True) + + def forward(self, x, labels, mask): + emissions = self.embed2tag(x) + return -self.crf(emissions, labels, mask=mask, reduction="mean") + + def decode(self, x, mask=None): + emissions = self.embed2tag(x) + return self.crf.decode(emissions, mask) \ No newline at end of file diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/predict.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/predict.py new file mode 100644 index 0000000000000000000000000000000000000000..b51d8df95c27414563244261683fa17dab16f004 --- /dev/null +++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/predict.py @@ -0,0 +1,46 @@ +import torch +from src.model import CRF_Tagger +from src.preprocessing import process_demo_sentence + +def predict(model, loader, count_loss=True): + + model.eval() # Evaluation Mode, Ignore Dropout, BatchNorm, ... 
+ all_preds, all_true = [], [] + loss = 0.0 + + with torch.no_grad(): # Stop track gradient + for x, y, _ in loader: + mask = (y != -1) + + # Get loss + if count_loss: + loss += model(x, y, mask).item() + + # Get prediction + preds = model.decode(x, mask) + + # Loop for each sentence in mini-batch + for pred_seq, true_seq, m in zip(preds, y, mask): + true_labels = true_seq[m].tolist() # tensor[mask tensor boolean] + all_preds.extend(pred_seq) + all_true.extend(true_labels) + + return all_preds, all_true, loss/len(loader) + +def predict_demo(text): + + + id_tag = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'} + + x, tokens = process_demo_sentence(text) # 1 x seq_length x 768 + NUM_TAGS = 7 + + model = CRF_Tagger(input_dim=x.size(2), num_tags=NUM_TAGS) + model.load_state_dict(torch.load(".\models\best_epoch_16.pt")) + model.eval() + with torch.no_grad(): + preds = model.decode(x) + + labels = [id_tag[lab] for lab in preds[0]] # preds[0] vì sẽ trả về nhiều batch nhưng chúng ta chỉ có 1 + + return tokens, labels diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/preprocessing.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/preprocessing.py new file mode 100644 index 0000000000000000000000000000000000000000..f452959c94dfdf494c03cc737617f946fb5e3831 --- /dev/null +++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/preprocessing.py @@ -0,0 +1,171 @@ +import pandas as pd +import torch +from transformers import AutoTokenizer, AutoModel +from tqdm import tqdm +from sklearn.model_selection import train_test_split +from src.configs import configs +from pyvi import ViTokenizer + +def join_tokens(tokens): + text = ' '.join(tokens) + return text + +def reform_raw_text(tokens): + text = ' '.join(tokens) + 
return text.replace("_", " ") + +def label(x, ): + id_tag = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'} + return [id_tag[int(i)] for i in x] + +def replace_7_8(lst): + return [0 if x in (7, 8) else x for x in lst] + +# Hàm gộp các embedding vectors của token bị tách ra khi qua SentencePiece +def group_embeddings(tokens, embeddings): + word_embeddings = [] + current_vecs = [] + + for token, emb in zip(tokens, embeddings): + if token in ["", ""]: + continue + + if token.endswith("@@"): + current_vecs.append(emb) + else: + current_vecs.append(emb) + word_emb = torch.mean(torch.stack(current_vecs), dim=0) + word_embeddings.append(word_emb) + current_vecs = [] + + if current_vecs: # Trong trường hợp sót lại cuối câu + word_emb = torch.mean(torch.stack(current_vecs), dim=0) + word_embeddings.append(word_emb) + + return word_embeddings + + +# Download the dataset form Hugging Face +def download_raw_data(): + splits = {'train': 'data/train-00000-of-00001-b0417886a268b83a.parquet', 'valid': 'data/valid-00000-of-00001-846411c236133ba3.parquet'} + df_train = pd.read_parquet("hf://datasets/datnth1709/VLSP2016-NER-data/" + splits["train"]) + df_valid = pd.read_parquet("hf://datasets/datnth1709/VLSP2016-NER-data/" + splits["valid"]) + df = pd.concat([df_train, df_valid]).reset_index(drop=True) + + return df + +# Process dataframe for EDA +def preprocess_data_for_EDA(df): + # Define tag - id + tag_id = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6} + id_tag = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'} + + # Add columns and remove inappropriate tags + df['ner_tags'] = df['ner_tags'].apply(replace_7_8) + df['text_withseg'] = df['tokens'].apply(join_tokens) + df['text_raw'] = df['tokens'].apply(reform_raw_text) + df["ner_labels"] = df.ner_tags.apply(label) + df.columns = ['tokens', 'id_labels', 'seg_text', 'raw_text', 'labels'] + + return df + + + + +def 
load_phoBERT_model_and_tokenizer(): + # Load PhoBERT tokenizer và model + tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base", use_fast=False) + model = AutoModel.from_pretrained("vinai/phobert-base") + model.eval() + return model, tokenizer + + +# Embedding text +def create_embeddings(df, model, tokenizer): + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model.to(device) + + all_embeddings = [] # list of [seq_len_i, 768] tensors + all_labels = [] # list of [seq_len_i,] tensors + remove_index = [] + + for i, row in tqdm(df.iterrows(), total=len(df)): + + # Truy cập phần tử từng dòng + sentence = row['seg_text'] + gold_labels = row["id_labels"] + + # Cho sentence đi qua SentencePiece + input_ids = tokenizer.encode(sentence, return_tensors="pt").to(device) + + tokens = tokenizer.convert_ids_to_tokens(input_ids[0].cpu()) + + # Encode tạo embeddings + with torch.no_grad(): + outputs = model(input_ids) + last_hidden_state = outputs.last_hidden_state.squeeze(0).cpu() + + # Gộp các embeddings đã bị tách khi đi qua SentencePiece + word_embeds = group_embeddings(tokens, last_hidden_state) + + # Kiểm tra số lượng embeddings và số lượng labels, nếu conflict -> xóa dòng đó + if len(word_embeds) != len(gold_labels): + # print(f"Warning: Skip row {i} - length mismatch") + remove_index.append(i) + continue + + # Thêm vào list tổng & Tới đây là data đã sẵn sàng cho training + all_embeddings.append(torch.stack(word_embeds)) + all_labels.append(torch.tensor(gold_labels)) + + # Create Dict + processed_data = { + "embeddings": all_embeddings, + "labels": all_labels + } + + return processed_data + + +def split_dataset(data): + + # Train_Val / Test Split + X_train_val, X_test, Y_train_val, Y_test = train_test_split(data["embeddings"], data["labels"], test_size=configs["test_ratio"], random_state=42) + + # Train / Val Split + val_rest_ratio = configs["val_ratio"] / (configs["val_ratio"] + configs["train_ratio"]) + X_train, X_val, Y_train, Y_val = 
train_test_split(X_train_val, Y_train_val, test_size = val_rest_ratio, random_state=42) + + return X_train, Y_train, X_val, Y_val, X_test, Y_test + + +# TODO: Refactor hàm process_demo_sentence, và hàm predict demo, warning nếu độ dài tokens_word không bằng độ dài sau group_embeddings + +def process_demo_sentence(text): + """ + Trả về tensor shape 1 x Seq_length x 768 + """ + segmented_text = ViTokenizer.tokenize(text) + tokens_word = segmented_text.strip().split(" ") + + model, tokenizer = load_phoBERT_model_and_tokenizer() + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model.to(device) + + input_ids = tokenizer.encode(segmented_text, return_tensors="pt").to(device) + + tokens = tokenizer.convert_ids_to_tokens(input_ids[0].cpu()) + + with torch.no_grad(): + outputs = model(input_ids) + last_hidden_state = outputs.last_hidden_state.squeeze(0).cpu() + + word_embeds = group_embeddings(tokens, last_hidden_state) + + all_embeddings = torch.stack(word_embeds) # seq_length x 768 + + all_embeddings = all_embeddings.unsqueeze(0) # Thêm chiều batch size là 1 -> 1 x seq_length x 768 + + return all_embeddings, tokens_word + diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/torchcrf/__init__.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/torchcrf/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6ff18e484b6008dfab471e1423f78117af8288da --- /dev/null +++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/torchcrf/__init__.py @@ -0,0 +1,340 @@ +__version__ = '0.7.2' + +from typing import List, Optional + +import torch +import torch.nn as nn + + +class CRF(nn.Module): + """Conditional random field. + + This module implements a conditional random field [LMP01]_. 
The forward computation + of this class computes the log likelihood of the given sequence of tags and + emission score tensor. This class also has `~CRF.decode` method which finds + the best tag sequence given an emission score tensor using `Viterbi algorithm`_. + + Args: + num_tags: Number of tags. + batch_first: Whether the first dimension corresponds to the size of a minibatch. + + Attributes: + start_transitions (`~torch.nn.Parameter`): Start transition score tensor of size + ``(num_tags,)``. + end_transitions (`~torch.nn.Parameter`): End transition score tensor of size + ``(num_tags,)``. + transitions (`~torch.nn.Parameter`): Transition score tensor of size + ``(num_tags, num_tags)``. + + + .. [LMP01] Lafferty, J., McCallum, A., Pereira, F. (2001). + "Conditional random fields: Probabilistic models for segmenting and + labeling sequence data". *Proc. 18th International Conf. on Machine + Learning*. Morgan Kaufmann. pp. 282–289. + + .. _Viterbi algorithm: https://en.wikipedia.org/wiki/Viterbi_algorithm + """ + + def __init__(self, num_tags: int, batch_first: bool = False) -> None: + if num_tags <= 0: + raise ValueError(f'invalid number of tags: {num_tags}') + super().__init__() + self.num_tags = num_tags + self.batch_first = batch_first + self.start_transitions = nn.Parameter(torch.empty(num_tags)) + self.end_transitions = nn.Parameter(torch.empty(num_tags)) + self.transitions = nn.Parameter(torch.empty(num_tags, num_tags)) + + self.reset_parameters() + + def reset_parameters(self) -> None: + """Initialize the transition parameters. + + The parameters will be initialized randomly from a uniform distribution + between -0.1 and 0.1. 
+ """ + nn.init.uniform_(self.start_transitions, -0.1, 0.1) + nn.init.uniform_(self.end_transitions, -0.1, 0.1) + nn.init.uniform_(self.transitions, -0.1, 0.1) + + def __repr__(self) -> str: + return f'{self.__class__.__name__}(num_tags={self.num_tags})' + + def forward( + self, + emissions: torch.Tensor, + tags: torch.LongTensor, + mask: Optional[torch.ByteTensor] = None, + reduction: str = 'sum', + ) -> torch.Tensor: + """Compute the conditional log likelihood of a sequence of tags given emission scores. + + Args: + emissions (`~torch.Tensor`): Emission score tensor of size + ``(seq_length, batch_size, num_tags)`` if ``batch_first`` is ``False``, + ``(batch_size, seq_length, num_tags)`` otherwise. + tags (`~torch.LongTensor`): Sequence of tags tensor of size + ``(seq_length, batch_size)`` if ``batch_first`` is ``False``, + ``(batch_size, seq_length)`` otherwise. + mask (`~torch.ByteTensor`): Mask tensor of size ``(seq_length, batch_size)`` + if ``batch_first`` is ``False``, ``(batch_size, seq_length)`` otherwise. + reduction: Specifies the reduction to apply to the output: + ``none|sum|mean|token_mean``. ``none``: no reduction will be applied. + ``sum``: the output will be summed over batches. ``mean``: the output will be + averaged over batches. ``token_mean``: the output will be averaged over tokens. + + Returns: + `~torch.Tensor`: The log likelihood. This will have size ``(batch_size,)`` if + reduction is ``none``, ``()`` otherwise. 
+ """ + self._validate(emissions, tags=tags, mask=mask) + if reduction not in ('none', 'sum', 'mean', 'token_mean'): + raise ValueError(f'invalid reduction: {reduction}') + if mask is None: + mask = torch.ones_like(tags, dtype=torch.uint8) + + if self.batch_first: + emissions = emissions.transpose(0, 1) + tags = tags.transpose(0, 1) + mask = mask.transpose(0, 1) + + # shape: (batch_size,) + numerator = self._compute_score(emissions, tags, mask) + # shape: (batch_size,) + denominator = self._compute_normalizer(emissions, mask) + # shape: (batch_size,) + llh = numerator - denominator + + if reduction == 'none': + return llh + if reduction == 'sum': + return llh.sum() + if reduction == 'mean': + return llh.mean() + assert reduction == 'token_mean' + return llh.sum() / mask.type_as(emissions).sum() + + @torch.jit.export + def decode(self, emissions: torch.Tensor, + mask: Optional[torch.ByteTensor] = None) -> List[List[int]]: + """Find the most likely tag sequence using Viterbi algorithm. + + Args: + emissions (`~torch.Tensor`): Emission score tensor of size + ``(seq_length, batch_size, num_tags)`` if ``batch_first`` is ``False``, + ``(batch_size, seq_length, num_tags)`` otherwise. + mask (`~torch.ByteTensor`): Mask tensor of size ``(seq_length, batch_size)`` + if ``batch_first`` is ``False``, ``(batch_size, seq_length)`` otherwise. + + Returns: + List of list containing the best tag sequence for each batch. 
+ """ + self._validate(emissions, mask=mask) + if mask is None: + mask = emissions.new_ones(emissions.shape[:2], dtype=torch.uint8) + + if self.batch_first: + emissions = emissions.transpose(0, 1) + mask = mask.transpose(0, 1) + + return self._viterbi_decode(emissions, mask) + + def _validate( + self, + emissions: torch.Tensor, + tags: Optional[torch.LongTensor] = None, + mask: Optional[torch.ByteTensor] = None) -> None: + if emissions.dim() != 3: + raise ValueError(f'emissions must have dimension of 3, got {emissions.dim()}') + if emissions.size(2) != self.num_tags: + raise ValueError( + f'expected last dimension of emissions is {self.num_tags}, ' + f'got {emissions.size(2)}') + + if tags is not None: + if emissions.shape[:2] != tags.shape: + raise ValueError( + 'the first two dimensions of emissions and tags must match, ' + f'got {(emissions.shape[0], emissions.shape[1])} and {(tags.shape[0], tags.shape[1])}' + ) + + if mask is not None: + if emissions.shape[:2] != mask.shape: + raise ValueError( + 'the first two dimensions of emissions and mask must match, ' + f'got {(emissions.shape[0], emissions.shape[1])} and {(mask.shape[0], mask.shape[1])}' + ) + no_empty_seq = not self.batch_first and mask[0].all() + no_empty_seq_bf = self.batch_first and mask[:, 0].all() + if not no_empty_seq and not no_empty_seq_bf: + raise ValueError('mask of the first timestep must all be on') + + def _compute_score( + self, emissions: torch.Tensor, tags: torch.LongTensor, + mask: torch.ByteTensor) -> torch.Tensor: + # emissions: (seq_length, batch_size, num_tags) + # tags: (seq_length, batch_size) + # mask: (seq_length, batch_size) + assert emissions.dim() == 3 and tags.dim() == 2 + assert emissions.shape[:2] == tags.shape + assert emissions.size(2) == self.num_tags + assert mask.shape == tags.shape + assert mask[0].all() + + seq_length, batch_size = tags.shape + mask = mask.type_as(emissions) + + # Start transition score and first emission + # shape: (batch_size,) + score = 
self.start_transitions[tags[0]] + score += emissions[0, torch.arange(batch_size), tags[0]] + + for i in range(1, seq_length): + # Transition score to next tag, only added if next timestep is valid (mask == 1) + # shape: (batch_size,) + score += self.transitions[tags[i - 1], tags[i]] * mask[i] + + # Emission score for next tag, only added if next timestep is valid (mask == 1) + # shape: (batch_size,) + score += emissions[i, torch.arange(batch_size), tags[i]] * mask[i] + + # End transition score + # shape: (batch_size,) + seq_ends = mask.long().sum(dim=0) - 1 + # shape: (batch_size,) + last_tags = tags[seq_ends, torch.arange(batch_size)] + # shape: (batch_size,) + score += self.end_transitions[last_tags] + + return score + + def _compute_normalizer( + self, emissions: torch.Tensor, mask: torch.ByteTensor) -> torch.Tensor: + # emissions: (seq_length, batch_size, num_tags) + # mask: (seq_length, batch_size) + assert emissions.dim() == 3 and mask.dim() == 2 + assert emissions.shape[:2] == mask.shape + assert emissions.size(2) == self.num_tags + assert mask[0].all() + + seq_length = emissions.size(0) + + # Start transition score and first emission; score has size of + # (batch_size, num_tags) where for each batch, the j-th column stores + # the score that the first timestep has tag j + # shape: (batch_size, num_tags) + score = self.start_transitions + emissions[0] + + for i in range(1, seq_length): + # Broadcast score for every possible next tag + # shape: (batch_size, num_tags, 1) + broadcast_score = score.unsqueeze(2) + + # Broadcast emission score for every possible current tag + # shape: (batch_size, 1, num_tags) + broadcast_emissions = emissions[i].unsqueeze(1) + + # Compute the score tensor of size (batch_size, num_tags, num_tags) where + # for each sample, entry at row i and column j stores the sum of scores of all + # possible tag sequences so far that end with transitioning from tag i to tag j + # and emitting + # shape: (batch_size, num_tags, num_tags) + 
next_score = broadcast_score + self.transitions + broadcast_emissions + + # Sum over all possible current tags, but we're in score space, so a sum + # becomes a log-sum-exp: for each sample, entry i stores the sum of scores of + # all possible tag sequences so far, that end in tag i + # shape: (batch_size, num_tags) + next_score = torch.logsumexp(next_score, dim=1) + + # Set score to the next score if this timestep is valid (mask == 1) + # shape: (batch_size, num_tags) + score = torch.where(mask[i].unsqueeze(1), next_score, score) + + # End transition score + # shape: (batch_size, num_tags) + score += self.end_transitions + + # Sum (log-sum-exp) over all possible tags + # shape: (batch_size,) + return torch.logsumexp(score, dim=1) + + def _viterbi_decode(self, emissions: torch.FloatTensor, + mask: torch.ByteTensor) -> List[List[int]]: + # emissions: (seq_length, batch_size, num_tags) + # mask: (seq_length, batch_size) + assert emissions.dim() == 3 and mask.dim() == 2 + assert emissions.shape[:2] == mask.shape + assert emissions.size(2) == self.num_tags + assert mask[0].all() + + seq_length, batch_size = mask.shape + + # Start transition and first emission + # shape: (batch_size, num_tags) + score = self.start_transitions + emissions[0] + history: List[torch.Tensor] = [] + + # score is a tensor of size (batch_size, num_tags) where for every batch, + # value at column j stores the score of the best tag sequence so far that ends + # with tag j + # history saves where the best tags candidate transitioned from; this is used + # when we trace back the best tag sequence + + # Viterbi algorithm recursive case: we compute the score of the best tag sequence + # for every possible next tag + for i in range(1, seq_length): + # Broadcast viterbi score for every possible next tag + # shape: (batch_size, num_tags, 1) + broadcast_score = score.unsqueeze(2) + + # Broadcast emission score for every possible current tag + # shape: (batch_size, 1, num_tags) + broadcast_emission = 
emissions[i].unsqueeze(1) + + # Compute the score tensor of size (batch_size, num_tags, num_tags) where + # for each sample, entry at row i and column j stores the score of the best + # tag sequence so far that ends with transitioning from tag i to tag j and emitting + # shape: (batch_size, num_tags, num_tags) + next_score = broadcast_score + self.transitions + broadcast_emission + + # Find the maximum score over all possible current tag + # shape: (batch_size, num_tags) + next_score, indices = next_score.max(dim=1) + + # Set score to the next score if this timestep is valid (mask == 1) + # and save the index that produces the next score + # shape: (batch_size, num_tags) + score = torch.where(mask[i].unsqueeze(1), next_score, score) + history.append(indices) + + # End transition score + # shape: (batch_size, num_tags) + score += self.end_transitions + + # Now, compute the best path for each sample + + # shape: (batch_size,) + seq_ends = mask.long().sum(dim=0) - 1 + best_tags_list: List[List[int]] = [] + + for idx in range(batch_size): + # Find the tag which maximizes the score at the last timestep; this is our best tag + # for the last timestep + _, best_last_tag = score[idx].max(dim=0) + best_tags: List[int] = [] + best_tags.append(best_last_tag.item()) + + # We trace back where the best last tag comes from, append that to our best tag + # sequence, and trace it back again, and so on + # NOTE: reversed() cannot be used here because it is not supported by TorchScript, + # see https://github.com/pytorch/pytorch/issues/31772. 
+ for hist in history[:seq_ends[idx]][::-1]: + best_last_tag = hist[idx][best_tags[-1]] + best_tags.append(best_last_tag.item()) + + # Reverse the order because we start from the last timestep + best_tags.reverse() + best_tags_list.append(best_tags) + + return best_tags_list diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/train.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/train.py new file mode 100644 index 0000000000000000000000000000000000000000..23c93d6f33939b6122723a162be5db96f0067963 --- /dev/null +++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/train.py @@ -0,0 +1,98 @@ +import wandb +from tqdm import tqdm +from src.evaluate import evaluate +import torch + +def train_model(model, optimizer, configs, loaders): + + # Login wandb + wandb.login() + + # Init Wandb for tracking training phase + wandb.init( + project=configs["project"], + name=configs["name"], + config=configs + ) + + # Log gradient of parameter + wandb.watch(model, log="all") + + # Save model checkpoint by best F1 + best_val_f1 = 0.0 + + # Training Loop + for epoch in range(1, configs["epochs"] + 1): + model.train() + total_loss = 0.0 + + # Create progress bar + train_bar = tqdm(loaders['train'], desc=f"Train Epoch {epoch}/{configs['epochs']}") + + for batch_idx, (x, y, _) in enumerate(train_bar, start=1): + mask = (y != -1) + loss = model(x, y, mask) + optimizer.zero_grad() + loss.backward() + optimizer.step() + total_loss += loss.item() + + train_bar.set_postfix(batch_loss=loss.item(), avg_loss=total_loss / batch_idx) + + # Evaluate model after each epoch + avg_train_loss = total_loss / len(loaders['train']) + train_precision, train_recall, train_f1, train_acc, _, _ = evaluate(model, loaders['train'], count_loss=False) + val_precision, val_recall, 
val_f1, val_acc, avg_val_loss, _= evaluate(model, loaders['val'], count_loss=True) + + # Log metric for train and val set + print(f"Epoch {epoch}: train_loss={avg_train_loss:.4f}, train_f1={train_f1:.4f}, val_loss={avg_val_loss:.4f}, val_f1={val_f1:.4f}") + wandb.log({ + + "epoch": epoch, + + # Group: Training metrics + "Train/Loss": avg_train_loss, + "Train/Precision": train_precision, + "Train/Recall": train_recall, + "Train/F1": train_f1, + "Train/Accuracy": train_acc, + + # Group: Validation metrics + "Val/Loss": avg_val_loss, + "Val/Precision": val_precision, + "Val/Recall": val_recall, + "Val/F1": val_f1, + "Val/Accuracy": val_acc + }) + + # Save best model based on val_f1 + if val_f1 > best_val_f1: + best_val_f1 = val_f1 + ckpt_path = f"./models/best_epoch_{epoch}.pt" + torch.save(model.state_dict(), ckpt_path) + wandb.save(ckpt_path) + print(f"Saved imporved model to {ckpt_path}") + + print() + + # Load best model before test + print(f"Loading best model from {ckpt_path} for final evaluation...") + model.load_state_dict(torch.load(ckpt_path)) + print("Done \n") + + + # Log metric for test set + print("Evaluation on test set ...") + test_precision, test_recall, test_f1, test_acc, avg_test_loss, report = evaluate(model, loaders['test'], count_loss=True, report=True) + wandb.log({ + "Test/Loss": avg_test_loss, + "Test/Precision": test_precision, + "Test/Recall": test_recall, + "Test/F1": test_f1, + "Test/Accuracy": test_acc, + }) + print(f"Test_loss={avg_test_loss:.4f}, Test_f1={test_f1:.4f}") + print(report) + + # Finish W&B run + wandb.finish() \ No newline at end of file diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/app.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/app.py index a5aac4ac6062689b85ee31ac6f2d3f6af0ab74d4..b1fce6460c287fa42354153208fa47e387a02509 100644 --- 
a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/app.py +++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/app.py @@ -1,7 +1,10 @@ import streamlit as st import pandas as pd +import plotly.graph_objects as go + from src.predict import predict_demo from src.front import render_html +from results.output import training_log, report_dict, report_dict_2, model_compare, data_compare st.set_page_config(page_title="Vietnamese NER", layout="wide") @@ -24,20 +27,99 @@ with tab1: # --- Tab 2: KẾT QUẢ HUẤN LUYỆN --- with tab2: - st.header("📈 Kết quả huấn luyện") - - loss = [0.9, 0.7, 0.5, 0.35, 0.28] - epoch = [1, 2, 3, 4, 5] - df_loss = pd.DataFrame({"Epoch": epoch, "Loss": loss}) - st.line_chart(df_loss.set_index("Epoch")) - - st.subheader("Đánh giá mô hình") - df_eval = pd.DataFrame({ - "Phiên bản": ["v1", "v2", "v3"], - "F1-score": [0.78, 0.83, 0.86], - "Accuracy": [0.81, 0.85, 0.88] - }) - st.dataframe(df_eval) + st.set_page_config( + page_title="My NER App", + layout="wide", + initial_sidebar_state="expanded" + ) + + # ==== TẠO FIGURES ==== + + # 1️⃣ Loss + fig_loss = go.Figure() + fig_loss.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["train_loss"], + mode='lines+markers', name='Train Loss')) + fig_loss.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["val_loss"], + mode='lines+markers', name='Val Loss')) + fig_loss.update_layout(title="Loss Curve", xaxis_title="Epoch", yaxis_title="Loss") + + # 2️⃣ F1-Score + fig_f1 = go.Figure() + fig_f1.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["train_f1"], + mode='lines+markers', name='Train F1')) + fig_f1.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["val_f1"], + mode='lines+markers', name='Val F1')) + fig_f1.update_layout(title="F1-Score Curve", xaxis_title="Epoch", yaxis_title="F1-Score") + + # 3️⃣ Classification Report Table & Bar + labels = [k for k 
in report_dict.keys() if k not in ["accuracy", "macro avg", "weighted avg"]] + report_data = [[lbl, + report_dict[lbl]["precision"], + report_dict[lbl]["recall"], + report_dict[lbl]["f1-score"]] + for lbl in labels] + df_report = pd.DataFrame(report_data, + columns=["Label", "Precision", "Recall", "F1-Score"]) + + fig_report = go.Figure() + for col in ["Precision", "Recall", "F1-Score"]: + fig_report.add_trace(go.Bar(x=df_report["Label"], y=df_report[col], name=col)) + fig_report.update_layout(barmode='group', + title="Class Report Metrics of PhoBert + CRF", + xaxis_title="Label", yaxis_title="Score", + yaxis=dict(range=[0,1.0])) + + labels2 = [k for k in report_dict_2.keys() if k not in ["accuracy", "macro avg", "weighted avg"]] + report_data2 = [[lbl, + report_dict_2[lbl]["precision"], + report_dict_2[lbl]["recall"], + report_dict_2[lbl]["f1-score"]] + for lbl in labels2] + df_report2 = pd.DataFrame(report_data2, + columns=["Label", "Precision", "Recall", "F1-Score"]) + + fig_report2 = go.Figure() + for col in ["Precision", "Recall", "F1-Score"]: + fig_report2.add_trace(go.Bar(x=df_report2["Label"], y=df_report2[col], name=col)) + fig_report2.update_layout(barmode='group', + title="Class Report Metrics of PhoBert + Softmax", + xaxis_title="Label", yaxis_title="Score", + yaxis=dict(range=[0,1.0])) + + # 4️⃣ Model & Data Comparison Tables + df_model = pd.DataFrame( + [[m, v["F1"], v["Accuracy"]] for m, v in model_compare["Data"].items()], + columns=["Model", "F1-Score", "Accuracy"] + ) + df_data = pd.DataFrame( + [[s, f1] for s, f1 in data_compare["Data"].items()], + columns=["Preprocessing", "F1-Score"] + ) + + # ==== LAYOUT RAO GỌN VỚI COLUMNS ==== + + # Row 1: Loss | F1 + col1, col2 = st.columns(2) + with col1: + st.plotly_chart(fig_loss, use_container_width=True) + with col2: + st.plotly_chart(fig_f1, use_container_width=True) + + # Row 2: Class Report Table | Bar Chart + col3, col4 = st.columns(2) + with col3: + st.plotly_chart(fig_report2, 
use_container_width=True) + with col4: + st.plotly_chart(fig_report, use_container_width=True) + + # Row 3: Model Compare | Data Compare + col5, col6 = st.columns(2) + with col5: + st.markdown("**Model Comparison**") + st.dataframe(df_model, use_container_width=True) + with col6: + st.markdown("**Data Preprocessing Comparison**") + st.dataframe(df_data, use_container_width=True) # --- Tab 3: DEMO MÔ HÌNH --- with tab3: diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/predict.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/predict.py index b51d8df95c27414563244261683fa17dab16f004..11cb1c873c7993c3340f3aeb1500478f33e87ead 100644 --- a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/predict.py +++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/predict.py @@ -36,7 +36,7 @@ def predict_demo(text): NUM_TAGS = 7 model = CRF_Tagger(input_dim=x.size(2), num_tags=NUM_TAGS) - model.load_state_dict(torch.load(".\models\best_epoch_16.pt")) + model.load_state_dict(torch.load("../models/best_epoch_16.pt")) model.eval() with torch.no_grad(): preds = model.decode(x) diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/st.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/st.py new file mode 100644 index 0000000000000000000000000000000000000000..defa69a145c9a292c87b958181ca6e620976abcc --- /dev/null +++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/space/st.py @@ -0,0 +1,98 @@ +import streamlit as st +import pandas as pd +import plotly.graph_objects as go +from results.output import training_log, report_dict, report_dict_2, model_compare, data_compare + 
+st.set_page_config( + page_title="My NER App", + layout="wide", + initial_sidebar_state="expanded" +) + +# ==== TẠO FIGURES ==== + +# 1️⃣ Loss +fig_loss = go.Figure() +fig_loss.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["train_loss"], + mode='lines+markers', name='Train Loss')) +fig_loss.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["val_loss"], + mode='lines+markers', name='Val Loss')) +fig_loss.update_layout(title="Loss Curve", xaxis_title="Epoch", yaxis_title="Loss") + +# 2️⃣ F1-Score +fig_f1 = go.Figure() +fig_f1.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["train_f1"], + mode='lines+markers', name='Train F1')) +fig_f1.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["val_f1"], + mode='lines+markers', name='Val F1')) +fig_f1.update_layout(title="F1-Score Curve", xaxis_title="Epoch", yaxis_title="F1-Score") + +# 3️⃣ Classification Report Table & Bar +labels = [k for k in report_dict.keys() if k not in ["accuracy", "macro avg", "weighted avg"]] +report_data = [[lbl, + report_dict[lbl]["precision"], + report_dict[lbl]["recall"], + report_dict[lbl]["f1-score"]] + for lbl in labels] +df_report = pd.DataFrame(report_data, + columns=["Label", "Precision", "Recall", "F1-Score"]) + +fig_report = go.Figure() +for col in ["Precision", "Recall", "F1-Score"]: + fig_report.add_trace(go.Bar(x=df_report["Label"], y=df_report[col], name=col)) +fig_report.update_layout(barmode='group', + title="Class Report Metrics of PhoBert + CRF", + xaxis_title="Label", yaxis_title="Score", + yaxis=dict(range=[0,1.0])) + +labels2 = [k for k in report_dict_2.keys() if k not in ["accuracy", "macro avg", "weighted avg"]] +report_data2 = [[lbl, + report_dict_2[lbl]["precision"], + report_dict_2[lbl]["recall"], + report_dict_2[lbl]["f1-score"]] + for lbl in labels2] +df_report2 = pd.DataFrame(report_data2, + columns=["Label", "Precision", "Recall", "F1-Score"]) + +fig_report2 = go.Figure() +for col in ["Precision", "Recall", 
"F1-Score"]: + fig_report2.add_trace(go.Bar(x=df_report2["Label"], y=df_report2[col], name=col)) +fig_report2.update_layout(barmode='group', + title="Class Report Metrics of PhoBert + Softmax", + xaxis_title="Label", yaxis_title="Score", + yaxis=dict(range=[0,1.0])) + +# 4️⃣ Model & Data Comparison Tables +df_model = pd.DataFrame( + [[m, v["F1"], v["Accuracy"]] for m, v in model_compare["Data"].items()], + columns=["Model", "F1-Score", "Accuracy"] +) +df_data = pd.DataFrame( + [[s, f1] for s, f1 in data_compare["Data"].items()], + columns=["Preprocessing", "F1-Score"] +) + +# ==== LAYOUT RAO GỌN VỚI COLUMNS ==== + +# Row 1: Loss | F1 +col1, col2 = st.columns(2) +with col1: + st.plotly_chart(fig_loss, use_container_width=True) +with col2: + st.plotly_chart(fig_f1, use_container_width=True) + +# Row 2: Class Report Table | Bar Chart +col3, col4 = st.columns(2) +with col3: + st.plotly_chart(fig_report2, use_container_width=True) +with col4: + st.plotly_chart(fig_report, use_container_width=True) + +# Row 3: Model Compare | Data Compare +col5, col6 = st.columns(2) +with col5: + st.markdown("**Model Comparison**") + st.dataframe(df_model, use_container_width=True) +with col6: + st.markdown("**Data Preprocessing Comparison**") + st.dataframe(df_data, use_container_width=True) diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/predict.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/predict.py index 11cb1c873c7993c3340f3aeb1500478f33e87ead..9b5dc0e11393a3bc5096332856313958b98f0623 100644 --- a/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/predict.py +++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/space/src/predict.py @@ -1,6 +1,7 @@ import torch from src.model import CRF_Tagger from src.preprocessing import process_demo_sentence +import os def predict(model, loader, count_loss=True): @@ -29,6 +30,9 @@ 
def predict(model, loader, count_loss=True): def predict_demo(text): + BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + model_path = os.path.join(BASE_DIR, "models", "best_epoch_16.pt") + id_tag = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'} @@ -36,7 +40,7 @@ def predict_demo(text): NUM_TAGS = 7 model = CRF_Tagger(input_dim=x.size(2), num_tags=NUM_TAGS) - model.load_state_dict(torch.load("../models/best_epoch_16.pt")) + model.load_state_dict(torch.load(model_path)) model.eval() with torch.no_grad(): preds = model.decode(x) diff --git a/space/space/space/space/space/space/space/space/space/space/space/space/space/st.py b/space/space/space/space/space/space/space/space/space/space/space/space/space/st.py index defa69a145c9a292c87b958181ca6e620976abcc..e5615222a010fc945827798960319ffb34d3d90d 100644 --- a/space/space/space/space/space/space/space/space/space/space/space/space/space/st.py +++ b/space/space/space/space/space/space/space/space/space/space/space/space/space/st.py @@ -1,98 +1,7 @@ import streamlit as st -import pandas as pd -import plotly.graph_objects as go -from results.output import training_log, report_dict, report_dict_2, model_compare, data_compare -st.set_page_config( - page_title="My NER App", - layout="wide", - initial_sidebar_state="expanded" -) +# Load ảnh từ file local +st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/W%26B%20Chart%206_18_2025%2C%207_23_58%20PM.png", caption="Ảnh minh hoạ", use_column_width=True) -# ==== TẠO FIGURES ==== - -# 1️⃣ Loss -fig_loss = go.Figure() -fig_loss.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["train_loss"], - mode='lines+markers', name='Train Loss')) -fig_loss.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["val_loss"], - mode='lines+markers', name='Val Loss')) -fig_loss.update_layout(title="Loss Curve", xaxis_title="Epoch", yaxis_title="Loss") - -# 2️⃣ F1-Score -fig_f1 = 
go.Figure() -fig_f1.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["train_f1"], - mode='lines+markers', name='Train F1')) -fig_f1.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["val_f1"], - mode='lines+markers', name='Val F1')) -fig_f1.update_layout(title="F1-Score Curve", xaxis_title="Epoch", yaxis_title="F1-Score") - -# 3️⃣ Classification Report Table & Bar -labels = [k for k in report_dict.keys() if k not in ["accuracy", "macro avg", "weighted avg"]] -report_data = [[lbl, - report_dict[lbl]["precision"], - report_dict[lbl]["recall"], - report_dict[lbl]["f1-score"]] - for lbl in labels] -df_report = pd.DataFrame(report_data, - columns=["Label", "Precision", "Recall", "F1-Score"]) - -fig_report = go.Figure() -for col in ["Precision", "Recall", "F1-Score"]: - fig_report.add_trace(go.Bar(x=df_report["Label"], y=df_report[col], name=col)) -fig_report.update_layout(barmode='group', - title="Class Report Metrics of PhoBert + CRF", - xaxis_title="Label", yaxis_title="Score", - yaxis=dict(range=[0,1.0])) - -labels2 = [k for k in report_dict_2.keys() if k not in ["accuracy", "macro avg", "weighted avg"]] -report_data2 = [[lbl, - report_dict_2[lbl]["precision"], - report_dict_2[lbl]["recall"], - report_dict_2[lbl]["f1-score"]] - for lbl in labels2] -df_report2 = pd.DataFrame(report_data2, - columns=["Label", "Precision", "Recall", "F1-Score"]) - -fig_report2 = go.Figure() -for col in ["Precision", "Recall", "F1-Score"]: - fig_report2.add_trace(go.Bar(x=df_report2["Label"], y=df_report2[col], name=col)) -fig_report2.update_layout(barmode='group', - title="Class Report Metrics of PhoBert + Softmax", - xaxis_title="Label", yaxis_title="Score", - yaxis=dict(range=[0,1.0])) - -# 4️⃣ Model & Data Comparison Tables -df_model = pd.DataFrame( - [[m, v["F1"], v["Accuracy"]] for m, v in model_compare["Data"].items()], - columns=["Model", "F1-Score", "Accuracy"] -) -df_data = pd.DataFrame( - [[s, f1] for s, f1 in data_compare["Data"].items()], - 
columns=["Preprocessing", "F1-Score"] -) - -# ==== LAYOUT RAO GỌN VỚI COLUMNS ==== - -# Row 1: Loss | F1 -col1, col2 = st.columns(2) -with col1: - st.plotly_chart(fig_loss, use_container_width=True) -with col2: - st.plotly_chart(fig_f1, use_container_width=True) - -# Row 2: Class Report Table | Bar Chart -col3, col4 = st.columns(2) -with col3: - st.plotly_chart(fig_report2, use_container_width=True) -with col4: - st.plotly_chart(fig_report, use_container_width=True) - -# Row 3: Model Compare | Data Compare -col5, col6 = st.columns(2) -with col5: - st.markdown("**Model Comparison**") - st.dataframe(df_model, use_container_width=True) -with col6: - st.markdown("**Data Preprocessing Comparison**") - st.dataframe(df_data, use_container_width=True) +# Load ảnh từ URL +st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/W%26B%20Chart%206_18_2025%2C%207_24_20%20PM.png", caption="Ảnh từ URL", use_column_width=True) diff --git a/space/space/space/space/space/space/space/space/space/space/space/src/app.py b/space/space/space/space/space/space/space/space/space/space/space/src/app.py index b1fce6460c287fa42354153208fa47e387a02509..b7795259ae5d8b940e2a57aa9dd90c8ef3de1b4f 100644 --- a/space/space/space/space/space/space/space/space/space/space/space/src/app.py +++ b/space/space/space/space/space/space/space/space/space/space/space/src/app.py @@ -16,14 +16,27 @@ tab1, tab2, tab3 = st.tabs(["📊 Phân tích dữ liệu", "📈 Kết quả hu # --- Tab 1: PHÂN TÍCH DỮ LIỆU --- with tab1: - st.header("📊 Phân tích dữ liệu") - - df = pd.DataFrame({ - "Loại thực thể": ["PER", "LOC", "ORG", "MISC"], - "Số lượng": [3200, 2500, 1800, 900] - }) - - st.bar_chart(df.set_index("Loại thực thể")) + col1, col2 = st.columns(2) + + # ==== Distribution of NER Label Frequency ==== + with col1: + st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ner_freq.png") + + # ==== Distribution of NER Label Frequency (Add crawled data) 
==== + with col2: + st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ner_freq_add.png") + + # ==== Distribution of the Number of Entities per Sentence (0 to 15+) ==== + with col1: + st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ent_dis.png") + + # ==== Distribution of Sentence Lengths ==== + with col2: + st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/sent_len.png") + + # ==== Distribution of Token Lengths ==== + with col1: + st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/token_len.png") # --- Tab 2: KẾT QUẢ HUẤN LUYỆN --- with tab2: diff --git a/space/space/space/space/space/space/space/space/space/space/space/st.py b/space/space/space/space/space/space/space/space/space/space/space/st.py index e5615222a010fc945827798960319ffb34d3d90d..01daf40e4451ef0aa22212fd1ce3233c1e789b9a 100644 --- a/space/space/space/space/space/space/space/space/space/space/space/st.py +++ b/space/space/space/space/space/space/space/space/space/space/space/st.py @@ -1,7 +1,23 @@ import streamlit as st -# Load ảnh từ file local -st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/W%26B%20Chart%206_18_2025%2C%207_23_58%20PM.png", caption="Ảnh minh hoạ", use_column_width=True) +col1, col2 = st.columns(2) -# Load ảnh từ URL -st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/W%26B%20Chart%206_18_2025%2C%207_24_20%20PM.png", caption="Ảnh từ URL", use_column_width=True) +# ==== Distribution of NER Label Frequency ==== +with col1: + st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ner_freq.png") + +# ==== Distribution of NER Label Frequency (Add crawled data) ==== +with col2: + 
st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ner_freq_add.png") + +# ==== Distribution of the Number of Entities per Sentence (0 to 15+) ==== +with col1: + st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ent_dis.png") + +# ==== Distribution of Sentence Lengths ==== +with col2: + st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/sent_len.png") + +# ==== Distribution of Token Lengths ==== +with col1: + st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/token_len.png") diff --git a/space/space/space/space/space/space/space/space/src/app.py b/space/space/space/space/space/space/space/space/src/app.py index b7795259ae5d8b940e2a57aa9dd90c8ef3de1b4f..551ef63e594df2a86963228e3da6183e145d916e 100644 --- a/space/space/space/space/space/space/space/space/src/app.py +++ b/space/space/space/space/space/space/space/space/src/app.py @@ -8,52 +8,57 @@ from results.output import training_log, report_dict, report_dict_2, model_compa st.set_page_config(page_title="Vietnamese NER", layout="wide") -# ===== Tiêu đề chính ===== -st.title("🔍 Ứng dụng nhận diện thực thể có tên (NER) cho tiếng Việt") +# ===== Main Title ===== +st.title("🔍 Vietnamese Named Entity Recognition (NER) Application") # Tabs -tab1, tab2, tab3 = st.tabs(["📊 Phân tích dữ liệu", "📈 Kết quả huấn luyện", "🧪 Demo mô hình"]) +tab1, tab2, tab3 = st.tabs(["📊 Data Analysis", "📈 Training Results", "🧪 Model Demo"]) -# --- Tab 1: PHÂN TÍCH DỮ LIỆU --- +# --- Tab 1: DATA ANALYSIS --- with tab1: col1, col2 = st.columns(2) # ==== Distribution of NER Label Frequency ==== with col1: - st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ner_freq.png") + st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ner_freq.png", + caption="NER Label Frequency Distribution") # ==== 
Distribution of NER Label Frequency (Add crawled data) ==== with col2: - st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ner_freq_add.png") + st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ner_freq_add.png", + caption="NER Label Frequency (Extended with Crawled Data)") # ==== Distribution of the Number of Entities per Sentence (0 to 15+) ==== with col1: - st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ent_dis.png") + st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ent_dis.png", + caption="Number of Entities per Sentence") # ==== Distribution of Sentence Lengths ==== with col2: - st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/sent_len.png") + st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/sent_len.png", + caption="Sentence Length Distribution") # ==== Distribution of Token Lengths ==== with col1: - st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/token_len.png") + st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/token_len.png", + caption="Token Length Distribution") -# --- Tab 2: KẾT QUẢ HUẤN LUYỆN --- +# --- Tab 2: TRAINING RESULTS --- with tab2: st.set_page_config( - page_title="My NER App", + page_title="Vietnamese NER", layout="wide", initial_sidebar_state="expanded" ) - # ==== TẠO FIGURES ==== + # ==== CREATE FIGURES ==== # 1️⃣ Loss fig_loss = go.Figure() fig_loss.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["train_loss"], - mode='lines+markers', name='Train Loss')) + mode='lines+markers', name='Train Loss')) fig_loss.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["val_loss"], - mode='lines+markers', name='Val Loss')) + mode='lines+markers', name='Validation Loss')) 
fig_loss.update_layout(title="Loss Curve", xaxis_title="Epoch", yaxis_title="Loss") # 2️⃣ F1-Score @@ -61,7 +66,7 @@ with tab2: fig_f1.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["train_f1"], mode='lines+markers', name='Train F1')) fig_f1.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["val_f1"], - mode='lines+markers', name='Val F1')) + mode='lines+markers', name='Validation F1')) fig_f1.update_layout(title="F1-Score Curve", xaxis_title="Epoch", yaxis_title="F1-Score") # 3️⃣ Classification Report Table & Bar @@ -70,34 +75,34 @@ with tab2: report_dict[lbl]["precision"], report_dict[lbl]["recall"], report_dict[lbl]["f1-score"]] - for lbl in labels] + for lbl in labels] df_report = pd.DataFrame(report_data, - columns=["Label", "Precision", "Recall", "F1-Score"]) + columns=["Label", "Precision", "Recall", "F1-Score"]) fig_report = go.Figure() for col in ["Precision", "Recall", "F1-Score"]: fig_report.add_trace(go.Bar(x=df_report["Label"], y=df_report[col], name=col)) fig_report.update_layout(barmode='group', - title="Class Report Metrics of PhoBert + CRF", - xaxis_title="Label", yaxis_title="Score", - yaxis=dict(range=[0,1.0])) + title="Class Metrics: PhoBERT + CRF", + xaxis_title="Label", yaxis_title="Score", + yaxis=dict(range=[0, 1.0])) labels2 = [k for k in report_dict_2.keys() if k not in ["accuracy", "macro avg", "weighted avg"]] report_data2 = [[lbl, - report_dict_2[lbl]["precision"], - report_dict_2[lbl]["recall"], - report_dict_2[lbl]["f1-score"]] + report_dict_2[lbl]["precision"], + report_dict_2[lbl]["recall"], + report_dict_2[lbl]["f1-score"]] for lbl in labels2] df_report2 = pd.DataFrame(report_data2, - columns=["Label", "Precision", "Recall", "F1-Score"]) + columns=["Label", "Precision", "Recall", "F1-Score"]) fig_report2 = go.Figure() for col in ["Precision", "Recall", "F1-Score"]: fig_report2.add_trace(go.Bar(x=df_report2["Label"], y=df_report2[col], name=col)) fig_report2.update_layout(barmode='group', - title="Class 
Report Metrics of PhoBert + Softmax", - xaxis_title="Label", yaxis_title="Score", - yaxis=dict(range=[0,1.0])) + title="Class Metrics: PhoBERT + Softmax", + xaxis_title="Label", yaxis_title="Score", + yaxis=dict(range=[0, 1.0])) # 4️⃣ Model & Data Comparison Tables df_model = pd.DataFrame( @@ -109,7 +114,7 @@ with tab2: columns=["Preprocessing", "F1-Score"] ) - # ==== LAYOUT RAO GỌN VỚI COLUMNS ==== + # ==== CLEAN LAYOUT WITH COLUMNS ==== # Row 1: Loss | F1 col1, col2 = st.columns(2) @@ -134,26 +139,26 @@ with tab2: st.markdown("**Data Preprocessing Comparison**") st.dataframe(df_data, use_container_width=True) -# --- Tab 3: DEMO MÔ HÌNH --- +# --- Tab 3: MODEL DEMO --- with tab3: - st.header("🧪 Vietnamese Named Entity Recognition") + st.header("🧪 Vietnamese Named Entity Recognition Demo") - text = st.text_input("Nhập văn bản tiếng Việt:", "Nguyễn Văn A đang làm việc tại Hà Nội") + text = st.text_input("Enter Vietnamese text:", "Nguyễn Văn A đang làm việc tại Hà Nội") - if st.button("Phân tích"): + if st.button("Analyze"): if not text.strip(): - st.warning("Vui lòng nhập văn bản!") + st.warning("Please enter some text!") else: tokens, labels = predict_demo(text) - st.subheader("Thực thể được phát hiện") + st.subheader("Detected Entities") entities = [(tok, lab) for tok, lab in zip(tokens, labels) if lab != "O"] if entities: for tok, lab in entities: st.markdown(f"🔹 **{tok}** — *{lab}*") else: - st.info("Không phát hiện thực thể.") + st.info("No named entities detected.") - st.subheader("Highlight trong văn bản:") + st.subheader("Highlighted Text") st.markdown(render_html(tokens, labels), unsafe_allow_html=True) diff --git a/space/space/space/space/space/space/space/src/app.py b/space/space/space/space/space/space/space/src/app.py index 551ef63e594df2a86963228e3da6183e145d916e..36c094a68bc2f1b22de36c23879fb4b0d36bf7d9 100644 --- a/space/space/space/space/space/space/space/src/app.py +++ b/space/space/space/space/space/space/space/src/app.py @@ -9,7 +9,7 @@ from 
results.output import training_log, report_dict, report_dict_2, model_compa st.set_page_config(page_title="Vietnamese NER", layout="wide") # ===== Main Title ===== -st.title("🔍 Vietnamese Named Entity Recognition (NER) Application") +st.title("🔍 Vietnamese Named Entity Recognition Demo") # Tabs tab1, tab2, tab3 = st.tabs(["📊 Data Analysis", "📈 Training Results", "🧪 Model Demo"]) @@ -20,28 +20,23 @@ with tab1: # ==== Distribution of NER Label Frequency ==== with col1: - st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ner_freq.png", - caption="NER Label Frequency Distribution") + st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ner_freq.png") # ==== Distribution of NER Label Frequency (Add crawled data) ==== with col2: - st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ner_freq_add.png", - caption="NER Label Frequency (Extended with Crawled Data)") + st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ner_freq_add.png") # ==== Distribution of the Number of Entities per Sentence (0 to 15+) ==== with col1: - st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ent_dis.png", - caption="Number of Entities per Sentence") + st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ent_dis.png") # ==== Distribution of Sentence Lengths ==== with col2: - st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/sent_len.png", - caption="Sentence Length Distribution") + st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/sent_len.png") # ==== Distribution of Token Lengths ==== with col1: - st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/token_len.png", - caption="Token Length Distribution") + 
st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/token_len.png") # --- Tab 2: TRAINING RESULTS --- with tab2: @@ -141,8 +136,6 @@ with tab2: # --- Tab 3: MODEL DEMO --- with tab3: - st.header("🧪 Vietnamese Named Entity Recognition Demo") - text = st.text_input("Enter Vietnamese text:", "Nguyễn Văn A đang làm việc tại Hà Nội") if st.button("Analyze"): diff --git a/space/space/space/space/space/src/app.py b/space/space/space/space/space/src/app.py index 36c094a68bc2f1b22de36c23879fb4b0d36bf7d9..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 100644 --- a/space/space/space/space/space/src/app.py +++ b/space/space/space/space/space/src/app.py @@ -1,157 +0,0 @@ -import streamlit as st -import pandas as pd -import plotly.graph_objects as go - -from src.predict import predict_demo -from src.front import render_html -from results.output import training_log, report_dict, report_dict_2, model_compare, data_compare - -st.set_page_config(page_title="Vietnamese NER", layout="wide") - -# ===== Main Title ===== -st.title("🔍 Vietnamese Named Entity Recognition Demo") - -# Tabs -tab1, tab2, tab3 = st.tabs(["📊 Data Analysis", "📈 Training Results", "🧪 Model Demo"]) - -# --- Tab 1: DATA ANALYSIS --- -with tab1: - col1, col2 = st.columns(2) - - # ==== Distribution of NER Label Frequency ==== - with col1: - st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ner_freq.png") - - # ==== Distribution of NER Label Frequency (Add crawled data) ==== - with col2: - st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ner_freq_add.png") - - # ==== Distribution of the Number of Entities per Sentence (0 to 15+) ==== - with col1: - st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ent_dis.png") - - # ==== Distribution of Sentence Lengths ==== - with col2: - 
st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/sent_len.png") - - # ==== Distribution of Token Lengths ==== - with col1: - st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/token_len.png") - -# --- Tab 2: TRAINING RESULTS --- -with tab2: - st.set_page_config( - page_title="Vietnamese NER", - layout="wide", - initial_sidebar_state="expanded" - ) - - # ==== CREATE FIGURES ==== - - # 1️⃣ Loss - fig_loss = go.Figure() - fig_loss.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["train_loss"], - mode='lines+markers', name='Train Loss')) - fig_loss.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["val_loss"], - mode='lines+markers', name='Validation Loss')) - fig_loss.update_layout(title="Loss Curve", xaxis_title="Epoch", yaxis_title="Loss") - - # 2️⃣ F1-Score - fig_f1 = go.Figure() - fig_f1.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["train_f1"], - mode='lines+markers', name='Train F1')) - fig_f1.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["val_f1"], - mode='lines+markers', name='Validation F1')) - fig_f1.update_layout(title="F1-Score Curve", xaxis_title="Epoch", yaxis_title="F1-Score") - - # 3️⃣ Classification Report Table & Bar - labels = [k for k in report_dict.keys() if k not in ["accuracy", "macro avg", "weighted avg"]] - report_data = [[lbl, - report_dict[lbl]["precision"], - report_dict[lbl]["recall"], - report_dict[lbl]["f1-score"]] - for lbl in labels] - df_report = pd.DataFrame(report_data, - columns=["Label", "Precision", "Recall", "F1-Score"]) - - fig_report = go.Figure() - for col in ["Precision", "Recall", "F1-Score"]: - fig_report.add_trace(go.Bar(x=df_report["Label"], y=df_report[col], name=col)) - fig_report.update_layout(barmode='group', - title="Class Metrics: PhoBERT + CRF", - xaxis_title="Label", yaxis_title="Score", - yaxis=dict(range=[0, 1.0])) - - labels2 = [k for k in report_dict_2.keys() if k not in 
["accuracy", "macro avg", "weighted avg"]] - report_data2 = [[lbl, - report_dict_2[lbl]["precision"], - report_dict_2[lbl]["recall"], - report_dict_2[lbl]["f1-score"]] - for lbl in labels2] - df_report2 = pd.DataFrame(report_data2, - columns=["Label", "Precision", "Recall", "F1-Score"]) - - fig_report2 = go.Figure() - for col in ["Precision", "Recall", "F1-Score"]: - fig_report2.add_trace(go.Bar(x=df_report2["Label"], y=df_report2[col], name=col)) - fig_report2.update_layout(barmode='group', - title="Class Metrics: PhoBERT + Softmax", - xaxis_title="Label", yaxis_title="Score", - yaxis=dict(range=[0, 1.0])) - - # 4️⃣ Model & Data Comparison Tables - df_model = pd.DataFrame( - [[m, v["F1"], v["Accuracy"]] for m, v in model_compare["Data"].items()], - columns=["Model", "F1-Score", "Accuracy"] - ) - df_data = pd.DataFrame( - [[s, f1] for s, f1 in data_compare["Data"].items()], - columns=["Preprocessing", "F1-Score"] - ) - - # ==== CLEAN LAYOUT WITH COLUMNS ==== - - # Row 1: Loss | F1 - col1, col2 = st.columns(2) - with col1: - st.plotly_chart(fig_loss, use_container_width=True) - with col2: - st.plotly_chart(fig_f1, use_container_width=True) - - # Row 2: Class Report Table | Bar Chart - col3, col4 = st.columns(2) - with col3: - st.plotly_chart(fig_report2, use_container_width=True) - with col4: - st.plotly_chart(fig_report, use_container_width=True) - - # Row 3: Model Compare | Data Compare - col5, col6 = st.columns(2) - with col5: - st.markdown("**Model Comparison**") - st.dataframe(df_model, use_container_width=True) - with col6: - st.markdown("**Data Preprocessing Comparison**") - st.dataframe(df_data, use_container_width=True) - -# --- Tab 3: MODEL DEMO --- -with tab3: - text = st.text_input("Enter Vietnamese text:", "Nguyễn Văn A đang làm việc tại Hà Nội") - - if st.button("Analyze"): - if not text.strip(): - st.warning("Please enter some text!") - else: - tokens, labels = predict_demo(text) - - st.subheader("Detected Entities") - entities = [(tok, lab) for tok, 
lab in zip(tokens, labels) if lab != "O"] - - if entities: - for tok, lab in entities: - st.markdown(f"🔹 **{tok}** — *{lab}*") - else: - st.info("No named entities detected.") - - st.subheader("Highlighted Text") - st.markdown(render_html(tokens, labels), unsafe_allow_html=True) diff --git a/space/space/space/space/space/src/predict.py b/space/space/space/space/space/src/predict.py index 9b5dc0e11393a3bc5096332856313958b98f0623..24f21a6df7afa2ed9eb43ffbd2e249a6d1dfd91d 100644 --- a/space/space/space/space/space/src/predict.py +++ b/space/space/space/space/space/src/predict.py @@ -1,8 +1,6 @@ import torch -from src.model import CRF_Tagger -from src.preprocessing import process_demo_sentence -import os - +from model import CRF_Tagger +from preprocessing import process_demo_sentence def predict(model, loader, count_loss=True): model.eval() # Evaluation Mode, Ignore Dropout, BatchNorm, ... @@ -30,9 +28,6 @@ def predict(model, loader, count_loss=True): def predict_demo(text): - BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - model_path = os.path.join(BASE_DIR, "models", "best_epoch_16.pt") - id_tag = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'} @@ -40,7 +35,7 @@ def predict_demo(text): NUM_TAGS = 7 model = CRF_Tagger(input_dim=x.size(2), num_tags=NUM_TAGS) - model.load_state_dict(torch.load(model_path)) + model.load_state_dict(torch.load("models/best_epoch_16.pt")) model.eval() with torch.no_grad(): preds = model.decode(x) diff --git a/space/space/space/space/space/src/preprocessing.py b/space/space/space/space/space/src/preprocessing.py index f452959c94dfdf494c03cc737617f946fb5e3831..0fb86551b048dde3f26dae590cf1992addc07d8f 100644 --- a/space/space/space/space/space/src/preprocessing.py +++ b/space/space/space/space/space/src/preprocessing.py @@ -3,7 +3,7 @@ import torch from transformers import AutoTokenizer, AutoModel from tqdm import tqdm from sklearn.model_selection import train_test_split -from 
src.configs import configs +from configs import configs from pyvi import ViTokenizer def join_tokens(tokens): diff --git a/space/space/space/space/space/src/templates/demo.html b/space/space/space/space/space/src/templates/demo.html new file mode 100644 index 0000000000000000000000000000000000000000..d9ec1afeb3bff9989ec501398c4a5581ac7a429c --- /dev/null +++ b/space/space/space/space/space/src/templates/demo.html @@ -0,0 +1,349 @@ + + + + + + Model Demo + + + + + + + +
+ + + +
+ +
+ +
+ + + +
+ +
+ +
+ +
+
+
+ + +
+ + +
+ +
+ +
+ + +
+ + Characters: 38 / 300 + + + Words: 7 + +
+ + +
+ +
+ + + + +
+ +
+ + + + +
+
+
+ + \ No newline at end of file diff --git a/space/space/space/space/space/src/train.py b/space/space/space/space/space/src/train.py index 23c93d6f33939b6122723a162be5db96f0067963..4da3acb0e83e1cf34994e686c32612014a7698c0 100644 --- a/space/space/space/space/space/src/train.py +++ b/space/space/space/space/space/src/train.py @@ -1,6 +1,6 @@ import wandb from tqdm import tqdm -from src.evaluate import evaluate +from evaluate import evaluate import torch def train_model(model, optimizer, configs, loaders): diff --git a/space/space/space/space/src/app.py b/space/space/space/space/src/app.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..a665544c3da792692f75701508f89492e689adc9 100644 --- a/space/space/space/space/src/app.py +++ b/space/space/space/space/src/app.py @@ -0,0 +1,32 @@ +from flask import Flask, render_template, request, jsonify +from flask_cors import CORS +from predict import predict_demo +from front import render_html +import os + +app = Flask(__name__, template_folder=os.path.join(os.path.dirname(__file__), 'templates')) +CORS(app) +@app.route('/') +def index(): + return render_template('demo.html') + +@app.route('/predict', methods=['POST']) +def predict(): + try: + data = request.get_json() + print("Received data:", data) + text = data.get('text', '') + print("Text:", text) + if not text.strip(): + return jsonify({'error': 'No text provided.'}), 400 + tokens, labels = predict_demo(text) + print("Tokens:", tokens) + print("Labels:", labels) + html_result = render_html(tokens, labels) + print("HTML Result:", html_result) + return jsonify({'tokens': tokens, 'labels': labels, 'html_result': html_result}) + except Exception as e: + print("Exception:", e) + return jsonify({'error': str(e)}), 500 +if __name__ == '__main__': + app.run(host='0.0.0.0', port=5000, debug=True) diff --git a/space/space/src/app.py b/space/space/src/app.py index a665544c3da792692f75701508f89492e689adc9..7e2340b2fce6f9609bbf024a9fdbe6ebc0a7167b 100644 --- a/space/space/src/app.py 
+++ b/space/space/src/app.py @@ -1,32 +1,154 @@ -from flask import Flask, render_template, request, jsonify -from flask_cors import CORS -from predict import predict_demo -from front import render_html -import os - -app = Flask(__name__, template_folder=os.path.join(os.path.dirname(__file__), 'templates')) -CORS(app) -@app.route('/') -def index(): - return render_template('demo.html') - -@app.route('/predict', methods=['POST']) -def predict(): - try: - data = request.get_json() - print("Received data:", data) - text = data.get('text', '') - print("Text:", text) - if not text.strip(): - return jsonify({'error': 'No text provided.'}), 400 - tokens, labels = predict_demo(text) - print("Tokens:", tokens) - print("Labels:", labels) - html_result = render_html(tokens, labels) - print("HTML Result:", html_result) - return jsonify({'tokens': tokens, 'labels': labels, 'html_result': html_result}) - except Exception as e: - print("Exception:", e) - return jsonify({'error': str(e)}), 500 -if __name__ == '__main__': - app.run(host='0.0.0.0', port=5000, debug=True) +import streamlit as st +import pandas as pd +import plotly.graph_objects as go + +from src.predict import predict_demo +from src.front import render_html +from results.output import training_log, report_dict, report_dict_2, model_compare, data_compare + +st.set_page_config(page_title="Vietnamese NER", layout="wide") + +# ===== Main Title ===== +st.title("🔍 Vietnamese Named Entity Recognition Demo") + +# Tabs +tab1, tab2, tab3 = st.tabs(["📊 Data Analysis", "📈 Training Results", "🧪 Model Demo"]) + +# --- Tab 1: DATA ANALYSIS --- +with tab1: + col1, col2 = st.columns(2) + + # ==== Distribution of NER Label Frequency ==== + with col1: + st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ner_freq.png") + + # ==== Distribution of NER Label Frequency (Add crawled data) ==== + with col2: + 
st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ner_freq_add.png") + + # ==== Distribution of the Number of Entities per Sentence (0 to 15+) ==== + with col1: + st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ent_dis.png") + + # ==== Distribution of Sentence Lengths ==== + with col2: + st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/sent_len.png") + + # ==== Distribution of Token Lengths ==== + with col1: + st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/token_len.png") + +# --- Tab 2: TRAINING RESULTS --- +with tab2: + st.set_page_config( + page_title="Vietnamese NER", + layout="wide", + initial_sidebar_state="expanded" + ) + + # ==== CREATE FIGURES ==== + + # 1️⃣ Loss + fig_loss = go.Figure() + fig_loss.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["train_loss"], + mode='lines+markers', name='Train Loss')) + fig_loss.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["val_loss"], + mode='lines+markers', name='Validation Loss')) + fig_loss.update_layout(title="Loss Curve", xaxis_title="Epoch", yaxis_title="Loss") + + # 2️⃣ F1-Score + fig_f1 = go.Figure() + fig_f1.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["train_f1"], + mode='lines+markers', name='Train F1')) + fig_f1.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["val_f1"], + mode='lines+markers', name='Validation F1')) + fig_f1.update_layout(title="F1-Score Curve", xaxis_title="Epoch", yaxis_title="F1-Score") + + # 3️⃣ Classification Report Table & Bar + labels = [k for k in report_dict.keys() if k not in ["accuracy", "macro avg", "weighted avg"]] + report_data = [[lbl, + report_dict[lbl]["precision"], + report_dict[lbl]["recall"], + report_dict[lbl]["f1-score"]] + for lbl in labels] + df_report = pd.DataFrame(report_data, + columns=["Label", "Precision", "Recall", 
"F1-Score"]) + + fig_report = go.Figure() + for col in ["Precision", "Recall", "F1-Score"]: + fig_report.add_trace(go.Bar(x=df_report["Label"], y=df_report[col], name=col)) + fig_report.update_layout(barmode='group', + title="Class Metrics: PhoBERT + CRF", + xaxis_title="Label", yaxis_title="Score", + yaxis=dict(range=[0, 1.0])) + + labels2 = [k for k in report_dict_2.keys() if k not in ["accuracy", "macro avg", "weighted avg"]] + report_data2 = [[lbl, + report_dict_2[lbl]["precision"], + report_dict_2[lbl]["recall"], + report_dict_2[lbl]["f1-score"]] + for lbl in labels2] + df_report2 = pd.DataFrame(report_data2, + columns=["Label", "Precision", "Recall", "F1-Score"]) + + fig_report2 = go.Figure() + for col in ["Precision", "Recall", "F1-Score"]: + fig_report2.add_trace(go.Bar(x=df_report2["Label"], y=df_report2[col], name=col)) + fig_report2.update_layout(barmode='group', + title="Class Metrics: PhoBERT + Softmax", + xaxis_title="Label", yaxis_title="Score", + yaxis=dict(range=[0, 1.0])) + + # 4️⃣ Model & Data Comparison Tables + df_model = pd.DataFrame( + [[m, v["F1"], v["Accuracy"]] for m, v in model_compare["Data"].items()], + columns=["Model", "F1-Score", "Accuracy"] + ) + df_data = pd.DataFrame( + [[s, f1] for s, f1 in data_compare["Data"].items()], + columns=["Preprocessing", "F1-Score"] + ) + + # ==== CLEAN LAYOUT WITH COLUMNS ==== + + # Row 1: Loss | F1 + col1, col2 = st.columns(2) + with col1: + st.plotly_chart(fig_loss, use_container_width=True) + with col2: + st.plotly_chart(fig_f1, use_container_width=True) + + # Row 2: Class Report Table | Bar Chart + col3, col4 = st.columns(2) + with col3: + st.plotly_chart(fig_report2, use_container_width=True) + with col4: + st.plotly_chart(fig_report, use_container_width=True) + + # Row 3: Model Compare | Data Compare + col5, col6 = st.columns(2) + with col5: + st.markdown("**Model Comparison**") + st.dataframe(df_model, use_container_width=True) + with col6: + st.markdown("**Data Preprocessing Comparison**") + 
st.dataframe(df_data, use_container_width=True) + +# --- Tab 3: MODEL DEMO --- +with tab3: + text = st.text_input("Enter Vietnamese text:", "Nguyễn Văn A đang làm việc tại Hà Nội") + + if st.button("Analyze"): + if not text.strip(): + return jsonify({'error': 'No text provided.'}), 400 + tokens, labels = predict_demo(text) + print("Tokens:", tokens) + print("Labels:", labels) + html_result = render_html(tokens, labels) + print("HTML Result:", html_result) + return jsonify({'tokens': tokens, 'labels': labels, 'html_result': html_result}) + except Exception as e: + print("Exception:", e) + return jsonify({'error': str(e)}), 500 +if __name__ == '__main__': + app.run(host='0.0.0.0', port=5000, debug=True) diff --git a/space/src/app.py b/space/src/app.py index 7e2340b2fce6f9609bbf024a9fdbe6ebc0a7167b..246d52a5dc9ab9ef1beb03f6c4dcfc7f4a8e026b 100644 --- a/space/src/app.py +++ b/space/src/app.py @@ -9,7 +9,7 @@ from results.output import training_log, report_dict, report_dict_2, model_compa st.set_page_config(page_title="Vietnamese NER", layout="wide") # ===== Main Title ===== -st.title("🔍 Vietnamese Named Entity Recognition Demo") +st.title("🔍 Vietnamese Named Entity Recognition (NER) Application") # Tabs tab1, tab2, tab3 = st.tabs(["📊 Data Analysis", "📈 Training Results", "🧪 Model Demo"]) @@ -20,23 +20,28 @@ with tab1: # ==== Distribution of NER Label Frequency ==== with col1: - st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ner_freq.png") + st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ner_freq.png", + caption="NER Label Frequency Distribution") # ==== Distribution of NER Label Frequency (Add crawled data) ==== with col2: - st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ner_freq_add.png") + st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ner_freq_add.png", + caption="NER Label 
Frequency (Extended with Crawled Data)") # ==== Distribution of the Number of Entities per Sentence (0 to 15+) ==== with col1: - st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ent_dis.png") + st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ent_dis.png", + caption="Number of Entities per Sentence") # ==== Distribution of Sentence Lengths ==== with col2: - st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/sent_len.png") + st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/sent_len.png", + caption="Sentence Length Distribution") # ==== Distribution of Token Lengths ==== with col1: - st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/token_len.png") + st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/token_len.png", + caption="Token Length Distribution") # --- Tab 2: TRAINING RESULTS --- with tab2: @@ -136,6 +141,8 @@ with tab2: # --- Tab 3: MODEL DEMO --- with tab3: + st.header("🧪 Vietnamese Named Entity Recognition Demo") + text = st.text_input("Enter Vietnamese text:", "Nguyễn Văn A đang làm việc tại Hà Nội") if st.button("Analyze"): diff --git a/src/app.py b/src/app.py index 246d52a5dc9ab9ef1beb03f6c4dcfc7f4a8e026b..36c094a68bc2f1b22de36c23879fb4b0d36bf7d9 100644 --- a/src/app.py +++ b/src/app.py @@ -9,7 +9,7 @@ from results.output import training_log, report_dict, report_dict_2, model_compa st.set_page_config(page_title="Vietnamese NER", layout="wide") # ===== Main Title ===== -st.title("🔍 Vietnamese Named Entity Recognition (NER) Application") +st.title("🔍 Vietnamese Named Entity Recognition Demo") # Tabs tab1, tab2, tab3 = st.tabs(["📊 Data Analysis", "📈 Training Results", "🧪 Model Demo"]) @@ -20,28 +20,23 @@ with tab1: # ==== Distribution of NER Label Frequency ==== with col1: - 
st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ner_freq.png", - caption="NER Label Frequency Distribution") + st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ner_freq.png") # ==== Distribution of NER Label Frequency (Add crawled data) ==== with col2: - st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ner_freq_add.png", - caption="NER Label Frequency (Extended with Crawled Data)") + st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ner_freq_add.png") # ==== Distribution of the Number of Entities per Sentence (0 to 15+) ==== with col1: - st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ent_dis.png", - caption="Number of Entities per Sentence") + st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/ent_dis.png") # ==== Distribution of Sentence Lengths ==== with col2: - st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/sent_len.png", - caption="Sentence Length Distribution") + st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/sent_len.png") # ==== Distribution of Token Lengths ==== with col1: - st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/token_len.png", - caption="Token Length Distribution") + st.image("https://raw.githubusercontent.com/duclld1709/vietnamese-ner/refs/heads/main/results/token_len.png") # --- Tab 2: TRAINING RESULTS --- with tab2: @@ -141,21 +136,22 @@ with tab2: # --- Tab 3: MODEL DEMO --- with tab3: - st.header("🧪 Vietnamese Named Entity Recognition Demo") - text = st.text_input("Enter Vietnamese text:", "Nguyễn Văn A đang làm việc tại Hà Nội") if st.button("Analyze"): if not text.strip(): - return jsonify({'error': 'No text provided.'}), 400 - tokens, 
labels = predict_demo(text) - print("Tokens:", tokens) - print("Labels:", labels) - html_result = render_html(tokens, labels) - print("HTML Result:", html_result) - return jsonify({'tokens': tokens, 'labels': labels, 'html_result': html_result}) - except Exception as e: - print("Exception:", e) - return jsonify({'error': str(e)}), 500 -if __name__ == '__main__': - app.run(host='0.0.0.0', port=5000, debug=True) + st.warning("Please enter some text!") + else: + tokens, labels = predict_demo(text) + + st.subheader("Detected Entities") + entities = [(tok, lab) for tok, lab in zip(tokens, labels) if lab != "O"] + + if entities: + for tok, lab in entities: + st.markdown(f"🔹 **{tok}** — *{lab}*") + else: + st.info("No named entities detected.") + + st.subheader("Highlighted Text") + st.markdown(render_html(tokens, labels), unsafe_allow_html=True) diff --git a/src/predict.py b/src/predict.py index 24f21a6df7afa2ed9eb43ffbd2e249a6d1dfd91d..9b5dc0e11393a3bc5096332856313958b98f0623 100644 --- a/src/predict.py +++ b/src/predict.py @@ -1,6 +1,8 @@ import torch -from model import CRF_Tagger -from preprocessing import process_demo_sentence +from src.model import CRF_Tagger +from src.preprocessing import process_demo_sentence +import os + def predict(model, loader, count_loss=True): model.eval() # Evaluation Mode, Ignore Dropout, BatchNorm, ... 
@@ -28,6 +30,9 @@ def predict(model, loader, count_loss=True): def predict_demo(text): + BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + model_path = os.path.join(BASE_DIR, "models", "best_epoch_16.pt") + id_tag = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'} @@ -35,7 +40,7 @@ def predict_demo(text): NUM_TAGS = 7 model = CRF_Tagger(input_dim=x.size(2), num_tags=NUM_TAGS) - model.load_state_dict(torch.load("models/best_epoch_16.pt")) + model.load_state_dict(torch.load(model_path)) model.eval() with torch.no_grad(): preds = model.decode(x) diff --git a/src/preprocessing.py b/src/preprocessing.py index 0fb86551b048dde3f26dae590cf1992addc07d8f..f452959c94dfdf494c03cc737617f946fb5e3831 100644 --- a/src/preprocessing.py +++ b/src/preprocessing.py @@ -3,7 +3,7 @@ import torch from transformers import AutoTokenizer, AutoModel from tqdm import tqdm from sklearn.model_selection import train_test_split -from configs import configs +from src.configs import configs from pyvi import ViTokenizer def join_tokens(tokens): diff --git a/src/train.py b/src/train.py index 4da3acb0e83e1cf34994e686c32612014a7698c0..23c93d6f33939b6122723a162be5db96f0067963 100644 --- a/src/train.py +++ b/src/train.py @@ -1,6 +1,6 @@ import wandb from tqdm import tqdm -from evaluate import evaluate +from src.evaluate import evaluate import torch def train_model(model, optimizer, configs, loaders):