GitHub Actions commited on
Commit
1364963
·
1 Parent(s): b746f44

Auto-deploy from GitHub (binary files removed)

Browse files
Files changed (33) hide show
  1. space/results/output.log +88 -0
  2. space/results/output.py +116 -0
  3. space/space/.gitignore +0 -2
  4. space/space/requirements.txt +0 -0
  5. space/space/space/space/space/space/.github/workflows/main.yml +47 -0
  6. space/space/space/space/space/space/.gitignore +23 -0
  7. space/space/space/space/space/space/LICENSE +201 -0
  8. space/space/space/space/space/space/configs/config.yaml +1 -0
  9. space/space/space/space/space/space/environment.yml +9 -0
  10. space/space/space/space/space/space/models/best_epoch_16.pt +3 -0
  11. space/space/space/space/space/space/notebooks/Duc_Notebook.ipynb +0 -0
  12. space/space/space/space/space/space/notebooks/Kien_RF_lightgbm.ipynb +741 -0
  13. space/space/space/space/space/space/notebooks/Kien_Rule_base.ipynb +0 -0
  14. space/space/space/space/space/space/notebooks/Softmax_PhoBERT.ipynb +0 -0
  15. space/space/space/space/space/space/requirements.txt +0 -0
  16. space/space/space/space/space/space/run.py +73 -0
  17. space/space/space/space/space/space/space/.gitattributes +35 -0
  18. space/space/space/space/space/space/space/README.md +87 -0
  19. space/space/space/space/space/space/src/__init__.py +4 -0
  20. space/space/space/space/space/space/src/app.py +64 -0
  21. space/space/space/space/space/space/src/configs.py +15 -0
  22. space/space/space/space/space/space/src/data_set.py +31 -0
  23. space/space/space/space/space/space/src/evaluate.py +21 -0
  24. space/space/space/space/space/space/src/front.py +32 -0
  25. space/space/space/space/space/space/src/model.py +16 -0
  26. space/space/space/space/space/space/src/predict.py +46 -0
  27. space/space/space/space/space/space/src/preprocessing.py +171 -0
  28. space/space/space/space/space/space/src/torchcrf/__init__.py +340 -0
  29. space/space/space/space/space/space/src/train.py +98 -0
  30. space/space/space/src/app.py +96 -14
  31. space/space/space/src/predict.py +1 -1
  32. space/space/space/st.py +98 -0
  33. src/predict.py +5 -1
space/results/output.log ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Train Epoch 1/20: 100%|██████████| 736/736 [00:22<00:00, 32.46it/s, avg_loss=2.69, batch_loss=0.947]
2
+ Epoch 1: train_loss=2.6912, train_f1=0.8224, val_loss=1.0848, val_f1=0.8273
3
+ Saved imporved model to ./models/best_epoch_1.pt
4
+ Train Epoch 2/20: 100%|██████████| 736/736 [00:21<00:00, 33.55it/s, avg_loss=0.806, batch_loss=0.998]
5
+
6
+ Epoch 2: train_loss=0.8061, train_f1=0.8674, val_loss=0.7191, val_f1=0.8613
7
+ Saved imporved model to ./models/best_epoch_2.pt
8
+ Train Epoch 3/20: 100%|██████████| 736/736 [00:22<00:00, 32.59it/s, avg_loss=0.584, batch_loss=0.0527]
9
+
10
+ Epoch 3: train_loss=0.5842, train_f1=0.8996, val_loss=0.5643, val_f1=0.8895
11
+ Saved imporved model to ./models/best_epoch_3.pt
12
+ Train Epoch 4/20: 100%|██████████| 736/736 [00:23<00:00, 31.34it/s, avg_loss=0.478, batch_loss=1.06]
13
+
14
+ Epoch 4: train_loss=0.4782, train_f1=0.9122, val_loss=0.4838, val_f1=0.8994
15
+ Saved imporved model to ./models/best_epoch_4.pt
16
+ Train Epoch 5/20: 100%|██████████| 736/736 [00:22<00:00, 32.59it/s, avg_loss=0.406, batch_loss=0.421]
17
+
18
+ Epoch 5: train_loss=0.4056, train_f1=0.9254, val_loss=0.4281, val_f1=0.9101
19
+ Saved imporved model to ./models/best_epoch_5.pt
20
+ Train Epoch 6/20: 100%|██████████| 736/736 [00:21<00:00, 34.15it/s, avg_loss=0.36, batch_loss=1.01]
21
+
22
+ Epoch 6: train_loss=0.3599, train_f1=0.9343, val_loss=0.3934, val_f1=0.9190
23
+ Saved imporved model to ./models/best_epoch_6.pt
24
+ Train Epoch 7/20: 100%|██████████| 736/736 [00:22<00:00, 33.08it/s, avg_loss=0.322, batch_loss=0.392]
25
+
26
+ Epoch 7: train_loss=0.3218, train_f1=0.9383, val_loss=0.3751, val_f1=0.9192
27
+ Saved imporved model to ./models/best_epoch_7.pt
28
+ Train Epoch 8/20: 100%|██████████| 736/736 [00:22<00:00, 32.66it/s, avg_loss=0.294, batch_loss=0.468]
29
+
30
+ Epoch 8: train_loss=0.2942, train_f1=0.9424, val_loss=0.3560, val_f1=0.9189
31
+ Train Epoch 9/20: 100%|██████████| 736/736 [00:23<00:00, 31.68it/s, avg_loss=0.27, batch_loss=0.681]
32
+
33
+ Epoch 9: train_loss=0.2699, train_f1=0.9429, val_loss=0.3521, val_f1=0.9177
34
+ Train Epoch 10/20: 100%|██████████| 736/736 [00:21<00:00, 33.46it/s, avg_loss=0.252, batch_loss=0.525]
35
+
36
+ Epoch 10: train_loss=0.2517, train_f1=0.9493, val_loss=0.3413, val_f1=0.9222
37
+ Saved imporved model to ./models/best_epoch_10.pt
38
+ Train Epoch 11/20: 100%|██████████| 736/736 [00:22<00:00, 32.92it/s, avg_loss=0.238, batch_loss=0.022]
39
+
40
+ Epoch 11: train_loss=0.2383, train_f1=0.9551, val_loss=0.3292, val_f1=0.9232
41
+ Saved imporved model to ./models/best_epoch_11.pt
42
+ Train Epoch 12/20: 100%|██████████| 736/736 [00:23<00:00, 31.72it/s, avg_loss=0.222, batch_loss=0.529]
43
+
44
+ Epoch 12: train_loss=0.2223, train_f1=0.9543, val_loss=0.3305, val_f1=0.9207
45
+ Train Epoch 13/20: 100%|██████████| 736/736 [00:23<00:00, 31.74it/s, avg_loss=0.213, batch_loss=0.381]
46
+
47
+ Epoch 13: train_loss=0.2127, train_f1=0.9593, val_loss=0.3244, val_f1=0.9221
48
+ Train Epoch 14/20: 100%|██████████| 736/736 [00:23<00:00, 31.69it/s, avg_loss=0.203, batch_loss=0.279]
49
+
50
+ Epoch 14: train_loss=0.2026, train_f1=0.9609, val_loss=0.3213, val_f1=0.9224
51
+ Train Epoch 15/20: 100%|██████████| 736/736 [00:23<00:00, 31.84it/s, avg_loss=0.193, batch_loss=0.0462]
52
+
53
+ Epoch 15: train_loss=0.1925, train_f1=0.9574, val_loss=0.3392, val_f1=0.9117
54
+ Train Epoch 16/20: 100%|██████████| 736/736 [00:22<00:00, 32.11it/s, avg_loss=0.186, batch_loss=0.943]
55
+
56
+ Epoch 16: train_loss=0.1863, train_f1=0.9654, val_loss=0.3169, val_f1=0.9250
57
+ Saved imporved model to ./models/best_epoch_16.pt
58
+ Train Epoch 17/20: 100%|██████████| 736/736 [00:22<00:00, 32.38it/s, avg_loss=0.18, batch_loss=0.113]
59
+
60
+ Epoch 17: train_loss=0.1795, train_f1=0.9677, val_loss=0.3187, val_f1=0.9237
61
+ Train Epoch 18/20: 100%|██████████| 736/736 [00:22<00:00, 33.30it/s, avg_loss=0.173, batch_loss=0.00558]
62
+
63
+ Epoch 18: train_loss=0.1728, train_f1=0.9692, val_loss=0.3219, val_f1=0.9173
64
+ Train Epoch 19/20: 100%|██████████| 736/736 [00:23<00:00, 31.48it/s, avg_loss=0.167, batch_loss=0.115]
65
+
66
+ Epoch 19: train_loss=0.1673, train_f1=0.9681, val_loss=0.3261, val_f1=0.9195
67
+ Train Epoch 20/20: 100%|██████████| 736/736 [00:22<00:00, 32.17it/s, avg_loss=0.164, batch_loss=0.0463]
68
+
69
+ Epoch 20: train_loss=0.1640, train_f1=0.9715, val_loss=0.3230, val_f1=0.9185
70
+
71
+ Loading best model from ./models/best_epoch_16.pt for final evaluation...
72
+ Done
73
+
74
+ Evaluation on test set ...
75
+ Test_loss=0.2967, Test_f1=0.9087
76
+ precision recall f1-score support
77
+
78
+ 0 1.00 1.00 1.00 51036
79
+ 1 0.99 0.98 0.99 1112
80
+ 2 0.97 0.99 0.98 506
81
+ 3 0.86 0.79 0.82 180
82
+ 4 0.84 0.80 0.82 291
83
+ 5 0.89 0.91 0.90 939
84
+ 6 0.87 0.84 0.86 428
85
+
86
+ accuracy 0.99 54492
87
+ macro avg 0.92 0.90 0.91 54492
88
+ weighted avg 0.99 0.99 0.99 54492
space/results/output.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model Results
2
+ training_log = {
3
+ "epoch": list(range(1, 21)),
4
+ "train_loss": [
5
+ 2.6912, 0.8061, 0.5842, 0.4782, 0.4056,
6
+ 0.3599, 0.3218, 0.2942, 0.2699, 0.2517,
7
+ 0.2383, 0.2223, 0.2127, 0.2026, 0.1925,
8
+ 0.1863, 0.1795, 0.1728, 0.1673, 0.1640
9
+ ],
10
+ "val_loss": [
11
+ 1.0848, 0.7191, 0.5643, 0.4838, 0.4281,
12
+ 0.3934, 0.3751, 0.3560, 0.3521, 0.3413,
13
+ 0.3292, 0.3305, 0.3244, 0.3213, 0.3392,
14
+ 0.3169, 0.3187, 0.3219, 0.3261, 0.3230
15
+ ],
16
+ "train_f1": [
17
+ 0.8224, 0.8674, 0.8996, 0.9122, 0.9254,
18
+ 0.9343, 0.9383, 0.9424, 0.9429, 0.9493,
19
+ 0.9551, 0.9543, 0.9593, 0.9609, 0.9574,
20
+ 0.9654, 0.9677, 0.9692, 0.9681, 0.9715
21
+ ],
22
+ "val_f1": [
23
+ 0.8273, 0.8613, 0.8895, 0.8994, 0.9101,
24
+ 0.9190, 0.9192, 0.9189, 0.9177, 0.9222,
25
+ 0.9232, 0.9207, 0.9221, 0.9224, 0.9117,
26
+ 0.9250, 0.9237, 0.9173, 0.9195, 0.9185
27
+ ]
28
+ }
29
+
30
+ report_dict = {
31
+ 'O': {"precision": 1.00, "recall": 1.00, "f1-score": 1.00, "support": 51036},
32
+ 'B-PER': {"precision": 0.99, "recall": 0.98, "f1-score": 0.98, "support": 1112},
33
+ 'I-PER': {"precision": 0.97, "recall": 0.99, "f1-score": 0.98, "support": 506},
34
+ 'B-ORG': {"precision": 0.93, "recall": 0.95, "f1-score": 0.94, "support": 939},
35
+ 'I-ORG': {"precision": 0.93, "recall": 0.91, "f1-score": 0.92, "support": 428},
36
+ 'B-LOC': {"precision": 0.83, "recall": 0.84, "f1-score": 0.84, "support": 180},
37
+ 'I-LOC': {"precision": 0.88, "recall": 0.84, "f1-score": 0.86, "support": 291},
38
+ "accuracy": 0.99,
39
+ "macro avg": {"precision": 0.93, "recall": 0.93, "f1-score": 0.93, "support": 54492},
40
+ "weighted avg": {"precision": 0.99, "recall": 0.99, "f1-score": 0.99, "support": 54492}
41
+ }
42
+
43
+
44
+ report_dict_2 = {
45
+ 'O': {"precision": 1.00, "recall": 1.00, "f1-score": 1.00, "support": 68476},
46
+ 'B-PER': {"precision": 0.99, "recall": 0.98, "f1-score": 0.98, "support": 1464},
47
+ 'I-PER': {"precision": 0.98, "recall": 0.98, "f1-score": 0.98, "support": 686},
48
+ 'B-ORG': {"precision": 0.77, "recall": 0.82, "f1-score": 0.80, "support": 257},
49
+ 'I-ORG': {"precision": 0.80, "recall": 0.77, "f1-score": 0.78, "support": 430},
50
+ 'B-LOC': {"precision": 0.88, "recall": 0.90, "f1-score": 0.89, "support": 1241},
51
+ 'I-LOC': {"precision": 0.83, "recall": 0.82, "f1-score": 0.82, "support": 554},
52
+ "accuracy": 0.99,
53
+ "macro avg": {"precision": 0.89, "recall": 0.89, "f1-score": 0.89, "support": 73108},
54
+ "weighted avg": {"precision": 0.99, "recall": 0.99, "f1-score": 0.99, "support": 73108}
55
+ }
56
+
57
+
58
+ model_compare = {
59
+ "Header": ["Model", "F1", "Accuracy"],
60
+ "Data": {
61
+ "PhoBERT + CRF": {"F1": 0.93, "Accuracy": 0.99},
62
+ "CRF": {"F1": 0.91, "Accuracy": 0.99},
63
+ "Softmax": {"F1": 0.89, "Accuracy": 0.99},
64
+ "Random Forest": {"F1": 0.78, "Accuracy": 0.98}
65
+ }
66
+ }
67
+
68
+ data_compare = {
69
+ "Header": ["Data Preprocessing Strategy", "F1"],
70
+ "Data": {
71
+ "Raw": 0.93,
72
+ "Crawl for Balance": 0.91,
73
+ "Remove Sentences with Only 'O' Tags": 0.91
74
+ }
75
+ }
76
+
77
+
78
+
79
+ # EDA
80
+ data_aug_count_sorted = {
81
+ 'B-PER': 474,
82
+ 'I-PER': 121,
83
+ 'B-LOC': 874,
84
+ 'I-LOC': 289,
85
+ 'B-ORG': 1110,
86
+ 'I-ORG': 761
87
+ }
88
+
89
+ raw_data_count_sorted = {
90
+ 'B-PER': 7479,
91
+ 'I-PER': 3522,
92
+ 'B-LOC': 6244,
93
+ 'I-LOC': 2783,
94
+ 'B-ORG': 1212,
95
+ 'I-ORG': 2055,
96
+ 'B-NAT': 282,
97
+ 'I-NAT': 279
98
+ }
99
+
100
+ raw_data_count_withoutNAT_sorted = {
101
+ 'B-PER': 7479,
102
+ 'I-PER': 3522,
103
+ 'B-LOC': 6244,
104
+ 'I-LOC': 2783,
105
+ 'B-ORG': 1212,
106
+ 'I-ORG': 2055
107
+ }
108
+
109
+ combined_count_sorted = {
110
+ 'B-PER': 7953,
111
+ 'I-PER': 3643,
112
+ 'B-LOC': 7118,
113
+ 'I-LOC': 3072,
114
+ 'B-ORG': 2322,
115
+ 'I-ORG': 2816
116
+ }
space/space/.gitignore CHANGED
@@ -10,8 +10,6 @@ __pycache__/
10
 
11
  # Dataset and results folders
12
  data/
13
- results/
14
- outputs/
15
  logs/
16
 
17
  # Large files
 
10
 
11
  # Dataset and results folders
12
  data/
 
 
13
  logs/
14
 
15
  # Large files
space/space/requirements.txt CHANGED
Binary files a/space/space/requirements.txt and b/space/space/requirements.txt differ
 
space/space/space/space/space/space/.github/workflows/main.yml ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Deploy to Hugging Face Space
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main # hoặc branch bạn dùng
7
+
8
+ jobs:
9
+ deploy:
10
+ runs-on: ubuntu-latest
11
+
12
+ steps:
13
+ - name: Checkout repo
14
+ uses: actions/checkout@v3
15
+
16
+ - name: Set up Git
17
+ run: |
18
+ git config --global user.email "[email protected]"
19
+ git config --global user.name "GitHub Actions"
20
+
21
+ - name: Push to Hugging Face Spaces
22
+ env:
23
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
24
+ run: |
25
+ git clone https://huggingface.co/spaces/DucLai/Vietnamese_NER space
26
+
27
+ # Đồng bộ code vào repo Space (không copy .git)
28
+ rsync -av --exclude '.git' ./ space/
29
+
30
+ # Xoá file binary ra khỏi Git index trước khi commit
31
+ cd space
32
+ find . -type f \( \
33
+ -iname "*.png" -o \
34
+ -iname "*.jpg" -o \
35
+ -iname "*.jpeg" -o \
36
+ -iname "*.mp4" -o \
37
+ -iname "*.zip" -o \
38
+ -iname "*.pth" -o \
39
+ -iname "*.h5" -o \
40
+ -iname "*.tar.gz" -o \
41
+ -iname "*.wav" \
42
+ \) -exec git rm --cached {} \; || true
43
+
44
+ # Commit và push
45
+ git add .
46
+ git commit -m "Auto-deploy from GitHub (binary files removed)" || echo "No changes to commit"
47
+ git push https://DucLai:${HF_TOKEN}@huggingface.co/spaces/DucLai/Vietnamese_NER HEAD
space/space/space/space/space/space/.gitignore ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python cache
2
+ __pycache__/
3
+ *.pyc
4
+ *.pyo
5
+
6
+ # Hugging Face binary/model outputs
7
+ *.pth
8
+ *.h5
9
+ *.ckpt
10
+
11
+ # Dataset and results folders
12
+ data/
13
+ results/
14
+ outputs/
15
+ logs/
16
+
17
+ # Large files
18
+ *.zip
19
+ *.tar.gz
20
+ *.mp4
21
+ *.png
22
+ *.jpg
23
+ *.jpeg
space/space/space/space/space/space/LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
space/space/space/space/space/space/configs/config.yaml ADDED
@@ -0,0 +1 @@
 
 
1
+ ECHO is on.
space/space/space/space/space/space/environment.yml ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ name: vnner
2
+ channels:
3
+ - defaults
4
+ - conda-forge
5
+ dependencies:
6
+ - python=3.10
7
+ - pip
8
+ - pip:
9
+ - -r requirements.txt
space/space/space/space/space/space/models/best_epoch_16.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:622cac3a55eec6a245f70c2ec7591d8fbfa8c18e13db7555915405fb57b145a0
3
+ size 24130
space/space/space/space/space/space/notebooks/Duc_Notebook.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
space/space/space/space/space/space/notebooks/Kien_RF_lightgbm.ipynb ADDED
@@ -0,0 +1,741 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "id": "10ec017cb658e125",
6
+ "metadata": {
7
+ "ExecuteTime": {
8
+ "end_time": "2025-06-11T00:21:33.244538Z",
9
+ "start_time": "2025-06-11T00:21:05.317283Z"
10
+ }
11
+ },
12
+ "source": [
13
+ "import pandas as pd\n",
14
+ "\n",
15
+ "splits = {'train': 'data/train-00000-of-00001-b0417886a268b83a.parquet', 'valid': 'data/valid-00000-of-00001-846411c236133ba3.parquet'}\n",
16
+ "df_train = pd.read_parquet(\"hf://datasets/datnth1709/VLSP2016-NER-data/\" + splits[\"train\"])\n",
17
+ "df_valid = pd.read_parquet(\"hf://datasets/datnth1709/VLSP2016-NER-data/\" + splits[\"valid\"])\n",
18
+ "df = pd.concat([df_train, df_valid]).reset_index(drop=True)"
19
+ ],
20
+ "outputs": [],
21
+ "execution_count": 1
22
+ },
23
+ {
24
+ "cell_type": "code",
25
+ "id": "c533c55a2ad7b16e",
26
+ "metadata": {
27
+ "ExecuteTime": {
28
+ "end_time": "2025-06-11T00:21:33.499341Z",
29
+ "start_time": "2025-06-11T00:21:33.262933Z"
30
+ }
31
+ },
32
+ "source": [
33
+ "# Tạo thêm các cột khác\n",
34
+ "def join_tokens(tokens):\n",
35
+ " text = ' '.join(tokens)\n",
36
+ " return text\n",
37
+ "\n",
38
+ "def reform_raw_text(tokens):\n",
39
+ " text = ' '.join(tokens)\n",
40
+ " return text.replace(\"_\", \" \")\n",
41
+ "\n",
42
+ "def label(x):\n",
43
+ " return [id_tag[int(i)] for i in x]\n",
44
+ "\n",
45
+ "def replace_7_8(lst):\n",
46
+ " return [0 if x in (7, 8) else x for x in lst]\n",
47
+ "\n",
48
+ "\n",
49
+ "tag_id = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6}\n",
50
+ "id_tag = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}\n",
51
+ "\n",
52
+ "\n",
53
+ "df['ner_tags'] = df['ner_tags'].apply(replace_7_8)\n",
54
+ "df['text_withseg'] = df['tokens'].apply(join_tokens)\n",
55
+ "df['text_raw'] = df['tokens'].apply(reform_raw_text)\n",
56
+ "df[\"ner_labels\"] = df.ner_tags.apply(label)\n",
57
+ "df.columns = ['tokens', 'id', 'seg_text', 'raw_text', 'labels']\n",
58
+ "df\n"
59
+ ],
60
+ "outputs": [
61
+ {
62
+ "data": {
63
+ "text/plain": [
64
+ " tokens \\\n",
65
+ "0 [Không_khí, thật, náo_nhiệt, .] \n",
66
+ "1 [Chị, Lãnh, và, Xăng, ra, đi, ,, mình, đứng, n... \n",
67
+ "2 [Suy_tính, mãi, ,, khóc, mãi, rồi, Phúc, lấy, ... \n",
68
+ "3 [Hoà, bảo, hồi, mới, qua, đâu, có, biết, nấu_n... \n",
69
+ "4 [Nhật_ký, của, thuyền_viên, .] \n",
70
+ "... ... \n",
71
+ "16853 [Nghe, thấy, đã, ghê_ghê, nhưng, Nhiêu, chưa, ... \n",
72
+ "16854 [Nhưng, mọi, chuyện, không, dừng, ở, đó, .] \n",
73
+ "16855 [Hoà, bảo, thời_gian, đầu, mặc_cảm, lắm, ,, ở,... \n",
74
+ "16856 [Biết_bao, người, đã, tình_nguyện, hiến_dâng, ... \n",
75
+ "16857 [Trên, đây, mới, là, “, thành_tích, ”, tiêu, t... \n",
76
+ "\n",
77
+ " id \\\n",
78
+ "0 [0, 0, 0, 0] \n",
79
+ "1 [0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n",
80
+ "2 [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ... \n",
81
+ "3 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, ... \n",
82
+ "4 [0, 0, 0, 0] \n",
83
+ "... ... \n",
84
+ "16853 [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, ... \n",
85
+ "16854 [0, 0, 0, 0, 0, 0, 0, 0] \n",
86
+ "16855 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n",
87
+ "16856 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] \n",
88
+ "16857 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n",
89
+ "\n",
90
+ " seg_text \\\n",
91
+ "0 Không_khí thật náo_nhiệt . \n",
92
+ "1 Chị Lãnh và Xăng ra đi , mình đứng nhìn hai ch... \n",
93
+ "2 Suy_tính mãi , khóc mãi rồi Phúc lấy ra tờ giấ... \n",
94
+ "3 Hoà bảo hồi mới qua đâu có biết nấu_nướng gì ,... \n",
95
+ "4 Nhật_ký của thuyền_viên . \n",
96
+ "... ... \n",
97
+ "16853 Nghe thấy đã ghê_ghê nhưng Nhiêu chưa được tườ... \n",
98
+ "16854 Nhưng mọi chuyện không dừng ở đó . \n",
99
+ "16855 Hoà bảo thời_gian đầu mặc_cảm lắm , ở trong nh... \n",
100
+ "16856 Biết_bao người đã tình_nguyện hiến_dâng cả cuộ... \n",
101
+ "16857 Trên đây mới là “ thành_tích ” tiêu tiền của m... \n",
102
+ "\n",
103
+ " raw_text \\\n",
104
+ "0 Không khí thật náo nhiệt . \n",
105
+ "1 Chị Lãnh và Xăng ra đi , mình đứng nhìn hai ch... \n",
106
+ "2 Suy tính mãi , khóc mãi rồi Phúc lấy ra tờ giấ... \n",
107
+ "3 Hoà bảo hồi mới qua đâu có biết nấu nướng gì ,... \n",
108
+ "4 Nhật ký của thuyền viên . \n",
109
+ "... ... \n",
110
+ "16853 Nghe thấy đã ghê ghê nhưng Nhiêu chưa được tườ... \n",
111
+ "16854 Nhưng mọi chuyện không dừng ở đó . \n",
112
+ "16855 Hoà bảo thời gian đầu mặc cảm lắm , ở trong nh... \n",
113
+ "16856 Biết bao người đã tình nguyện hiến dâng cả cuộ... \n",
114
+ "16857 Trên đây mới là “ thành tích ” tiêu tiền của m... \n",
115
+ "\n",
116
+ " labels \n",
117
+ "0 [O, O, O, O] \n",
118
+ "1 [O, B-PER, O, B-PER, O, O, O, O, O, O, O, O, O... \n",
119
+ "2 [O, O, O, O, O, O, B-PER, O, O, O, O, O, O, O,... \n",
120
+ "3 [B-PER, O, O, O, O, O, O, O, O, O, O, O, O, B-... \n",
121
+ "4 [O, O, O, O] \n",
122
+ "... ... \n",
123
+ "16853 [O, O, O, O, O, B-PER, O, O, O, O, O, O, O, O,... \n",
124
+ "16854 [O, O, O, O, O, O, O, O] \n",
125
+ "16855 [B-PER, O, O, O, O, O, O, O, O, O, O, O, O, O,... \n",
126
+ "16856 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O] \n",
127
+ "16857 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ... \n",
128
+ "\n",
129
+ "[16858 rows x 5 columns]"
130
+ ],
131
+ "text/html": [
132
+ "<div>\n",
133
+ "<style scoped>\n",
134
+ " .dataframe tbody tr th:only-of-type {\n",
135
+ " vertical-align: middle;\n",
136
+ " }\n",
137
+ "\n",
138
+ " .dataframe tbody tr th {\n",
139
+ " vertical-align: top;\n",
140
+ " }\n",
141
+ "\n",
142
+ " .dataframe thead th {\n",
143
+ " text-align: right;\n",
144
+ " }\n",
145
+ "</style>\n",
146
+ "<table border=\"1\" class=\"dataframe\">\n",
147
+ " <thead>\n",
148
+ " <tr style=\"text-align: right;\">\n",
149
+ " <th></th>\n",
150
+ " <th>tokens</th>\n",
151
+ " <th>id</th>\n",
152
+ " <th>seg_text</th>\n",
153
+ " <th>raw_text</th>\n",
154
+ " <th>labels</th>\n",
155
+ " </tr>\n",
156
+ " </thead>\n",
157
+ " <tbody>\n",
158
+ " <tr>\n",
159
+ " <th>0</th>\n",
160
+ " <td>[Không_khí, thật, náo_nhiệt, .]</td>\n",
161
+ " <td>[0, 0, 0, 0]</td>\n",
162
+ " <td>Không_khí thật náo_nhiệt .</td>\n",
163
+ " <td>Không khí thật náo nhiệt .</td>\n",
164
+ " <td>[O, O, O, O]</td>\n",
165
+ " </tr>\n",
166
+ " <tr>\n",
167
+ " <th>1</th>\n",
168
+ " <td>[Chị, Lãnh, và, Xăng, ra, đi, ,, mình, đứng, n...</td>\n",
169
+ " <td>[0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n",
170
+ " <td>Chị Lãnh và Xăng ra đi , mình đứng nhìn hai ch...</td>\n",
171
+ " <td>Chị Lãnh và Xăng ra đi , mình đứng nhìn hai ch...</td>\n",
172
+ " <td>[O, B-PER, O, B-PER, O, O, O, O, O, O, O, O, O...</td>\n",
173
+ " </tr>\n",
174
+ " <tr>\n",
175
+ " <th>2</th>\n",
176
+ " <td>[Suy_tính, mãi, ,, khóc, mãi, rồi, Phúc, lấy, ...</td>\n",
177
+ " <td>[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n",
178
+ " <td>Suy_tính mãi , khóc mãi rồi Phúc lấy ra tờ giấ...</td>\n",
179
+ " <td>Suy tính mãi , khóc mãi rồi Phúc lấy ra tờ giấ...</td>\n",
180
+ " <td>[O, O, O, O, O, O, B-PER, O, O, O, O, O, O, O,...</td>\n",
181
+ " </tr>\n",
182
+ " <tr>\n",
183
+ " <th>3</th>\n",
184
+ " <td>[Hoà, bảo, hồi, mới, qua, đâu, có, biết, nấu_n...</td>\n",
185
+ " <td>[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, ...</td>\n",
186
+ " <td>Hoà bảo hồi mới qua đâu có biết nấu_nướng gì ,...</td>\n",
187
+ " <td>Hoà bảo hồi mới qua đâu có biết nấu nướng gì ,...</td>\n",
188
+ " <td>[B-PER, O, O, O, O, O, O, O, O, O, O, O, O, B-...</td>\n",
189
+ " </tr>\n",
190
+ " <tr>\n",
191
+ " <th>4</th>\n",
192
+ " <td>[Nhật_ký, của, thuyền_viên, .]</td>\n",
193
+ " <td>[0, 0, 0, 0]</td>\n",
194
+ " <td>Nhật_ký của thuyền_viên .</td>\n",
195
+ " <td>Nhật ký của thuyền viên .</td>\n",
196
+ " <td>[O, O, O, O]</td>\n",
197
+ " </tr>\n",
198
+ " <tr>\n",
199
+ " <th>...</th>\n",
200
+ " <td>...</td>\n",
201
+ " <td>...</td>\n",
202
+ " <td>...</td>\n",
203
+ " <td>...</td>\n",
204
+ " <td>...</td>\n",
205
+ " </tr>\n",
206
+ " <tr>\n",
207
+ " <th>16853</th>\n",
208
+ " <td>[Nghe, thấy, đã, ghê_ghê, nhưng, Nhiêu, chưa, ...</td>\n",
209
+ " <td>[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...</td>\n",
210
+ " <td>Nghe thấy đã ghê_ghê nhưng Nhiêu chưa được tườ...</td>\n",
211
+ " <td>Nghe thấy đã ghê ghê nhưng Nhiêu chưa được tườ...</td>\n",
212
+ " <td>[O, O, O, O, O, B-PER, O, O, O, O, O, O, O, O,...</td>\n",
213
+ " </tr>\n",
214
+ " <tr>\n",
215
+ " <th>16854</th>\n",
216
+ " <td>[Nhưng, mọi, chuyện, không, dừng, ở, đó, .]</td>\n",
217
+ " <td>[0, 0, 0, 0, 0, 0, 0, 0]</td>\n",
218
+ " <td>Nhưng mọi chuyện không dừng ở đó .</td>\n",
219
+ " <td>Nhưng mọi chuyện không dừng ở đó .</td>\n",
220
+ " <td>[O, O, O, O, O, O, O, O]</td>\n",
221
+ " </tr>\n",
222
+ " <tr>\n",
223
+ " <th>16855</th>\n",
224
+ " <td>[Hoà, bảo, thời_gian, đầu, mặc_cảm, lắm, ,, ở,...</td>\n",
225
+ " <td>[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n",
226
+ " <td>Hoà bảo thời_gian đầu mặc_cảm lắm , ở trong nh...</td>\n",
227
+ " <td>Hoà bảo thời gian đầu mặc cảm lắm , ở trong nh...</td>\n",
228
+ " <td>[B-PER, O, O, O, O, O, O, O, O, O, O, O, O, O,...</td>\n",
229
+ " </tr>\n",
230
+ " <tr>\n",
231
+ " <th>16856</th>\n",
232
+ " <td>[Biết_bao, người, đã, tình_nguyện, hiến_dâng, ...</td>\n",
233
+ " <td>[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]</td>\n",
234
+ " <td>Biết_bao người đã tình_nguyện hiến_dâng cả cuộ...</td>\n",
235
+ " <td>Biết bao người đã tình nguyện hiến dâng cả cuộ...</td>\n",
236
+ " <td>[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]</td>\n",
237
+ " </tr>\n",
238
+ " <tr>\n",
239
+ " <th>16857</th>\n",
240
+ " <td>[Trên, đây, mới, là, “, thành_tích, ”, tiêu, t...</td>\n",
241
+ " <td>[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n",
242
+ " <td>Trên đây mới là “ thành_tích ” tiêu tiền của m...</td>\n",
243
+ " <td>Trên đây mới là “ thành tích ” tiêu tiền của m...</td>\n",
244
+ " <td>[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...</td>\n",
245
+ " </tr>\n",
246
+ " </tbody>\n",
247
+ "</table>\n",
248
+ "<p>16858 rows × 5 columns</p>\n",
249
+ "</div>"
250
+ ]
251
+ },
252
+ "execution_count": 2,
253
+ "metadata": {},
254
+ "output_type": "execute_result"
255
+ }
256
+ ],
257
+ "execution_count": 2
258
+ },
259
+ {
260
+ "cell_type": "code",
261
+ "id": "14d9b9fae58b7173",
262
+ "metadata": {
263
+ "ExecuteTime": {
264
+ "end_time": "2025-06-11T00:21:59.373985Z",
265
+ "start_time": "2025-06-11T00:21:34.524025Z"
266
+ }
267
+ },
268
+ "source": [
269
+ "import torch\n",
270
+ "from transformers import AutoTokenizer, AutoModel\n",
271
+ "from tqdm import tqdm\n",
272
+ "\n",
273
+ "# Load PhoBERT tokenizer và model\n",
274
+ "tokenizer = AutoTokenizer.from_pretrained(\"vinai/phobert-base\", use_fast=False)\n",
275
+ "model = AutoModel.from_pretrained(\"vinai/phobert-base\")\n",
276
+ "model.eval()"
277
+ ],
278
+ "outputs": [
279
+ {
280
+ "name": "stdout",
281
+ "output_type": "stream",
282
+ "text": [
283
+ "cuda\n"
284
+ ]
285
+ },
286
+ {
287
+ "data": {
288
+ "text/plain": [
289
+ "RobertaModel(\n",
290
+ " (embeddings): RobertaEmbeddings(\n",
291
+ " (word_embeddings): Embedding(64001, 768, padding_idx=1)\n",
292
+ " (position_embeddings): Embedding(258, 768, padding_idx=1)\n",
293
+ " (token_type_embeddings): Embedding(1, 768)\n",
294
+ " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
295
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
296
+ " )\n",
297
+ " (encoder): RobertaEncoder(\n",
298
+ " (layer): ModuleList(\n",
299
+ " (0-11): 12 x RobertaLayer(\n",
300
+ " (attention): RobertaAttention(\n",
301
+ " (self): RobertaSdpaSelfAttention(\n",
302
+ " (query): Linear(in_features=768, out_features=768, bias=True)\n",
303
+ " (key): Linear(in_features=768, out_features=768, bias=True)\n",
304
+ " (value): Linear(in_features=768, out_features=768, bias=True)\n",
305
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
306
+ " )\n",
307
+ " (output): RobertaSelfOutput(\n",
308
+ " (dense): Linear(in_features=768, out_features=768, bias=True)\n",
309
+ " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
310
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
311
+ " )\n",
312
+ " )\n",
313
+ " (intermediate): RobertaIntermediate(\n",
314
+ " (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
315
+ " (intermediate_act_fn): GELUActivation()\n",
316
+ " )\n",
317
+ " (output): RobertaOutput(\n",
318
+ " (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
319
+ " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
320
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
321
+ " )\n",
322
+ " )\n",
323
+ " )\n",
324
+ " )\n",
325
+ " (pooler): RobertaPooler(\n",
326
+ " (dense): Linear(in_features=768, out_features=768, bias=True)\n",
327
+ " (activation): Tanh()\n",
328
+ " )\n",
329
+ ")"
330
+ ]
331
+ },
332
+ "execution_count": 3,
333
+ "metadata": {},
334
+ "output_type": "execute_result"
335
+ }
336
+ ],
337
+ "execution_count": 3
338
+ },
339
+ {
340
+ "cell_type": "code",
341
+ "id": "a47ec382649c3036",
342
+ "metadata": {
343
+ "ExecuteTime": {
344
+ "end_time": "2025-06-11T00:23:23.888583Z",
345
+ "start_time": "2025-06-11T00:23:23.885204Z"
346
+ }
347
+ },
348
+ "source": [
349
+ "# Hàm gộp các embedding vectors của token bị tách ra khi qua SentencePiece\n",
350
+ "def group_embeddings(tokens, embeddings):\n",
351
+ " word_embeddings = []\n",
352
+ " current_vecs = []\n",
353
+ "\n",
354
+ " for token, emb in zip(tokens, embeddings):\n",
355
+ " if token in [\"<s>\", \"</s>\"]:\n",
356
+ " continue\n",
357
+ "\n",
358
+ " if token.endswith(\"@@\"):\n",
359
+ " current_vecs.append(emb)\n",
360
+ " else:\n",
361
+ " current_vecs.append(emb)\n",
362
+ " word_emb = torch.mean(torch.stack(current_vecs), dim=0)\n",
363
+ " word_embeddings.append(word_emb)\n",
364
+ " current_vecs = []\n",
365
+ "\n",
366
+ " if current_vecs: # Trong trường hợp sót lại cuối câu\n",
367
+ " word_emb = torch.mean(torch.stack(current_vecs), dim=0)\n",
368
+ " word_embeddings.append(word_emb)\n",
369
+ "\n",
370
+ " return word_embeddings"
371
+ ],
372
+ "outputs": [],
373
+ "execution_count": 4
374
+ },
375
+ {
376
+ "cell_type": "code",
377
+ "id": "f8c0ad89ae81b0c",
378
+ "metadata": {
379
+ "ExecuteTime": {
380
+ "end_time": "2025-06-11T00:25:52.567135Z",
381
+ "start_time": "2025-06-11T00:23:56.155322Z"
382
+ }
383
+ },
384
+ "source": [
385
+ "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
386
+ "model.to(device)\n",
387
+ "\n",
388
+ "all_embeddings = [] # list of [seq_len_i, 768] tensors\n",
389
+ "all_labels = [] # list of [seq_len_i,] tensors\n",
390
+ "len_em = []\n",
391
+ "\n",
392
+ "# count = 0\n",
393
+ "\n",
394
+ "for i, row in df.iterrows():\n",
395
+ "\n",
396
+ " # count += 1\n",
397
+ " # if count == 500:\n",
398
+ " # break\n",
399
+ "\n",
400
+ " # Truy cập phần tử từng dòng\n",
401
+ " sentence = row['seg_text']\n",
402
+ " gold_labels = row[\"id\"]\n",
403
+ "\n",
404
+ " # Cho sentence đi qua SentencePiece\n",
405
+ " input_ids = tokenizer.encode(sentence, return_tensors=\"pt\").to(device)\n",
406
+ "\n",
407
+ " tokens = tokenizer.convert_ids_to_tokens(input_ids[0].to(device))\n",
408
+ "\n",
409
+ " # Encode tạo embeddings\n",
410
+ " with torch.no_grad():\n",
411
+ " outputs = model(input_ids)\n",
412
+ " last_hidden_state = outputs.last_hidden_state.squeeze(0)\n",
413
+ "\n",
414
+ " # Gộp các embeddings đã bị tách khi đi qua SentencePiece\n",
415
+ " word_embeds = group_embeddings(tokens, last_hidden_state)\n",
416
+ "\n",
417
+ " # Kiểm tra số lượng embeddings và số lượng labels\n",
418
+ " if len(word_embeds) != len(gold_labels):\n",
419
+ " continue\n",
420
+ "\n",
421
+ " # Thêm vào list tổng / Tới đây là data đã sẵn sàng cho training\n",
422
+ " all_embeddings.append(torch.stack(word_embeds))\n",
423
+ " all_labels.append(torch.tensor(gold_labels))"
424
+ ],
425
+ "outputs": [],
426
+ "execution_count": 6
427
+ },
428
+ {
429
+ "metadata": {
430
+ "ExecuteTime": {
431
+ "end_time": "2025-06-11T00:35:23.255306Z",
432
+ "start_time": "2025-06-11T00:35:23.252026Z"
433
+ }
434
+ },
435
+ "cell_type": "code",
436
+ "source": "# We skip 43 data since they aren't convertable",
437
+ "id": "c3e406ad994802be",
438
+ "outputs": [
439
+ {
440
+ "name": "stdout",
441
+ "output_type": "stream",
442
+ "text": [
443
+ "-43\n"
444
+ ]
445
+ }
446
+ ],
447
+ "execution_count": 15
448
+ },
449
+ {
450
+ "cell_type": "code",
451
+ "id": "cadc3a861025b3b9",
452
+ "metadata": {
453
+ "ExecuteTime": {
454
+ "end_time": "2025-06-11T00:36:18.857012Z",
455
+ "start_time": "2025-06-11T00:36:08.257408Z"
456
+ }
457
+ },
458
+ "source": [
459
+ "import numpy as np\n",
460
+ "from sklearn.model_selection import train_test_split\n",
461
+ "\n",
462
+ "X_flat = []\n",
463
+ "y_flat = []\n",
464
+ "\n",
465
+ "for emb_seq, label_seq in zip(all_embeddings, all_labels):\n",
466
+ " for emb, label in zip(emb_seq, label_seq):\n",
467
+ " X_flat.append(emb.cpu().numpy()) # emb: [768]\n",
468
+ " y_flat.append(label.item()) # label: int\n",
469
+ "\n",
470
+ "X_flat = np.array(X_flat) # [N, 768]\n",
471
+ "y_flat = np.array(y_flat) # [N]\n"
472
+ ],
473
+ "outputs": [],
474
+ "execution_count": 16
475
+ },
476
+ {
477
+ "cell_type": "code",
478
+ "id": "52a0fe72a50d4f73",
479
+ "metadata": {
480
+ "ExecuteTime": {
481
+ "end_time": "2025-06-11T00:39:58.211159Z",
482
+ "start_time": "2025-06-11T00:39:58.208074Z"
483
+ }
484
+ },
485
+ "source": [
486
+ "print(X_flat[0].shape)\n",
487
+ "print(y_flat.shape)"
488
+ ],
489
+ "outputs": [
490
+ {
491
+ "name": "stdout",
492
+ "output_type": "stream",
493
+ "text": [
494
+ "(768,)\n",
495
+ "(368172,)\n"
496
+ ]
497
+ }
498
+ ],
499
+ "execution_count": 19
500
+ },
501
+ {
502
+ "cell_type": "code",
503
+ "id": "d6275df555f0c4c3",
504
+ "metadata": {
505
+ "ExecuteTime": {
506
+ "end_time": "2025-06-11T00:42:00.129778Z",
507
+ "start_time": "2025-06-11T00:42:00.096986Z"
508
+ }
509
+ },
510
+ "source": [
511
+ "# Kiểm tra độ lệch data\n",
512
+ "unique_values, counts = np.unique(y_flat, return_counts=True)\n",
513
+ "\n",
514
+ "# In ra từng giá trị và số lần xuất hiện\n",
515
+ "for val, count in zip(unique_values, counts):\n",
516
+ " print(f\"Label {val}: {count} times\")\n"
517
+ ],
518
+ "outputs": [
519
+ {
520
+ "name": "stdout",
521
+ "output_type": "stream",
522
+ "text": [
523
+ "Label 0: 344986 times\n",
524
+ "Label 1: 7450 times\n",
525
+ "Label 2: 3504 times\n",
526
+ "Label 3: 1204 times\n",
527
+ "Label 4: 2050 times\n",
528
+ "Label 5: 6211 times\n",
529
+ "Label 6: 2767 times\n"
530
+ ]
531
+ }
532
+ ],
533
+ "execution_count": 24
534
+ },
535
+ {
536
+ "cell_type": "code",
537
+ "id": "664020977ba9a1e2",
538
+ "metadata": {
539
+ "ExecuteTime": {
540
+ "end_time": "2025-06-11T00:42:03.350616Z",
541
+ "start_time": "2025-06-11T00:42:02.915680Z"
542
+ }
543
+ },
544
+ "source": [
545
+ "X_train, X_test, y_train, y_test = train_test_split(\n",
546
+ " X_flat, y_flat, test_size=0.2, random_state=42, stratify=y_flat)\n"
547
+ ],
548
+ "outputs": [],
549
+ "execution_count": 25
550
+ },
551
+ {
552
+ "cell_type": "code",
553
+ "id": "d4acda9c7cae3214",
554
+ "metadata": {
555
+ "ExecuteTime": {
556
+ "end_time": "2025-06-11T00:42:25.235471Z",
557
+ "start_time": "2025-06-11T00:42:16.769480Z"
558
+ }
559
+ },
560
+ "source": [
561
+ "import lightgbm as lgb\n",
562
+ "from sklearn.metrics import accuracy_score, f1_score, classification_report\n",
563
+ "\n",
564
+ "\n",
565
+ "# Tạo Dataset cho LightGBM\n",
566
+ "train_data = lgb.Dataset(X_train, label=y_train)\n",
567
+ "test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)\n",
568
+ "\n",
569
+ "# Cấu hình tham số LightGBM (Random Forest mode)\n",
570
+ "params = {\n",
571
+ " \"objective\": \"multiclass\", # nếu multiclass classification\n",
572
+ " \"num_class\": len(np.unique(y_train)),\n",
573
+ " \"metric\": \"multi_logloss\",\n",
574
+ " \"boosting_type\": \"rf\", # random forest mode trong LightGBM\n",
575
+ " \"num_leaves\": 31,\n",
576
+ " \"bagging_freq\": 1,\n",
577
+ " \"bagging_fraction\": 0.8,\n",
578
+ " \"feature_fraction\": 0.8,\n",
579
+ " \"bagging_seed\": 42,\n",
580
+ " \"verbose\": -1,\n",
581
+ " \"seed\": 42,\n",
582
+ " \"is_unbalance\": True\n",
583
+ "}\n",
584
+ "\n",
585
+ "\n",
586
+ "\n",
587
+ "# Train model, tích hợp wandb callback để log metrics\n",
588
+ "model = lgb.train(\n",
589
+ " params,\n",
590
+ " train_data,\n",
591
+ " num_boost_round=2,\n",
592
+ " valid_sets=[train_data, test_data],\n",
593
+ " valid_names=[\"train\", \"test\"]\n",
594
+ ")\n",
595
+ "\n",
596
+ "# Dự đoán trên test set\n",
597
+ "y_pred_prob = model.predict(X_test)\n",
598
+ "y_pred = np.argmax(y_pred_prob, axis=1)\n",
599
+ "\n",
600
+ "# Ánh xạ số về nhãn tên entity\n",
601
+ "label_map = {\n",
602
+ " 0: 'O',\n",
603
+ " 1: 'B-PER',\n",
604
+ " 2: 'I-PER',\n",
605
+ " 3: 'B-ORG',\n",
606
+ " 4: 'I-ORG',\n",
607
+ " 5: 'B-LOC',\n",
608
+ " 6: 'I-LOC'\n",
609
+ "}\n",
610
+ "\n",
611
+ "# Chuyển y_test và y_pred sang nhãn gốc\n",
612
+ "y_test_labels = [label_map[i] for i in y_test]\n",
613
+ "y_pred_labels = [label_map[i] for i in y_pred]\n",
614
+ "\n",
615
+ "# In classification report với nhãn thật\n",
616
+ "print(\"\\nClassification Report (theo label gốc):\")\n",
617
+ "print(classification_report(y_test_labels, y_pred_labels, digits=4))\n",
618
+ "\n",
619
+ "\n"
620
+ ],
621
+ "outputs": [
622
+ {
623
+ "name": "stdout",
624
+ "output_type": "stream",
625
+ "text": [
626
+ "\n",
627
+ "Classification Report (theo label gốc):\n",
628
+ " precision recall f1-score support\n",
629
+ "\n",
630
+ " B-LOC 0.3679 0.5000 0.4239 1242\n",
631
+ " B-ORG 0.2639 0.3942 0.3161 241\n",
632
+ " B-PER 0.4395 0.7490 0.5540 1490\n",
633
+ " I-LOC 0.2321 0.4448 0.3050 553\n",
634
+ " I-ORG 0.1532 0.2878 0.2000 410\n",
635
+ " I-PER 0.4304 0.5863 0.4964 701\n",
636
+ " O 0.9869 0.9478 0.9669 68998\n",
637
+ "\n",
638
+ " accuracy 0.9235 73635\n",
639
+ " macro avg 0.4106 0.5586 0.4660 73635\n",
640
+ "weighted avg 0.9474 0.9235 0.9336 73635\n",
641
+ "\n"
642
+ ]
643
+ }
644
+ ],
645
+ "execution_count": 26
646
+ },
647
+ {
648
+ "metadata": {
649
+ "ExecuteTime": {
650
+ "end_time": "2025-06-11T00:45:00.649942Z",
651
+ "start_time": "2025-06-11T00:45:00.646595Z"
652
+ }
653
+ },
654
+ "cell_type": "code",
655
+ "source": "print(model.feature_importance().shape)",
656
+ "id": "b1cf76bc3e58bc93",
657
+ "outputs": [
658
+ {
659
+ "name": "stdout",
660
+ "output_type": "stream",
661
+ "text": [
662
+ "(768,)\n"
663
+ ]
664
+ }
665
+ ],
666
+ "execution_count": 35
667
+ },
668
+ {
669
+ "metadata": {
670
+ "ExecuteTime": {
671
+ "end_time": "2025-06-11T00:52:36.844604Z",
672
+ "start_time": "2025-06-11T00:52:36.827018Z"
673
+ }
674
+ },
675
+ "cell_type": "code",
676
+ "source": [
677
+ "correct = 0\n",
678
+ "for i in range(73635):\n",
679
+ " if y_pred[i] == y_test[i]:\n",
680
+ " correct += 1\n",
681
+ "correct"
682
+ ],
683
+ "id": "39d391e67a51211c",
684
+ "outputs": [
685
+ {
686
+ "data": {
687
+ "text/plain": [
688
+ "68001"
689
+ ]
690
+ },
691
+ "execution_count": 58,
692
+ "metadata": {},
693
+ "output_type": "execute_result"
694
+ }
695
+ ],
696
+ "execution_count": 58
697
+ },
698
+ {
699
+ "metadata": {
700
+ "ExecuteTime": {
701
+ "end_time": "2025-06-11T00:57:45.109129Z",
702
+ "start_time": "2025-06-11T00:57:45.105078Z"
703
+ }
704
+ },
705
+ "cell_type": "code",
706
+ "source": "print(y_test.shape)",
707
+ "id": "1a0ba8f0410c5589",
708
+ "outputs": [
709
+ {
710
+ "name": "stdout",
711
+ "output_type": "stream",
712
+ "text": [
713
+ "(73635,)\n"
714
+ ]
715
+ }
716
+ ],
717
+ "execution_count": 61
718
+ }
719
+ ],
720
+ "metadata": {
721
+ "kernelspec": {
722
+ "display_name": "Python 3",
723
+ "language": "python",
724
+ "name": "python3"
725
+ },
726
+ "language_info": {
727
+ "codemirror_mode": {
728
+ "name": "ipython",
729
+ "version": 2
730
+ },
731
+ "file_extension": ".py",
732
+ "mimetype": "text/x-python",
733
+ "name": "python",
734
+ "nbconvert_exporter": "python",
735
+ "pygments_lexer": "ipython2",
736
+ "version": "2.7.6"
737
+ }
738
+ },
739
+ "nbformat": 4,
740
+ "nbformat_minor": 5
741
+ }
space/space/space/space/space/space/notebooks/Kien_Rule_base.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
space/space/space/space/space/space/notebooks/Softmax_PhoBERT.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
space/space/space/space/space/space/requirements.txt ADDED
Binary file (2.43 kB). View file
 
space/space/space/space/space/space/run.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.preprocessing import download_raw_data, preprocess_data_for_EDA, load_phoBERT_model_and_tokenizer, create_embeddings, split_dataset
2
+ from src.data_set import NerDataset, collate_fn
3
+ from src.configs import configs
4
+ from src.model import CRF_Tagger
5
+ from src.train import train_model
6
+
7
+ import torch
8
+ from torch.utils.data import DataLoader
9
+
10
+ import warnings
11
+ warnings.filterwarnings("ignore")
12
+
13
+
14
+ def main():
15
+
16
+ # Download VLSP2016 from hgface
17
+ print("Download raw data ...")
18
+ df = download_raw_data()
19
+
20
+ # Save raw data
21
+ df.to_csv(r".\data\raw_data.csv", index=False)
22
+ print("Save at data\raw_data.csv \n")
23
+
24
+ # Process data for EDA
25
+ print("Process data for EDA ...")
26
+ df = preprocess_data_for_EDA(df)
27
+ df.to_csv(r".\data\processed_data_EDA.csv", index=False)
28
+ print("Save at data\processed_data_EDA.csv \n")
29
+
30
+ # Init PhoBERT Tokenizer and PhoBERT Model
31
+ print("Embedding data ...")
32
+ model, tokenizer = load_phoBERT_model_and_tokenizer()
33
+
34
+ # Embeddings data
35
+ processed_data = create_embeddings(df, model, tokenizer)
36
+ torch.save(processed_data, r".\data\processed_data_full.pt")
37
+ print("Save at data\processed_data_full.pt \n")
38
+
39
+ # Split data into train/valid/test
40
+ print("Train/Valid/Test Split ...")
41
+ X_train, Y_train, X_val, Y_val, X_test, Y_test = split_dataset(processed_data)
42
+ print("Done \n")
43
+
44
+ # Data Agumentation for training set
45
+ # Pass
46
+
47
+ # Init DataLoader
48
+ print("Init DataLoader ...")
49
+ datasets = {
50
+ 'train': NerDataset(X_train, Y_train),
51
+ 'val': NerDataset(X_val, Y_val),
52
+ 'test': NerDataset(X_test, Y_test)
53
+ }
54
+
55
+ loaders = {
56
+ split: DataLoader(dataset, batch_size=configs["batch_size"], shuffle=(split=='train'), collate_fn=collate_fn)
57
+ for split, dataset in datasets.items()
58
+ }
59
+ print("Done \n")
60
+
61
+ # Init sequence label model
62
+ print("Init Model ...")
63
+ NUM_TAGS = 7
64
+ model = CRF_Tagger(input_dim=X_train[0].size(1), num_tags=NUM_TAGS)
65
+ optimizer = torch.optim.Adam(model.parameters(), lr=configs["learning_rate"])
66
+ print("Done \n")
67
+
68
+ # Training Model
69
+ print("Start training ...")
70
+ train_model(model, optimizer, configs, loaders)
71
+
72
+ if __name__ == "__main__":
73
+ main()
space/space/space/space/space/space/space/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
space/space/space/space/space/space/space/README.md ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Vietnamese NER Demo
3
+ emoji: 🧠
4
+ colorFrom: indigo
5
+ colorTo: yellow
6
+ sdk: streamlit
7
+ sdk_version: 1.46.1
8
+ app_file: src/app.py
9
+ pinned: false
10
+ ---
11
+ # Vietnamese Named Entity Recognition
12
+
13
+ ## 🛠️ Set Up Your Environment With Conda
14
+
15
+ ### Option 1: Using `requirements.txt`
16
+
17
+ ```bash
18
+ conda create --name vnner python=3.10
19
+ conda activate vnner
20
+ pip install -r requirements.txt
21
+ ```
22
+
23
+ ### Option 2: Using `environment.yml`
24
+
25
+ ```bash
26
+ conda env create -f environment.yml
27
+ conda activate vnner
28
+ ```
29
+
30
+ ## Run
31
+ ```bash
32
+ python run.py
33
+ ```
34
+ ---
35
+
36
+ ## 📂 Project Structure
37
+
38
+ ```
39
+ my_ai_project/
40
+
41
+ ├── data/
42
+ │ ├── raw_data.csv # Dữ liệu gốc
43
+ │ ├── processed_data_EDA.csv # Dữ liệu sau khi tiền xử lý
44
+ │ └── processed_data_full.csv # Dữ liệu sẵn sàng training
45
+
46
+ ├── notebooks/ # Thử nghiệm và khám phá dữ liệu
47
+ │ ├── Duc_Notebook.ipynb # CRF + RandomForest
48
+ │ ├── Softmax_PhoBERT.ipynb # Softmax
49
+
50
+ ├── src/ # Mã nguồn chính của dự án
51
+ │ ├── __init__.py
52
+ │ ├── data_loader.py # Nạp và xử lý dữ liệu
53
+ │ ├── preprocessing.py # Hàm tiền xử lý dữ liệu
54
+ │ ├── model.py # Định nghĩa kiến trúc mô hình
55
+ │ ├── train.py # Huấn luyện mô hình
56
+ │ ├── evaluate.py # Đánh giá mô hình
57
+ │ └── predict.py # Dự đoán với mô hình đã huấn luyện
58
+
59
+ ├── models/ # Mô hình đã lưu sau khi huấn luyện
60
+ │ └── best_model.pth # File trọng số mô hình
61
+
62
+ ├── outputs/ # Kết quả, biểu đồ, log, metrics
63
+ │ ├── logs/ # Nhật ký huấn luyện (tensorboard/logging)
64
+ │ └── figures/ # Biểu đồ trực quan hóa
65
+
66
+ ├── configs/ # File cấu hình cho mô hình, huấn luyện
67
+ │ └── config.yaml
68
+
69
+ ├── tests/ # Unit test cho các hàm chính
70
+
71
+ ├── requirements.txt # Thư viện cần cài đặt
72
+ ├── environment.yml # Môi trường Conda
73
+ ├── README.md # Giới thiệu dự án
74
+ └── run.py # Script chính để chạy toàn bộ pipeline
75
+ ```
76
+
77
+ ---
78
+
79
+ ## 📚 Additional Resources (Optional)
80
+
81
+ If you have any questions about the project structure, consider reading these helpful articles first:
82
+
83
+ * [Understanding `__init__.py`](https://zetcode.com/python/init-file/)
84
+ * [Markdown Basic Syntax](https://www.markdownguide.org/basic-syntax/#escaping-characters)
85
+ * [Difference Between `requirements.txt` and `environment.yml`](https://www.reddit.com/r/learnpython/comments/xvlpdz/why_do_people_provide_a_requirementstxt_or/)
86
+
87
+ These resources could be useful for you!
space/space/space/space/space/space/src/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ """Marks the directory as a Python package."""
2
+ __version__ = "1.0.0"
3
+ __author__ = "Duc Lai"
4
+ PACKAGE_NAME = "src"
space/space/space/space/space/space/src/app.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ from src.predict import predict_demo
4
+ from src.front import render_html
5
+
6
+ st.set_page_config(page_title="Vietnamese NER", layout="wide")
7
+
8
+ # ===== Tiêu đề chính =====
9
+ st.title("🔍 Ứng dụng nhận diện thực thể có tên (NER) cho tiếng Việt")
10
+
11
+ # Tabs
12
+ tab1, tab2, tab3 = st.tabs(["📊 Phân tích dữ liệu", "📈 Kết quả huấn luyện", "🧪 Demo mô hình"])
13
+
14
+ # --- Tab 1: PHÂN TÍCH DỮ LIỆU ---
15
+ with tab1:
16
+ st.header("📊 Phân tích dữ liệu")
17
+
18
+ df = pd.DataFrame({
19
+ "Loại thực thể": ["PER", "LOC", "ORG", "MISC"],
20
+ "Số lượng": [3200, 2500, 1800, 900]
21
+ })
22
+
23
+ st.bar_chart(df.set_index("Loại thực thể"))
24
+
25
+ # --- Tab 2: KẾT QUẢ HUẤN LUYỆN ---
26
+ with tab2:
27
+ st.header("📈 Kết quả huấn luyện")
28
+
29
+ loss = [0.9, 0.7, 0.5, 0.35, 0.28]
30
+ epoch = [1, 2, 3, 4, 5]
31
+ df_loss = pd.DataFrame({"Epoch": epoch, "Loss": loss})
32
+ st.line_chart(df_loss.set_index("Epoch"))
33
+
34
+ st.subheader("Đánh giá mô hình")
35
+ df_eval = pd.DataFrame({
36
+ "Phiên bản": ["v1", "v2", "v3"],
37
+ "F1-score": [0.78, 0.83, 0.86],
38
+ "Accuracy": [0.81, 0.85, 0.88]
39
+ })
40
+ st.dataframe(df_eval)
41
+
42
+ # --- Tab 3: DEMO MÔ HÌNH ---
43
+ with tab3:
44
+ st.header("🧪 Vietnamese Named Entity Recognition")
45
+
46
+ text = st.text_input("Nhập văn bản tiếng Việt:", "Nguyễn Văn A đang làm việc tại Hà Nội")
47
+
48
+ if st.button("Phân tích"):
49
+ if not text.strip():
50
+ st.warning("Vui lòng nhập văn bản!")
51
+ else:
52
+ tokens, labels = predict_demo(text)
53
+
54
+ st.subheader("Thực thể được phát hiện")
55
+ entities = [(tok, lab) for tok, lab in zip(tokens, labels) if lab != "O"]
56
+
57
+ if entities:
58
+ for tok, lab in entities:
59
+ st.markdown(f"🔹 **{tok}** — *{lab}*")
60
+ else:
61
+ st.info("Không phát hiện thực thể.")
62
+
63
+ st.subheader("Highlight trong văn bản:")
64
+ st.markdown(render_html(tokens, labels), unsafe_allow_html=True)
space/space/space/space/space/space/src/configs.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ configs = {
2
+ # Init
3
+ "project": "NER",
4
+ "name": "CRF_VLSP2016_Ultra",
5
+ "model": "Linear/CRF",
6
+
7
+ # Hyperparameters
8
+ "optim": "Adam",
9
+ "learning_rate": 1e-3,
10
+ "batch_size": 16,
11
+ "epochs": 20,
12
+ "train_ratio": 0.7,
13
+ "val_ratio": 0.15,
14
+ "test_ratio": 0.15
15
+ }
space/space/space/space/space/space/src/data_set.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from torch.utils.data import Dataset
2
+ import torch
3
+
4
+ class NerDataset(Dataset):
5
+ def __init__(self, embeddings, labels):
6
+ super().__init__()
7
+ self.embeddings = embeddings
8
+ self.labels = labels
9
+
10
+ def __len__(self):
11
+ return len(self.embeddings)
12
+
13
+ def __getitem__(self, idx):
14
+ return self.embeddings[idx], self.labels[idx]
15
+
16
+ def collate_fn(batch): # Batch_size x Seq_length x 768
17
+ embeddings, labels = zip(*batch)
18
+ lengths = [e.size(0) for e in embeddings]
19
+ max_len = max(lengths)
20
+
21
+ padded_embs = torch.stack([
22
+ torch.cat([e, torch.zeros(max_len - e.size(0), e.size(1))]) for e in embeddings
23
+ ])
24
+
25
+ padded_labels = torch.stack([
26
+ torch.cat([l, torch.full((max_len - l.size(0),), -1, dtype=torch.long)]) for l in labels
27
+ ])
28
+
29
+ return padded_embs, padded_labels, lengths
30
+
31
+
space/space/space/space/space/space/src/evaluate.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.predict import predict
2
+ from sklearn.metrics import precision_recall_fscore_support, accuracy_score, classification_report
3
+
4
+ def evaluate(model, loader, count_loss=True, report=False):
5
+
6
+ # Model Preidction (Inference)
7
+ all_preds, all_true, loss = predict(model, loader, count_loss)
8
+ class_report = None
9
+
10
+ # Get evaluation metric
11
+ precision, recall, f1, _ = precision_recall_fscore_support(all_true, all_preds, average='macro', zero_division=0)
12
+ acc = accuracy_score(all_true, all_preds)
13
+
14
+ # Get classification report
15
+ if report:
16
+ class_report = classification_report(all_true, all_preds)
17
+
18
+ return precision, recall, f1, acc, loss, class_report
19
+
20
+ def evaluate_ignore_O(model, loader):
21
+ pass
space/space/space/space/space/space/src/front.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def render_html(tokens, labels):
    """Render tokens as HTML with IOB entity spans highlighted.

    PER/ORG/LOC spans each get their own background colour; any other entity
    type falls back to light gray.
    """
    label_colors = {
        "PER": "lightcoral",   # soft red
        "ORG": "lightblue",    # soft blue
        "LOC": "lightgreen",   # soft green
    }

    parts = []
    current_label = None

    for tok, label in zip(tokens, labels):
        if label.startswith("B-"):
            # Close any span still open before opening a new one.
            if current_label:
                parts.append("</span> ")
            current_label = label[2:]
            color = label_colors.get(current_label, "lightgray")
            parts.append(
                f"<span style='background-color:{color};padding:2px;border-radius:4px;' title='{current_label}'>{tok}"
            )
        elif label.startswith("I-") and current_label:
            # Continuation of the currently open entity span.
            parts.append(f" {tok}")
        else:
            # Plain token; close an open span if there is one.
            if current_label:
                parts.append("</span> ")
                current_label = None
            parts.append(f"{tok} ")

    if current_label:
        parts.append("</span>")

    html = "".join(parts)
    return f"<div style='font-family:monospace;font-size:16px'>{html.strip()}</div>"
space/space/space/space/space/space/src/model.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from torchcrf import CRF
2
+ import torch.nn as nn
3
+
4
class CRF_Tagger(nn.Module):
    """Linear projection from contextual embeddings to tag emissions + a CRF.

    forward() returns the negative mean log-likelihood (a minimizable loss);
    decode() returns the Viterbi-best tag sequence for each sentence.
    Attribute names (embed2tag, crf) are part of the checkpoint state_dict
    and must not be renamed.
    """

    def __init__(self, input_dim, num_tags):
        super().__init__()
        self.embed2tag = nn.Linear(input_dim, num_tags)
        self.crf = CRF(num_tags, batch_first=True)

    def forward(self, x, labels, mask):
        # CRF returns a log-likelihood; negate it to obtain a loss.
        return -self.crf(self.embed2tag(x), labels, mask=mask, reduction="mean")

    def decode(self, x, mask=None):
        return self.crf.decode(self.embed2tag(x), mask)
space/space/space/space/space/space/src/predict.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from src.model import CRF_Tagger
3
+ from src.preprocessing import process_demo_sentence
4
+
5
def predict(model, loader, count_loss=True):
    """Run the model over `loader` and collect flat prediction/gold lists.

    Padding positions (label == -1) are excluded via the mask. Returns
    (all_preds, all_true, avg_loss); avg_loss stays 0.0 when count_loss
    is False.
    """
    model.eval()  # inference mode: disables dropout, batchnorm updates, ...
    all_preds, all_true = [], []
    total_loss = 0.0

    with torch.no_grad():  # no gradient tracking needed for inference
        for x, y, _ in loader:
            mask = (y != -1)

            # Accumulate the batch loss if requested
            if count_loss:
                total_loss += model(x, y, mask).item()

            # Viterbi-decode the batch
            decoded = model.decode(x, mask)

            # Flatten each sentence's predictions / gold labels
            for pred_seq, gold_seq, m in zip(decoded, y, mask):
                all_preds.extend(pred_seq)
                all_true.extend(gold_seq[m].tolist())  # tensor[boolean mask]

    return all_preds, all_true, total_loss / len(loader)
29
+
30
def predict_demo(text):
    """Tag one raw Vietnamese sentence and return (tokens, IOB label strings)."""
    id_tag = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}

    x, tokens = process_demo_sentence(text)  # 1 x seq_length x 768
    NUM_TAGS = 7

    model = CRF_Tagger(input_dim=x.size(2), num_tags=NUM_TAGS)
    # BUG FIX: the previous path ".\models\best_epoch_16.pt" contained "\b",
    # which Python parses as a backspace character, so the checkpoint file
    # could never be found. Forward slashes are portable on every OS; the
    # explicit map_location also lets CPU-only hosts load GPU-saved weights.
    model.load_state_dict(torch.load("./models/best_epoch_16.pt", map_location="cpu"))
    model.eval()
    with torch.no_grad():
        preds = model.decode(x)

    # decode() returns one sequence per batch item; we only have one sentence.
    labels = [id_tag[lab] for lab in preds[0]]

    return tokens, labels
space/space/space/space/space/space/src/preprocessing.py ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import torch
3
+ from transformers import AutoTokenizer, AutoModel
4
+ from tqdm import tqdm
5
+ from sklearn.model_selection import train_test_split
6
+ from src.configs import configs
7
+ from pyvi import ViTokenizer
8
+
9
def join_tokens(tokens):
    """Join word-segmented tokens into a single space-separated string."""
    return " ".join(tokens)
12
+
13
def reform_raw_text(tokens):
    """Rebuild raw text: join tokens, then undo the underscore word-joining."""
    return " ".join(tokens).replace("_", " ")
16
+
17
def label(x):
    """Convert a sequence of numeric tag ids to their IOB string labels.

    FIX: removed the stray trailing comma from the signature (`def label(x, )`).
    Values are coerced with int() so numpy/float ids also map correctly.
    """
    id_tag = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}
    return [id_tag[int(i)] for i in x]
20
+
21
def replace_7_8(lst):
    """Collapse the out-of-range tag ids 7 and 8 down to 0 (the 'O' tag)."""
    return [0 if tag_id in (7, 8) else tag_id for tag_id in lst]
23
+
24
+ # Hàm gộp các embedding vectors của token bị tách ra khi qua SentencePiece
25
def group_embeddings(tokens, embeddings):
    """Merge subword-piece embeddings back into word-level embeddings.

    Non-final subword pieces carry a trailing "@@" marker; a word's embedding
    is the mean of its pieces. The special tokens <s> and </s> are dropped.
    Returns a list of per-word embedding tensors.
    """
    word_embeddings = []
    pending = []  # pieces of the word currently being assembled

    for token, emb in zip(tokens, embeddings):
        if token in ("<s>", "</s>"):
            continue

        pending.append(emb)
        if not token.endswith("@@"):
            # Final piece of the word: average everything collected so far.
            word_embeddings.append(torch.mean(torch.stack(pending), dim=0))
            pending = []

    if pending:  # dangling pieces left at the end of the sentence
        word_embeddings.append(torch.mean(torch.stack(pending), dim=0))

    return word_embeddings
46
+
47
+
48
+ # Download the dataset form Hugging Face
49
def download_raw_data():
    """Download the VLSP2016-NER train/valid parquet splits from Hugging Face
    and concatenate them into one DataFrame.

    Requires network access (hf:// protocol). Both splits are merged because
    the project performs its own train/val/test split later (split_dataset).
    """
    splits = {'train': 'data/train-00000-of-00001-b0417886a268b83a.parquet', 'valid': 'data/valid-00000-of-00001-846411c236133ba3.parquet'}
    df_train = pd.read_parquet("hf://datasets/datnth1709/VLSP2016-NER-data/" + splits["train"])
    df_valid = pd.read_parquet("hf://datasets/datnth1709/VLSP2016-NER-data/" + splits["valid"])
    # Re-index so the merged frame has a clean 0..N-1 index.
    df = pd.concat([df_train, df_valid]).reset_index(drop=True)

    return df
56
+
57
+ # Process dataframe for EDA
58
def preprocess_data_for_EDA(df):
    """Clean and enrich the raw dataframe for exploratory analysis.

    Collapses the out-of-range tag ids 7/8 to 'O', derives segmented/raw text
    columns and string labels, and renames the columns. Mutates `df` in place
    and also returns it.

    FIX: removed the unused local dicts `tag_id`/`id_tag` — the label mapping
    lives inside label() and these copies were dead code.
    """
    # Remove inappropriate tags and derive helper columns
    df['ner_tags'] = df['ner_tags'].apply(replace_7_8)
    df['text_withseg'] = df['tokens'].apply(join_tokens)
    df['text_raw'] = df['tokens'].apply(reform_raw_text)
    df["ner_labels"] = df.ner_tags.apply(label)
    # Positional rename: relies on the original column order (tokens, ner_tags)
    # plus the three columns added above.
    df.columns = ['tokens', 'id_labels', 'seg_text', 'raw_text', 'labels']

    return df
71
+
72
+
73
+
74
+
75
def load_phoBERT_model_and_tokenizer():
    """Load the pretrained PhoBERT-base encoder and its tokenizer.

    Returns (model, tokenizer). The model is put in eval mode because it is
    used only as a frozen feature extractor. Weights are downloaded from the
    Hugging Face hub on first call.
    """
    # use_fast=False — presumably required by PhoBERT's tokenizer; confirm.
    tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base", use_fast=False)
    model = AutoModel.from_pretrained("vinai/phobert-base")
    model.eval()
    return model, tokenizer
81
+
82
+
83
+ # Embedding text
84
def create_embeddings(df, model, tokenizer):
    """Encode every sentence in `df` with PhoBERT and collect word embeddings.

    Rows whose merged embedding count does not match the label count are
    skipped (their indices are gathered in remove_index but not returned).

    Returns:
        dict with "embeddings" (list of (seq_len_i, 768) tensors) and
        "labels" (list of (seq_len_i,) long tensors), ready for training.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    all_embeddings = []  # list of [seq_len_i, 768] tensors
    all_labels = []      # list of [seq_len_i,] tensors
    remove_index = []    # indices of skipped rows — NOTE: collected but never returned

    for i, row in tqdm(df.iterrows(), total=len(df)):

        # Per-row fields
        sentence = row['seg_text']
        gold_labels = row["id_labels"]

        # Run the sentence through the SentencePiece tokenizer
        input_ids = tokenizer.encode(sentence, return_tensors="pt").to(device)

        tokens = tokenizer.convert_ids_to_tokens(input_ids[0].cpu())

        # Encode into contextual embeddings (frozen model, no gradients)
        with torch.no_grad():
            outputs = model(input_ids)
            last_hidden_state = outputs.last_hidden_state.squeeze(0).cpu()

        # Merge the embeddings that SentencePiece split into subword pieces
        word_embeds = group_embeddings(tokens, last_hidden_state)

        # If embedding and label counts conflict, drop the row
        if len(word_embeds) != len(gold_labels):
            # print(f"Warning: Skip row {i} - length mismatch")
            remove_index.append(i)
            continue

        # Append to the global lists; data is now training-ready
        all_embeddings.append(torch.stack(word_embeds))
        all_labels.append(torch.tensor(gold_labels))

    # Bundle results into a dict
    processed_data = {
        "embeddings": all_embeddings,
        "labels": all_labels
    }

    return processed_data
128
+
129
+
130
def split_dataset(data):
    """Split embeddings/labels into train/val/test using the ratios in `configs`.

    Returns (X_train, Y_train, X_val, Y_val, X_test, Y_test). Fixed
    random_state keeps the split reproducible across runs.
    """
    # First carve the test set off the full data
    X_train_val, X_test, Y_train_val, Y_test = train_test_split(
        data["embeddings"], data["labels"],
        test_size=configs["test_ratio"], random_state=42)

    # Then split the remainder into train and validation
    val_share = configs["val_ratio"] / (configs["val_ratio"] + configs["train_ratio"])
    X_train, X_val, Y_train, Y_val = train_test_split(
        X_train_val, Y_train_val, test_size=val_share, random_state=42)

    return X_train, Y_train, X_val, Y_val, X_test, Y_test
140
+
141
+
142
+ # TODO: Refactor hàm process_demo_sentence, và hàm predict demo, warning nếu độ dài tokens_word không bằng độ dài sau group_embeddings
143
+
144
def process_demo_sentence(text):
    """
    Embed a single raw sentence for inference.

    Returns:
        (embeddings, tokens_word): a tensor of shape 1 x seq_length x 768
        and the pyvi word-segmented tokens of the input.
    """
    # Word-segment with pyvi (multi-word compounds are joined by underscores)
    segmented_text = ViTokenizer.tokenize(text)
    tokens_word = segmented_text.strip().split(" ")

    # NOTE: reloads PhoBERT on every call — acceptable for a demo, slow otherwise
    model, tokenizer = load_phoBERT_model_and_tokenizer()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    input_ids = tokenizer.encode(segmented_text, return_tensors="pt").to(device)

    tokens = tokenizer.convert_ids_to_tokens(input_ids[0].cpu())

    with torch.no_grad():
        outputs = model(input_ids)
        last_hidden_state = outputs.last_hidden_state.squeeze(0).cpu()

    # Merge subword pieces back into word-level vectors
    word_embeds = group_embeddings(tokens, last_hidden_state)

    all_embeddings = torch.stack(word_embeds)  # seq_length x 768

    all_embeddings = all_embeddings.unsqueeze(0)  # add batch dim of 1 -> 1 x seq_length x 768

    # NOTE(review): len(word_embeds) can differ from len(tokens_word); callers
    # zip the two outputs together, so a mismatch silently misaligns labels —
    # see the TODO above about emitting a warning. Confirm before relying on it.
    return all_embeddings, tokens_word
171
+
space/space/space/space/space/space/src/torchcrf/__init__.py ADDED
@@ -0,0 +1,340 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __version__ = '0.7.2'
2
+
3
+ from typing import List, Optional
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+
8
+
9
class CRF(nn.Module):
    """Conditional random field.

    This module implements a conditional random field [LMP01]_. The forward computation
    of this class computes the log likelihood of the given sequence of tags and
    emission score tensor. This class also has `~CRF.decode` method which finds
    the best tag sequence given an emission score tensor using `Viterbi algorithm`_.

    NOTE (review): this appears to be a vendored copy of the ``pytorch-crf``
    package (version 0.7.2) — confirm provenance before modifying.

    Args:
        num_tags: Number of tags.
        batch_first: Whether the first dimension corresponds to the size of a minibatch.

    Attributes:
        start_transitions (`~torch.nn.Parameter`): Start transition score tensor of size
            ``(num_tags,)``.
        end_transitions (`~torch.nn.Parameter`): End transition score tensor of size
            ``(num_tags,)``.
        transitions (`~torch.nn.Parameter`): Transition score tensor of size
            ``(num_tags, num_tags)``.


    .. [LMP01] Lafferty, J., McCallum, A., Pereira, F. (2001).
        "Conditional random fields: Probabilistic models for segmenting and
        labeling sequence data". *Proc. 18th International Conf. on Machine
        Learning*. Morgan Kaufmann. pp. 282–289.

    .. _Viterbi algorithm: https://en.wikipedia.org/wiki/Viterbi_algorithm
    """

    def __init__(self, num_tags: int, batch_first: bool = False) -> None:
        if num_tags <= 0:
            raise ValueError(f'invalid number of tags: {num_tags}')
        super().__init__()
        self.num_tags = num_tags
        self.batch_first = batch_first
        self.start_transitions = nn.Parameter(torch.empty(num_tags))
        self.end_transitions = nn.Parameter(torch.empty(num_tags))
        self.transitions = nn.Parameter(torch.empty(num_tags, num_tags))

        self.reset_parameters()

    def reset_parameters(self) -> None:
        """Initialize the transition parameters.

        The parameters will be initialized randomly from a uniform distribution
        between -0.1 and 0.1.
        """
        nn.init.uniform_(self.start_transitions, -0.1, 0.1)
        nn.init.uniform_(self.end_transitions, -0.1, 0.1)
        nn.init.uniform_(self.transitions, -0.1, 0.1)

    def __repr__(self) -> str:
        return f'{self.__class__.__name__}(num_tags={self.num_tags})'

    def forward(
            self,
            emissions: torch.Tensor,
            tags: torch.LongTensor,
            mask: Optional[torch.ByteTensor] = None,
            reduction: str = 'sum',
    ) -> torch.Tensor:
        """Compute the conditional log likelihood of a sequence of tags given emission scores.

        Args:
            emissions (`~torch.Tensor`): Emission score tensor of size
                ``(seq_length, batch_size, num_tags)`` if ``batch_first`` is ``False``,
                ``(batch_size, seq_length, num_tags)`` otherwise.
            tags (`~torch.LongTensor`): Sequence of tags tensor of size
                ``(seq_length, batch_size)`` if ``batch_first`` is ``False``,
                ``(batch_size, seq_length)`` otherwise.
            mask (`~torch.ByteTensor`): Mask tensor of size ``(seq_length, batch_size)``
                if ``batch_first`` is ``False``, ``(batch_size, seq_length)`` otherwise.
            reduction: Specifies the reduction to apply to the output:
                ``none|sum|mean|token_mean``. ``none``: no reduction will be applied.
                ``sum``: the output will be summed over batches. ``mean``: the output will be
                averaged over batches. ``token_mean``: the output will be averaged over tokens.

        Returns:
            `~torch.Tensor`: The log likelihood. This will have size ``(batch_size,)`` if
            reduction is ``none``, ``()`` otherwise.
        """
        self._validate(emissions, tags=tags, mask=mask)
        if reduction not in ('none', 'sum', 'mean', 'token_mean'):
            raise ValueError(f'invalid reduction: {reduction}')
        if mask is None:
            # No mask supplied: treat every position as valid.
            mask = torch.ones_like(tags, dtype=torch.uint8)

        if self.batch_first:
            # Internally everything is computed time-major.
            emissions = emissions.transpose(0, 1)
            tags = tags.transpose(0, 1)
            mask = mask.transpose(0, 1)

        # shape: (batch_size,)
        numerator = self._compute_score(emissions, tags, mask)
        # shape: (batch_size,)
        denominator = self._compute_normalizer(emissions, mask)
        # shape: (batch_size,)
        llh = numerator - denominator

        if reduction == 'none':
            return llh
        if reduction == 'sum':
            return llh.sum()
        if reduction == 'mean':
            return llh.mean()
        assert reduction == 'token_mean'
        return llh.sum() / mask.type_as(emissions).sum()

    @torch.jit.export
    def decode(self, emissions: torch.Tensor,
               mask: Optional[torch.ByteTensor] = None) -> List[List[int]]:
        """Find the most likely tag sequence using Viterbi algorithm.

        Args:
            emissions (`~torch.Tensor`): Emission score tensor of size
                ``(seq_length, batch_size, num_tags)`` if ``batch_first`` is ``False``,
                ``(batch_size, seq_length, num_tags)`` otherwise.
            mask (`~torch.ByteTensor`): Mask tensor of size ``(seq_length, batch_size)``
                if ``batch_first`` is ``False``, ``(batch_size, seq_length)`` otherwise.

        Returns:
            List of list containing the best tag sequence for each batch.
        """
        self._validate(emissions, mask=mask)
        if mask is None:
            # Default mask: every position of every sequence is valid.
            mask = emissions.new_ones(emissions.shape[:2], dtype=torch.uint8)

        if self.batch_first:
            emissions = emissions.transpose(0, 1)
            mask = mask.transpose(0, 1)

        return self._viterbi_decode(emissions, mask)

    def _validate(
            self,
            emissions: torch.Tensor,
            tags: Optional[torch.LongTensor] = None,
            mask: Optional[torch.ByteTensor] = None) -> None:
        # Shape/consistency checks shared by forward() and decode().
        if emissions.dim() != 3:
            raise ValueError(f'emissions must have dimension of 3, got {emissions.dim()}')
        if emissions.size(2) != self.num_tags:
            raise ValueError(
                f'expected last dimension of emissions is {self.num_tags}, '
                f'got {emissions.size(2)}')

        if tags is not None:
            if emissions.shape[:2] != tags.shape:
                raise ValueError(
                    'the first two dimensions of emissions and tags must match, '
                    f'got {(emissions.shape[0], emissions.shape[1])} and {(tags.shape[0], tags.shape[1])}'
                )

        if mask is not None:
            if emissions.shape[:2] != mask.shape:
                raise ValueError(
                    'the first two dimensions of emissions and mask must match, '
                    f'got {(emissions.shape[0], emissions.shape[1])} and {(mask.shape[0], mask.shape[1])}'
                )
            no_empty_seq = not self.batch_first and mask[0].all()
            no_empty_seq_bf = self.batch_first and mask[:, 0].all()
            if not no_empty_seq and not no_empty_seq_bf:
                raise ValueError('mask of the first timestep must all be on')

    def _compute_score(
            self, emissions: torch.Tensor, tags: torch.LongTensor,
            mask: torch.ByteTensor) -> torch.Tensor:
        # Score of the GIVEN tag path (numerator of the log-likelihood).
        # emissions: (seq_length, batch_size, num_tags)
        # tags: (seq_length, batch_size)
        # mask: (seq_length, batch_size)
        assert emissions.dim() == 3 and tags.dim() == 2
        assert emissions.shape[:2] == tags.shape
        assert emissions.size(2) == self.num_tags
        assert mask.shape == tags.shape
        assert mask[0].all()

        seq_length, batch_size = tags.shape
        mask = mask.type_as(emissions)

        # Start transition score and first emission
        # shape: (batch_size,)
        score = self.start_transitions[tags[0]]
        score += emissions[0, torch.arange(batch_size), tags[0]]

        for i in range(1, seq_length):
            # Transition score to next tag, only added if next timestep is valid (mask == 1)
            # shape: (batch_size,)
            score += self.transitions[tags[i - 1], tags[i]] * mask[i]

            # Emission score for next tag, only added if next timestep is valid (mask == 1)
            # shape: (batch_size,)
            score += emissions[i, torch.arange(batch_size), tags[i]] * mask[i]

        # End transition score
        # shape: (batch_size,)
        seq_ends = mask.long().sum(dim=0) - 1
        # shape: (batch_size,)
        last_tags = tags[seq_ends, torch.arange(batch_size)]
        # shape: (batch_size,)
        score += self.end_transitions[last_tags]

        return score

    def _compute_normalizer(
            self, emissions: torch.Tensor, mask: torch.ByteTensor) -> torch.Tensor:
        # Log partition function over ALL tag paths (denominator), via the
        # forward algorithm in log space.
        # emissions: (seq_length, batch_size, num_tags)
        # mask: (seq_length, batch_size)
        assert emissions.dim() == 3 and mask.dim() == 2
        assert emissions.shape[:2] == mask.shape
        assert emissions.size(2) == self.num_tags
        assert mask[0].all()

        seq_length = emissions.size(0)

        # Start transition score and first emission; score has size of
        # (batch_size, num_tags) where for each batch, the j-th column stores
        # the score that the first timestep has tag j
        # shape: (batch_size, num_tags)
        score = self.start_transitions + emissions[0]

        for i in range(1, seq_length):
            # Broadcast score for every possible next tag
            # shape: (batch_size, num_tags, 1)
            broadcast_score = score.unsqueeze(2)

            # Broadcast emission score for every possible current tag
            # shape: (batch_size, 1, num_tags)
            broadcast_emissions = emissions[i].unsqueeze(1)

            # Compute the score tensor of size (batch_size, num_tags, num_tags) where
            # for each sample, entry at row i and column j stores the sum of scores of all
            # possible tag sequences so far that end with transitioning from tag i to tag j
            # and emitting
            # shape: (batch_size, num_tags, num_tags)
            next_score = broadcast_score + self.transitions + broadcast_emissions

            # Sum over all possible current tags, but we're in score space, so a sum
            # becomes a log-sum-exp: for each sample, entry i stores the sum of scores of
            # all possible tag sequences so far, that end in tag i
            # shape: (batch_size, num_tags)
            next_score = torch.logsumexp(next_score, dim=1)

            # Set score to the next score if this timestep is valid (mask == 1)
            # shape: (batch_size, num_tags)
            score = torch.where(mask[i].unsqueeze(1), next_score, score)

        # End transition score
        # shape: (batch_size, num_tags)
        score += self.end_transitions

        # Sum (log-sum-exp) over all possible tags
        # shape: (batch_size,)
        return torch.logsumexp(score, dim=1)

    def _viterbi_decode(self, emissions: torch.FloatTensor,
                        mask: torch.ByteTensor) -> List[List[int]]:
        # emissions: (seq_length, batch_size, num_tags)
        # mask: (seq_length, batch_size)
        assert emissions.dim() == 3 and mask.dim() == 2
        assert emissions.shape[:2] == mask.shape
        assert emissions.size(2) == self.num_tags
        assert mask[0].all()

        seq_length, batch_size = mask.shape

        # Start transition and first emission
        # shape: (batch_size, num_tags)
        score = self.start_transitions + emissions[0]
        history: List[torch.Tensor] = []

        # score is a tensor of size (batch_size, num_tags) where for every batch,
        # value at column j stores the score of the best tag sequence so far that ends
        # with tag j
        # history saves where the best tags candidate transitioned from; this is used
        # when we trace back the best tag sequence

        # Viterbi algorithm recursive case: we compute the score of the best tag sequence
        # for every possible next tag
        for i in range(1, seq_length):
            # Broadcast viterbi score for every possible next tag
            # shape: (batch_size, num_tags, 1)
            broadcast_score = score.unsqueeze(2)

            # Broadcast emission score for every possible current tag
            # shape: (batch_size, 1, num_tags)
            broadcast_emission = emissions[i].unsqueeze(1)

            # Compute the score tensor of size (batch_size, num_tags, num_tags) where
            # for each sample, entry at row i and column j stores the score of the best
            # tag sequence so far that ends with transitioning from tag i to tag j and emitting
            # shape: (batch_size, num_tags, num_tags)
            next_score = broadcast_score + self.transitions + broadcast_emission

            # Find the maximum score over all possible current tag
            # shape: (batch_size, num_tags)
            next_score, indices = next_score.max(dim=1)

            # Set score to the next score if this timestep is valid (mask == 1)
            # and save the index that produces the next score
            # shape: (batch_size, num_tags)
            score = torch.where(mask[i].unsqueeze(1), next_score, score)
            history.append(indices)

        # End transition score
        # shape: (batch_size, num_tags)
        score += self.end_transitions

        # Now, compute the best path for each sample

        # shape: (batch_size,)
        seq_ends = mask.long().sum(dim=0) - 1
        best_tags_list: List[List[int]] = []

        for idx in range(batch_size):
            # Find the tag which maximizes the score at the last timestep; this is our best tag
            # for the last timestep
            _, best_last_tag = score[idx].max(dim=0)
            best_tags: List[int] = []
            best_tags.append(best_last_tag.item())

            # We trace back where the best last tag comes from, append that to our best tag
            # sequence, and trace it back again, and so on
            # NOTE: reversed() cannot be used here because it is not supported by TorchScript,
            # see https://github.com/pytorch/pytorch/issues/31772.
            for hist in history[:seq_ends[idx]][::-1]:
                best_last_tag = hist[idx][best_tags[-1]]
                best_tags.append(best_last_tag.item())

            # Reverse the order because we start from the last timestep
            best_tags.reverse()
            best_tags_list.append(best_tags)

        return best_tags_list
space/space/space/space/space/space/src/train.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import wandb
2
+ from tqdm import tqdm
3
+ from src.evaluate import evaluate
4
+ import torch
5
+
6
def train_model(model, optimizer, configs, loaders):
    """Train `model` with Weights & Biases logging.

    Checkpoints the model whenever validation F1 improves, reloads the best
    checkpoint after training, and evaluates it on the test set.

    Args:
        model: CRF tagger called as model(x, y, mask) -> loss, decoded via evaluate().
        optimizer: torch optimizer over model parameters.
        configs: dict with at least "project", "name", "epochs".
        loaders: dict with 'train'/'val'/'test' loaders yielding
            (embeddings, labels, lengths) batches; labels padded with -1.

    FIXES: `ckpt_path` was unbound (NameError) when no epoch ever improved
    validation F1; "imporved" typo in the checkpoint message.
    """
    # Login wandb
    wandb.login()

    # Init W&B run for tracking the training phase
    wandb.init(
        project=configs["project"],
        name=configs["name"],
        config=configs
    )

    # Log parameter gradients
    wandb.watch(model, log="all")

    # Track the best checkpoint by validation F1
    best_val_f1 = 0.0
    best_ckpt_path = None  # FIX: was unbound if no epoch ever improved

    # Training loop
    for epoch in range(1, configs["epochs"] + 1):
        model.train()
        total_loss = 0.0

        # Progress bar over training batches
        train_bar = tqdm(loaders['train'], desc=f"Train Epoch {epoch}/{configs['epochs']}")

        for batch_idx, (x, y, _) in enumerate(train_bar, start=1):
            mask = (y != -1)  # padding positions carry label -1
            loss = model(x, y, mask)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

            train_bar.set_postfix(batch_loss=loss.item(), avg_loss=total_loss / batch_idx)

        # Evaluate model after each epoch
        avg_train_loss = total_loss / len(loaders['train'])
        train_precision, train_recall, train_f1, train_acc, _, _ = evaluate(model, loaders['train'], count_loss=False)
        val_precision, val_recall, val_f1, val_acc, avg_val_loss, _ = evaluate(model, loaders['val'], count_loss=True)

        # Log metrics for train and val sets
        print(f"Epoch {epoch}: train_loss={avg_train_loss:.4f}, train_f1={train_f1:.4f}, val_loss={avg_val_loss:.4f}, val_f1={val_f1:.4f}")
        wandb.log({
            "epoch": epoch,

            # Group: Training metrics
            "Train/Loss": avg_train_loss,
            "Train/Precision": train_precision,
            "Train/Recall": train_recall,
            "Train/F1": train_f1,
            "Train/Accuracy": train_acc,

            # Group: Validation metrics
            "Val/Loss": avg_val_loss,
            "Val/Precision": val_precision,
            "Val/Recall": val_recall,
            "Val/F1": val_f1,
            "Val/Accuracy": val_acc
        })

        # Save best model based on val_f1
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            best_ckpt_path = f"./models/best_epoch_{epoch}.pt"
            torch.save(model.state_dict(), best_ckpt_path)
            wandb.save(best_ckpt_path)
            print(f"Saved improved model to {best_ckpt_path}")  # FIX: typo "imporved"

        print()

    # Load best model before test (guard against no improvement at all)
    if best_ckpt_path is not None:
        print(f"Loading best model from {best_ckpt_path} for final evaluation...")
        model.load_state_dict(torch.load(best_ckpt_path))
        print("Done \n")
    else:
        print("No checkpoint improved on validation F1; evaluating final weights.")

    # Log metrics for the test set
    print("Evaluation on test set ...")
    test_precision, test_recall, test_f1, test_acc, avg_test_loss, report = evaluate(model, loaders['test'], count_loss=True, report=True)
    wandb.log({
        "Test/Loss": avg_test_loss,
        "Test/Precision": test_precision,
        "Test/Recall": test_recall,
        "Test/F1": test_f1,
        "Test/Accuracy": test_acc,
    })
    print(f"Test_loss={avg_test_loss:.4f}, Test_f1={test_f1:.4f}")
    print(report)

    # Finish W&B run
    wandb.finish()
space/space/space/src/app.py CHANGED
@@ -1,7 +1,10 @@
1
  import streamlit as st
2
  import pandas as pd
 
 
3
  from src.predict import predict_demo
4
  from src.front import render_html
 
5
 
6
  st.set_page_config(page_title="Vietnamese NER", layout="wide")
7
 
@@ -24,20 +27,99 @@ with tab1:
24
 
25
  # --- Tab 2: KẾT QUẢ HUẤN LUYỆN ---
26
  with tab2:
27
- st.header("📈 Kết quả huấn luyện")
28
-
29
- loss = [0.9, 0.7, 0.5, 0.35, 0.28]
30
- epoch = [1, 2, 3, 4, 5]
31
- df_loss = pd.DataFrame({"Epoch": epoch, "Loss": loss})
32
- st.line_chart(df_loss.set_index("Epoch"))
33
-
34
- st.subheader("Đánh giá mô hình")
35
- df_eval = pd.DataFrame({
36
- "Phiên bản": ["v1", "v2", "v3"],
37
- "F1-score": [0.78, 0.83, 0.86],
38
- "Accuracy": [0.81, 0.85, 0.88]
39
- })
40
- st.dataframe(df_eval)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
  # --- Tab 3: DEMO MÔ HÌNH ---
43
  with tab3:
 
1
  import streamlit as st
2
  import pandas as pd
3
+ import plotly.graph_objects as go
4
+
5
  from src.predict import predict_demo
6
  from src.front import render_html
7
+ from results.output import training_log, report_dict, report_dict_2, model_compare, data_compare
8
 
9
  st.set_page_config(page_title="Vietnamese NER", layout="wide")
10
 
 
27
 
28
  # --- Tab 2: KẾT QUẢ HUẤN LUYỆN ---
29
  with tab2:
30
+ st.set_page_config(
31
+ page_title="My NER App",
32
+ layout="wide",
33
+ initial_sidebar_state="expanded"
34
+ )
35
+
36
+ # ==== TẠO FIGURES ====
37
+
38
+ # 1️⃣ Loss
39
+ fig_loss = go.Figure()
40
+ fig_loss.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["train_loss"],
41
+ mode='lines+markers', name='Train Loss'))
42
+ fig_loss.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["val_loss"],
43
+ mode='lines+markers', name='Val Loss'))
44
+ fig_loss.update_layout(title="Loss Curve", xaxis_title="Epoch", yaxis_title="Loss")
45
+
46
+ # 2️⃣ F1-Score
47
+ fig_f1 = go.Figure()
48
+ fig_f1.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["train_f1"],
49
+ mode='lines+markers', name='Train F1'))
50
+ fig_f1.add_trace(go.Scatter(x=training_log["epoch"], y=training_log["val_f1"],
51
+ mode='lines+markers', name='Val F1'))
52
+ fig_f1.update_layout(title="F1-Score Curve", xaxis_title="Epoch", yaxis_title="F1-Score")
53
+
54
+ # 3️⃣ Classification Report Table & Bar
55
+ labels = [k for k in report_dict.keys() if k not in ["accuracy", "macro avg", "weighted avg"]]
56
+ report_data = [[lbl,
57
+ report_dict[lbl]["precision"],
58
+ report_dict[lbl]["recall"],
59
+ report_dict[lbl]["f1-score"]]
60
+ for lbl in labels]
61
+ df_report = pd.DataFrame(report_data,
62
+ columns=["Label", "Precision", "Recall", "F1-Score"])
63
+
64
+ fig_report = go.Figure()
65
+ for col in ["Precision", "Recall", "F1-Score"]:
66
+ fig_report.add_trace(go.Bar(x=df_report["Label"], y=df_report[col], name=col))
67
+ fig_report.update_layout(barmode='group',
68
+ title="Class Report Metrics of PhoBert + CRF",
69
+ xaxis_title="Label", yaxis_title="Score",
70
+ yaxis=dict(range=[0,1.0]))
71
+
72
+ labels2 = [k for k in report_dict_2.keys() if k not in ["accuracy", "macro avg", "weighted avg"]]
73
+ report_data2 = [[lbl,
74
+ report_dict_2[lbl]["precision"],
75
+ report_dict_2[lbl]["recall"],
76
+ report_dict_2[lbl]["f1-score"]]
77
+ for lbl in labels2]
78
+ df_report2 = pd.DataFrame(report_data2,
79
+ columns=["Label", "Precision", "Recall", "F1-Score"])
80
+
81
+ fig_report2 = go.Figure()
82
+ for col in ["Precision", "Recall", "F1-Score"]:
83
+ fig_report2.add_trace(go.Bar(x=df_report2["Label"], y=df_report2[col], name=col))
84
+ fig_report2.update_layout(barmode='group',
85
+ title="Class Report Metrics of PhoBert + Softmax",
86
+ xaxis_title="Label", yaxis_title="Score",
87
+ yaxis=dict(range=[0,1.0]))
88
+
89
+ # 4️⃣ Model & Data Comparison Tables
90
+ df_model = pd.DataFrame(
91
+ [[m, v["F1"], v["Accuracy"]] for m, v in model_compare["Data"].items()],
92
+ columns=["Model", "F1-Score", "Accuracy"]
93
+ )
94
+ df_data = pd.DataFrame(
95
+ [[s, f1] for s, f1 in data_compare["Data"].items()],
96
+ columns=["Preprocessing", "F1-Score"]
97
+ )
98
+
99
+ # ==== LAYOUT RAO GỌN VỚI COLUMNS ====
100
+
101
+ # Row 1: Loss | F1
102
+ col1, col2 = st.columns(2)
103
+ with col1:
104
+ st.plotly_chart(fig_loss, use_container_width=True)
105
+ with col2:
106
+ st.plotly_chart(fig_f1, use_container_width=True)
107
+
108
+ # Row 2: Class Report Table | Bar Chart
109
+ col3, col4 = st.columns(2)
110
+ with col3:
111
+ st.plotly_chart(fig_report2, use_container_width=True)
112
+ with col4:
113
+ st.plotly_chart(fig_report, use_container_width=True)
114
+
115
+ # Row 3: Model Compare | Data Compare
116
+ col5, col6 = st.columns(2)
117
+ with col5:
118
+ st.markdown("**Model Comparison**")
119
+ st.dataframe(df_model, use_container_width=True)
120
+ with col6:
121
+ st.markdown("**Data Preprocessing Comparison**")
122
+ st.dataframe(df_data, use_container_width=True)
123
 
124
  # --- Tab 3: DEMO MÔ HÌNH ---
125
  with tab3:
space/space/space/src/predict.py CHANGED
@@ -36,7 +36,7 @@ def predict_demo(text):
36
  NUM_TAGS = 7
37
 
38
  model = CRF_Tagger(input_dim=x.size(2), num_tags=NUM_TAGS)
39
- model.load_state_dict(torch.load(".\models\best_epoch_16.pt"))
40
  model.eval()
41
  with torch.no_grad():
42
  preds = model.decode(x)
 
36
  NUM_TAGS = 7
37
 
38
  model = CRF_Tagger(input_dim=x.size(2), num_tags=NUM_TAGS)
39
+ model.load_state_dict(torch.load("../models/best_epoch_16.pt"))
40
  model.eval()
41
  with torch.no_grad():
42
  preds = model.decode(x)
space/space/space/st.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Streamlit dashboard for NER model results.

Renders training curves (loss, F1), per-class precision/recall/F1 bar
charts for two model variants (PhoBERT + CRF and PhoBERT + Softmax),
and model/preprocessing comparison tables, laid out as a three-row,
two-column grid.
"""
import streamlit as st
import pandas as pd
import plotly.graph_objects as go
from results.output import training_log, report_dict, report_dict_2, model_compare, data_compare

st.set_page_config(
    page_title="My NER App",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Aggregate rows in an sklearn-style classification-report dict that are
# not per-class labels and must be excluded from the bar charts.
_SUMMARY_KEYS = ("accuracy", "macro avg", "weighted avg")


def _curve_figure(title, y_title, series):
    """Build a lines+markers figure of per-epoch metrics.

    series: iterable of (trace_name, y_values) pairs, each plotted
    against training_log["epoch"]. Returns the plotly Figure.
    """
    fig = go.Figure()
    for trace_name, y_values in series:
        fig.add_trace(go.Scatter(x=training_log["epoch"], y=y_values,
                                 mode='lines+markers', name=trace_name))
    fig.update_layout(title=title, xaxis_title="Epoch", yaxis_title=y_title)
    return fig


def _report_figure(report, title):
    """Build a grouped precision/recall/F1 bar chart from a report dict.

    report: sklearn-style classification-report dict mapping each label
    to a metrics dict with "precision", "recall", and "f1-score" keys;
    aggregate rows (accuracy / macro avg / weighted avg) are skipped.
    Returns the plotly Figure with the y-axis fixed to [0, 1].
    """
    rows = [[lbl,
             report[lbl]["precision"],
             report[lbl]["recall"],
             report[lbl]["f1-score"]]
            for lbl in report if lbl not in _SUMMARY_KEYS]
    df = pd.DataFrame(rows, columns=["Label", "Precision", "Recall", "F1-Score"])

    fig = go.Figure()
    for col in ["Precision", "Recall", "F1-Score"]:
        fig.add_trace(go.Bar(x=df["Label"], y=df[col], name=col))
    fig.update_layout(barmode='group',
                      title=title,
                      xaxis_title="Label", yaxis_title="Score",
                      yaxis=dict(range=[0, 1.0]))
    return fig


# ==== BUILD FIGURES ====

# 1) Loss curve and 2) F1 curve over training epochs.
fig_loss = _curve_figure("Loss Curve", "Loss",
                         [("Train Loss", training_log["train_loss"]),
                          ("Val Loss", training_log["val_loss"])])
fig_f1 = _curve_figure("F1-Score Curve", "F1-Score",
                       [("Train F1", training_log["train_f1"]),
                        ("Val F1", training_log["val_f1"])])

# 3) Per-class metric bar charts for the two model variants.
fig_report = _report_figure(report_dict, "Class Report Metrics of PhoBert + CRF")
fig_report2 = _report_figure(report_dict_2, "Class Report Metrics of PhoBert + Softmax")

# 4) Model & data-preprocessing comparison tables.
df_model = pd.DataFrame(
    [[m, v["F1"], v["Accuracy"]] for m, v in model_compare["Data"].items()],
    columns=["Model", "F1-Score", "Accuracy"]
)
df_data = pd.DataFrame(
    [[s, f1] for s, f1 in data_compare["Data"].items()],
    columns=["Preprocessing", "F1-Score"]
)

# ==== COMPACT LAYOUT WITH COLUMNS ====

# Row 1: loss curve | F1 curve.
col1, col2 = st.columns(2)
with col1:
    st.plotly_chart(fig_loss, use_container_width=True)
with col2:
    st.plotly_chart(fig_f1, use_container_width=True)

# Row 2: Softmax report chart | CRF report chart.
col3, col4 = st.columns(2)
with col3:
    st.plotly_chart(fig_report2, use_container_width=True)
with col4:
    st.plotly_chart(fig_report, use_container_width=True)

# Row 3: model comparison table | preprocessing comparison table.
col5, col6 = st.columns(2)
with col5:
    st.markdown("**Model Comparison**")
    st.dataframe(df_model, use_container_width=True)
with col6:
    st.markdown("**Data Preprocessing Comparison**")
    st.dataframe(df_data, use_container_width=True)
src/predict.py CHANGED
@@ -1,6 +1,7 @@
1
  import torch
2
  from src.model import CRF_Tagger
3
  from src.preprocessing import process_demo_sentence
 
4
 
5
  def predict(model, loader, count_loss=True):
6
 
@@ -29,6 +30,9 @@ def predict(model, loader, count_loss=True):
29
 
30
  def predict_demo(text):
31
 
 
 
 
32
 
33
  id_tag = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}
34
 
@@ -36,7 +40,7 @@ def predict_demo(text):
36
  NUM_TAGS = 7
37
 
38
  model = CRF_Tagger(input_dim=x.size(2), num_tags=NUM_TAGS)
39
- model.load_state_dict(torch.load("../models/best_epoch_16.pt"))
40
  model.eval()
41
  with torch.no_grad():
42
  preds = model.decode(x)
 
1
  import torch
2
  from src.model import CRF_Tagger
3
  from src.preprocessing import process_demo_sentence
4
+ import os
5
 
6
  def predict(model, loader, count_loss=True):
7
 
 
30
 
31
  def predict_demo(text):
32
 
33
+ BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
34
+ model_path = os.path.join(BASE_DIR, "models", "best_epoch_16.pt")
35
+
36
 
37
  id_tag = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}
38
 
 
40
  NUM_TAGS = 7
41
 
42
  model = CRF_Tagger(input_dim=x.size(2), num_tags=NUM_TAGS)
43
+ model.load_state_dict(torch.load(model_path))
44
  model.eval()
45
  with torch.no_grad():
46
  preds = model.decode(x)