Upload logs
Browse files
logs.log
CHANGED
@@ -1,53 +1,53 @@
|
|
1 |
-
2024-04-21
|
2 |
-
2024-04-21
|
3 |
-
2024-04-21
|
4 |
-
2024-04-21
|
5 |
-
2024-04-21
|
6 |
-
2024-04-21
|
7 |
-
2024-04-21
|
8 |
-
2024-04-21
|
9 |
-
2024-04-21
|
10 |
-
2024-04-21
|
11 |
-
2024-04-21
|
12 |
-
2024-04-21
|
13 |
-
2024-04-21
|
14 |
-
2024-04-21
|
15 |
-
2024-04-21
|
16 |
-
2024-04-21
|
17 |
-
2024-04-21
|
18 |
-
2024-04-21
|
19 |
-
2024-04-21
|
20 |
-
2024-04-21
|
21 |
-
2024-04-21
|
22 |
-
2024-04-21
|
23 |
-
2024-04-21
|
24 |
-
2024-04-21
|
25 |
-
2024-04-21
|
26 |
-
2024-04-21
|
27 |
-
2024-04-21
|
28 |
-
2024-04-21
|
29 |
-
2024-04-21
|
30 |
-
2024-04-21 14:
|
31 |
-
2024-04-21 14:
|
32 |
-
2024-04-21 14:
|
33 |
-
2024-04-21 14:
|
34 |
-
2024-04-21 14:
|
35 |
-
2024-04-21 14:
|
36 |
-
2024-04-21 14:
|
37 |
-
2024-04-21 14:
|
38 |
-
2024-04-21 14:
|
39 |
-
2024-04-21 14:
|
40 |
-
2024-04-21 14:
|
41 |
-
2024-04-21 14:
|
42 |
-
2024-04-21 14:
|
43 |
-
2024-04-21 14:
|
44 |
-
2024-04-21 14:
|
45 |
-
2024-04-21 14:
|
46 |
-
2024-04-21 14:
|
47 |
-
2024-04-21 14:
|
48 |
-
2024-04-21 14:
|
49 |
-
2024-04-21 14:
|
50 |
-
2024-04-21 15:
|
51 |
Traceback (most recent call last):
|
52 |
File "/app/finetuning/train.py", line 1179, in <module>
|
53 |
run(cfg=cfg)
|
@@ -69,4 +69,4 @@ Traceback (most recent call last):
|
|
69 |
response = verify_rest_response(response, endpoint)
|
70 |
File "/usr/local/lib/python3.10/dist-packages/mlflow/utils/rest_utils.py", line 152, in verify_rest_response
|
71 |
raise RestException(json.loads(response.text))
|
72 |
-
mlflow.exceptions.RestException: 409: INSERT ERROR, ENTITY:mlflow_metric, Duplicate _id:
|
|
|
1 |
+
2024-04-21 15:07:21,870 - INFO: Training in distributed mode with multiple processes, 1 GPU per process. Process 1, total: 4 local rank: 1.
|
2 |
+
2024-04-21 15:07:21,870 - INFO: Training in distributed mode with multiple processes, 1 GPU per process. Process 3, total: 4 local rank: 3.
|
3 |
+
2024-04-21 15:07:21,870 - INFO: Training in distributed mode with multiple processes, 1 GPU per process. Process 0, total: 4 local rank: 0.
|
4 |
+
2024-04-21 15:07:21,870 - INFO: Training in distributed mode with multiple processes, 1 GPU per process. Process 2, total: 4 local rank: 2.
|
5 |
+
2024-04-21 15:07:22,585 - INFO: Problem Type: text_causal_language_modeling
|
6 |
+
2024-04-21 15:07:22,586 - INFO: Global random seed: 879809
|
7 |
+
2024-04-21 15:07:22,586 - INFO: Preparing the data...
|
8 |
+
2024-04-21 15:07:22,586 - INFO: Setting up automatic validation split...
|
9 |
+
2024-04-21 15:07:22,613 - INFO: Preparing train and validation data
|
10 |
+
2024-04-21 15:07:22,613 - INFO: Loading train dataset...
|
11 |
+
2024-04-21 15:07:23,453 - INFO: Stop token ids: []
|
12 |
+
2024-04-21 15:07:23,459 - INFO: Loading validation dataset...
|
13 |
+
2024-04-21 15:07:23,933 - INFO: Stop token ids: []
|
14 |
+
2024-04-21 15:07:23,937 - INFO: Number of observations in train dataset: 495
|
15 |
+
2024-04-21 15:07:23,937 - INFO: Number of observations in validation dataset: 5
|
16 |
+
2024-04-21 15:07:24,567 - WARNING: PAD token id not matching between config and tokenizer. Overwriting with tokenizer id.
|
17 |
+
2024-04-21 15:07:24,568 - INFO: Setting pretraining_tp of model config to 1.
|
18 |
+
2024-04-21 15:07:24,570 - INFO: Using bfloat16 for backbone
|
19 |
+
2024-04-21 15:07:24,576 - WARNING: PAD token id not matching between config and tokenizer. Overwriting with tokenizer id.
|
20 |
+
2024-04-21 15:07:24,576 - INFO: Setting pretraining_tp of model config to 1.
|
21 |
+
2024-04-21 15:07:24,579 - INFO: Using bfloat16 for backbone
|
22 |
+
2024-04-21 15:07:24,614 - WARNING: PAD token id not matching between config and tokenizer. Overwriting with tokenizer id.
|
23 |
+
2024-04-21 15:07:24,614 - INFO: Setting pretraining_tp of model config to 1.
|
24 |
+
2024-04-21 15:07:24,617 - INFO: Using bfloat16 for backbone
|
25 |
+
2024-04-21 15:07:24,657 - INFO: Stop token ids: []
|
26 |
+
2024-04-21 15:07:24,659 - WARNING: PAD token id not matching between config and tokenizer. Overwriting with tokenizer id.
|
27 |
+
2024-04-21 15:07:24,660 - INFO: Setting pretraining_tp of model config to 1.
|
28 |
+
2024-04-21 15:07:24,662 - INFO: Using bfloat16 for backbone
|
29 |
+
2024-04-21 15:07:24,662 - INFO: Loading meta-llama/Llama-2-13b-hf. This may take a while.
|
30 |
+
2024-04-21 15:14:07,270 - INFO: Loaded meta-llama/Llama-2-13b-hf.
|
31 |
+
2024-04-21 15:14:07,274 - INFO: Lora module names: ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj']
|
32 |
+
2024-04-21 15:14:09,060 - INFO: Enough space available for saving model weights.Required space: 25632.04MB, Available space: 973919.94MB.
|
33 |
+
2024-04-21 15:14:09,062 - INFO: Enough space available for saving model weights.Required space: 25632.04MB, Available space: 973919.94MB.
|
34 |
+
2024-04-21 15:14:09,064 - INFO: Enough space available for saving model weights.Required space: 25632.04MB, Available space: 973919.94MB.
|
35 |
+
2024-04-21 15:14:09,064 - INFO: Enough space available for saving model weights.Required space: 25632.04MB, Available space: 973919.94MB.
|
36 |
+
2024-04-21 15:14:09,070 - INFO: Optimizer AdamW has been provided with parameters {'eps': 1e-08, 'weight_decay': 0.0, 'betas': (0.8999999762, 0.9990000129), 'lr': 0.0001}
|
37 |
+
2024-04-21 15:14:09,072 - INFO: Optimizer AdamW has been provided with parameters {'weight_decay': 0.0, 'eps': 1e-08, 'betas': (0.8999999762, 0.9990000129), 'lr': 0.0001}
|
38 |
+
2024-04-21 15:14:09,074 - INFO: Optimizer AdamW has been provided with parameters {'eps': 1e-08, 'weight_decay': 0.0, 'betas': (0.8999999762, 0.9990000129), 'lr': 0.0001}
|
39 |
+
2024-04-21 15:14:09,074 - INFO: Optimizer AdamW has been provided with parameters {'weight_decay': 0.0, 'eps': 1e-08, 'betas': (0.8999999762, 0.9990000129), 'lr': 0.0001}
|
40 |
+
2024-04-21 15:14:11,746 - INFO: started process: 2, can_track: False, tracking_mode: TrackingMode.DURING_EPOCH
|
41 |
+
2024-04-21 15:14:11,752 - INFO: started process: 3, can_track: False, tracking_mode: TrackingMode.DURING_EPOCH
|
42 |
+
2024-04-21 15:14:11,753 - INFO: started process: 1, can_track: False, tracking_mode: TrackingMode.DURING_EPOCH
|
43 |
+
2024-04-21 15:14:11,890 - INFO: Evaluation step: 61
|
44 |
+
2024-04-21 15:14:11,902 - INFO: Evaluation step: 61
|
45 |
+
2024-04-21 15:14:11,902 - INFO: Evaluation step: 61
|
46 |
+
2024-04-21 15:14:12,694 - INFO: started process: 0, can_track: True, tracking_mode: TrackingMode.DURING_EPOCH
|
47 |
+
2024-04-21 15:14:12,694 - INFO: Training Epoch: 1 / 1
|
48 |
+
2024-04-21 15:14:12,695 - INFO: train loss: 0%| | 0/61 [00:00<?, ?it/s]
|
49 |
+
2024-04-21 15:14:12,765 - INFO: Evaluation step: 61
|
50 |
+
2024-04-21 15:16:27,176 - ERROR: Exception occurred during the run:
|
51 |
Traceback (most recent call last):
|
52 |
File "/app/finetuning/train.py", line 1179, in <module>
|
53 |
run(cfg=cfg)
|
|
|
69 |
response = verify_rest_response(response, endpoint)
|
70 |
File "/usr/local/lib/python3.10/dist-packages/mlflow/utils/rest_utils.py", line 152, in verify_rest_response
|
71 |
raise RestException(json.loads(response.text))
|
72 |
+
mlflow.exceptions.RestException: 409: INSERT ERROR, ENTITY:mlflow_metric, Duplicate _id: ceb1b9c6-ddd4-4ac3-9db0-564af323463b
|