Dean committed
Commit 7e3c514 · 1 Parent(s): c9eec48

Starting to apply fixes to bring the project to the latest version

.gitignore CHANGED
@@ -93,6 +93,5 @@ coverage.xml
 .vscode
 /data
 
-wandb/
 summarization-dagshub/
 /models
data.dvc DELETED
@@ -1,14 +0,0 @@
-deps:
-- path: params.yml
-  md5: d0f3e81bc9191e752a69761045a449d9
-  size: 196
-- path: src/data/make_dataset.py
-  md5: 9de71de0f8df5d0a7beb235ef7c7777d
-  size: 772
-cmd: python src/data/make_dataset.py
-outs:
-- md5: 2ab20ac1b58df875a590b07d0e04eb5b.dir
-  nfiles: 3
-  path: data/raw
-  size: 1358833013
-md5: ff502232006c7fbef1015b5aa5cc4bbb
dvc.lock CHANGED
@@ -10,19 +10,22 @@ stages:
       md5: 0900e2bb330df94cb045faddd0b945d1
       size: 1138285
     - path: params.yml
-      md5: d0f3e81bc9191e752a69761045a449d9
-      size: 196
+      md5: 8ac76f9483ae2d78cf89a2e2be4e8446
+      size: 189
     - path: src/models/train_model.py
-      md5: fca8acf70f09cecd679ca1ddb2eef6a9
-      size: 1198
+      md5: d57b5ff84bc29a8ea75e191027d70148
+      size: 988
     outs:
     - path: models
-      md5: 688745a9fb1cc7c8580887bae3873a39.dir
-      size: 486952666
-      nfiles: 10
-    - path: reports/training_metrics.txt
-      md5: 048a956b0eb431535d287bbc3322cf76
-      size: 158
+      md5: b8dd7baa6b7b85a7b4c2fcfbe3d831bf.dir
+      size: 243476333
+      nfiles: 5
+    - path: reports/training_metrics.csv
+      md5: f0c89a07561ca8aea8ab3f4764b648e7
+      size: 26
+    - path: reports/training_params.yml
+      md5: 8a80554c91d9fca8acb82f023de02f11
+      size: 3
   eval:
     cmd: python src/models/evaluate_model.py
     deps:
@@ -51,8 +54,8 @@ stages:
       size: 0
      nfiles: 0
     - path: params.yml
-      md5: d0f3e81bc9191e752a69761045a449d9
-      size: 196
+      md5: 8ac76f9483ae2d78cf89a2e2be4e8446
+      size: 189
     - path: src/data/process_data.py
       md5: ba3ba7b7c8a905b736b6b0a28d2334c4
       size: 623
@@ -66,3 +69,17 @@ stages:
     - path: data/processed/validation.csv
       md5: 0900e2bb330df94cb045faddd0b945d1
       size: 1138285
+  download_data:
+    cmd: python src/data/make_dataset.py
+    deps:
+    - path: params.yml
+      md5: 8ac76f9483ae2d78cf89a2e2be4e8446
+      size: 189
+    - path: src/data/make_dataset.py
+      md5: 9de71de0f8df5d0a7beb235ef7c7777d
+      size: 772
+    outs:
+    - path: data/raw
+      md5: 2ab20ac1b58df875a590b07d0e04eb5b.dir
+      size: 1358833013
+      nfiles: 3
dvc.yaml CHANGED
@@ -1,4 +1,11 @@
 stages:
+  download_data:
+    cmd: python src/data/make_dataset.py
+    deps:
+    - params.yml
+    - src/data/make_dataset.py
+    outs:
+    - data/raw
   process_data:
     cmd: python src/data/process_data.py
     deps:
@@ -25,8 +32,10 @@ stages:
     outs:
     - models:
         persist: true
+    - reports/training_params.yml:
+        cache: false
     metrics:
-    - reports/training_metrics.txt:
+    - reports/training_metrics.csv:
         cache: false
   eval:
     cmd: python src/models/evaluate_model.py
@@ -36,6 +45,6 @@ stages:
     - models
     - src/models/evaluate_model.py
     metrics:
-    - reports/metrics.txt:
+    - reports/metrics.csv:
         cache: false
 
requirements.txt CHANGED
@@ -9,7 +9,6 @@ rouge_score
 pyyaml
 dvc
 mlflow
-wandb
 
 # external requirements
 click
src/models/evaluate_model.py CHANGED
@@ -1,4 +1,4 @@
-import dagshub
+from dagshub import dagshub_logger
 import yaml
 
 from model import Summarization
@@ -18,8 +18,8 @@ def evaluate_model():
     model.load_model(model_type=params['model_type'], model_dir=params['model_dir'])
     results = model.evaluate(test_df=test_df, metrics=params['metric'])
 
-    with open('reports/metrics.txt', 'w') as fp:
-        json.dump(results, fp)
+    with dagshub_logger(should_log_hparams=False) as logger:
+        logger.log_metrics(results)
 
 
 if __name__ == '__main__':
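
The hand-rolled json.dump of the results is replaced by DagsHub's file-based logger. For orientation, here is a minimal, self-contained sketch of how that dagshub_logger context manager is typically used; the metric values, the explicit metrics_path, and the makedirs call are illustrative assumptions, not taken from this commit.

    # Illustrative only: stand-in metric values and an explicit output path.
    import os
    from dagshub import dagshub_logger

    results = {'rouge1': 0.41, 'rouge2': 0.19}   # hypothetical metrics

    os.makedirs('reports', exist_ok=True)        # ensure the target directory exists
    with dagshub_logger(metrics_path='reports/metrics.csv',
                        should_log_hparams=False) as logger:
        logger.log_metrics(results)              # writes the metrics as CSV rows DagsHub can render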
src/models/model.py CHANGED
@@ -7,7 +7,8 @@ from transformers import (
 )
 from torch.utils.data import Dataset, DataLoader
 import pytorch_lightning as pl
-from pytorch_lightning.loggers import MLFlowLogger, WandbLogger
+from pytorch_lightning.loggers import MLFlowLogger
+from dagshub.pytorch_lightning import DAGsHubLogger
 from pytorch_lightning import Trainer
 from pytorch_lightning.callbacks.early_stopping import EarlyStopping
 from pytorch_lightning import LightningDataModule
@@ -15,8 +16,6 @@ from pytorch_lightning import LightningModule
 from datasets import load_metric
 from tqdm.auto import tqdm
 
-# from dagshub.pytorch_lightning import DAGsHubLogger
-
 
 torch.cuda.empty_cache()
 pl.seed_everything(42)
@@ -330,9 +329,8 @@ class Summarization:
         MLlogger = MLFlowLogger(experiment_name="Summarization",
                                 tracking_uri="https://dagshub.com/gagan3012/summarization.mlflow")
 
-        WandLogger = WandbLogger(project="summarization-dagshub")
-
-        # logger = DAGsHubLogger(metrics_path='reports/training_metrics.txt')
+        logger = DAGsHubLogger(metrics_path='reports/training_metrics.csv',
+                               hparams_path='reports/training_params.yml')
 
         early_stop_callback = (
             [
@@ -351,7 +349,7 @@ class Summarization:
         gpus = -1 if use_gpu and torch.cuda.is_available() else 0
 
         trainer = Trainer(
-            logger=[WandLogger, MLlogger],
+            logger=[MLlogger, logger],
             callbacks=early_stop_callback,
             max_epochs=max_epochs,
             gpus=gpus,
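
For reference, a condensed sketch of the logger wiring this commit moves to, with W&B dropped in favour of MLflow plus DagsHub's Lightning logger. The experiment name, tracking URI and report paths follow the diff above; max_epochs and gpus are placeholders rather than values from the repo.

    # Sketch: one Lightning Trainer fanning metrics out to MLflow and DagsHub.
    import pytorch_lightning as pl
    from pytorch_lightning.loggers import MLFlowLogger
    from dagshub.pytorch_lightning import DAGsHubLogger

    ml_logger = MLFlowLogger(
        experiment_name="Summarization",
        tracking_uri="https://dagshub.com/gagan3012/summarization.mlflow",
    )
    dags_logger = DAGsHubLogger(
        metrics_path="reports/training_metrics.csv",   # tracked as a metric in dvc.yaml
        hparams_path="reports/training_params.yml",    # written alongside it (cache: false)
    )

    trainer = pl.Trainer(
        logger=[ml_logger, dags_logger],  # Lightning logs to every logger in the list
        max_epochs=1,                     # placeholder
        gpus=0,                           # placeholder; the repo uses -1 when a GPU is available
    )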
src/models/train_model.py CHANGED
@@ -1,5 +1,3 @@
-import json
-
 import yaml
 
 from model import Summarization
@@ -30,12 +28,6 @@ def train_model():
 
     model.save_model(model_dir=params['model_dir'])
 
-    with open('wandb/latest-run/files/wandb-summary.json') as json_file:
-        data = json.load(json_file)
-
-    with open('reports/training_metrics.txt', 'w') as fp:
-        json.dump(data, fp)
-
 
 if __name__ == '__main__':
     train_model()