Dean committed
Commit d5a6d18 · 1 Parent(s): c6912f8

committing after a bunch of fixes and before re-running with dvc
.gitignore CHANGED
@@ -96,4 +96,5 @@ coverage.xml
 summarization-dagshub/
 /models
 default/
-artifacts/
+artifacts/
+mlruns/
Makefile CHANGED
@@ -48,7 +48,15 @@ pull:
 
 ## run the DVC pipeline - recompute any modified outputs such as processed data or trained models
 run:
-	dvc repro dvc.yaml
+	dvc repro dvc.yaml eval
+
+## run the visualization using Streamlit
+visualize:
+	dvc repro dvc.yaml visualize
+
+## push the trained model to HF model hub
+push_to_hf_hub:
+	dvc repro dvc.yaml push_to_hf_hub
 
 #################################################################################
 # PROJECT RULES                                                                 #
data_params.yml ADDED
@@ -0,0 +1,2 @@
+data: cnn_dailymail
+split: 0.01
dvc.lock CHANGED
@@ -54,30 +54,30 @@ stages:
       size: 1358833013
       nfiles: 3
     - path: params.yml
-      md5: 200ce3c4d9f2e8b9eb040ef93eb22757
-      size: 189
+      md5: 160cbfd0ed8f87c9c5cb28fbeef1072d
+      size: 266
     - path: src/data/process_data.py
-      md5: 7633b8978c523858d18b1ce9a5d3c8b7
+      md5: 5b6aaadc5a628979956d502b4fb4ebf2
       size: 516
     outs:
     - path: data/processed/test.csv
-      md5: 3cb7b63891f12d53b3ef3e81a2e93f8e
-      size: 986944
+      md5: 5f2bfb37d55a13ead3c81564dbee2fd5
+      size: 508508
     - path: data/processed/train.csv
-      md5: 51edd724b75a8e99a78b9138f8f37c60
-      size: 25012573
+      md5: 707c5ed455a15ec48965daf92fed7df6
+      size: 12653913
     - path: data/processed/validation.csv
-      md5: 0900e2bb330df94cb045faddd0b945d1
-      size: 1138285
+      md5: 1e021dc163cc87a32cef74a98e4a0d51
+      size: 558403
   download_data:
     cmd: python src/data/make_dataset.py
     deps:
     - path: params.yml
-      md5: 200ce3c4d9f2e8b9eb040ef93eb22757
-      size: 189
+      md5: 160cbfd0ed8f87c9c5cb28fbeef1072d
+      size: 266
     - path: src/data/make_dataset.py
-      md5: 9de71de0f8df5d0a7beb235ef7c7777d
-      size: 772
+      md5: 075c6233f8732eedf7915732f9a8ebfd
+      size: 771
     outs:
     - path: data/raw
       md5: 2ab20ac1b58df875a590b07d0e04eb5b.dir
dvc.yaml CHANGED
@@ -2,14 +2,14 @@ stages:
   download_data:
     cmd: python src/data/make_dataset.py
     deps:
-    - params.yml
+    - data_params.yml
     - src/data/make_dataset.py
     outs:
     - data/raw
   process_data:
     cmd: python src/data/process_data.py
     deps:
-    - params.yml
+    - data_params.yml
    - data/raw
     - src/data/process_data.py
     outs:
@@ -25,7 +25,7 @@ stages:
   train:
     cmd: python src/models/train_model.py
     deps:
-    - params.yml
+    - model_params.yml
     - data/processed/train.csv
     - data/processed/validation.csv
     - src/models/train_model.py
@@ -38,7 +38,7 @@ stages:
   eval:
     cmd: python src/models/evaluate_model.py
     deps:
-    - params.yml
+    - model_params.yml
     - data/processed/test.csv
     - models
     - src/models/evaluate_model.py
@@ -50,8 +50,13 @@ stages:
     deps:
     - models
     - src/visualization/visualize.py
-    - params.yml
     metrics:
-    - reports/visualization_metrics.csv:
+    - reports/visualization_metrics.txt:
         cache: false
+  push_to_hf_hub:
+    cmd: python src/models/hf_upload.py
+    deps:
+    - model_params.yml
+    - src/models/hf_upload.py
+    - models
 
params.yml → model_params.yml RENAMED
@@ -1,16 +1,14 @@
 name: summarsiation
-data: cnn_dailymail
-batch_size: 2
-num_workers: 2
 model_type: t5
 model_name: t5-small
-learning_rate: 1e-4
+batch_size: 2
 epochs: 5
-source_dir: src
+use_gpu: True
+learning_rate: 1e-4
+num_workers: 2
 model_dir: models
 metric: rouge
-split: 0.001
-use_gpu: True
+source_dir: src
 visualise: True
 hf_username: gagan3012
-upload_to_hf: True
+upload_to_hf: False
reports/training_metrics.csv CHANGED
@@ -1,19 +1,11 @@
 Name,Value,Timestamp,Step
-"train_loss",4.101656913757324,1627559482684,49
-"epoch",0,1627559482684,49
-"val_loss",2.6896562576293945,1627559491036,57
-"epoch",0,1627559491036,57
-"train_loss",4.598623752593994,1627559499092,99
-"epoch",1,1627559499092,99
-"val_loss",2.472928047180176,1627559505946,115
-"epoch",1,1627559505946,115
-"train_loss",1.4196646213531494,1627559515636,149
-"epoch",2,1627559515636,149
-"val_loss",2.311669111251831,1627559521015,173
-"epoch",2,1627559521015,173
-"train_loss",0.9744294881820679,1627559532066,199
-"epoch",3,1627559532066,199
-"val_loss",2.2401840686798096,1627559535896,231
-"epoch",3,1627559535896,231
-"train_loss",2.785480260848999,1627559548623,249
-"epoch",4,1627559548623,249
+"val_loss",5.029108047485352,1628177741756,14
+"epoch",0,1628177741756,14
+"val_loss",4.757647514343262,1628177893078,29
+"epoch",1,1628177893078,29
+"val_loss",4.493412494659424,1628177940684,44
+"epoch",2,1628177940684,44
+"train_loss",1.328701138496399,1628178045108,49
+"epoch",3,1628178045108,49
+"val_loss",4.228608131408691,1628178200552,59
+"epoch",3,1628178200552,59
reports/training_params.yml ADDED
@@ -0,0 +1 @@
+status: success
requirements.txt CHANGED
@@ -5,10 +5,11 @@ transformers==4.9.0
 torch==1.9.0
 dagshub==0.1.7
 pandas==1.1.5
-rouge_score
+rouge_score==0.0.4
+dvc==2.5.4
+mlflow==1.19.0
+streamlit==0.85.1
 pyyaml
-dvc
-mlflow
 
 # external requirements
 click
src/data/make_dataset.py CHANGED
@@ -17,7 +17,7 @@ def make_dataset(dataset="cnn_dailymail", split="train"):
 
 
 if __name__ == "__main__":
-    with open("params.yml") as f:
+    with open("data_params.yml") as f:
         params = yaml.safe_load(f)
     pprint.pprint(params)
     make_dataset(dataset=params["data"], split="train")
src/data/process_data.py CHANGED
@@ -5,7 +5,7 @@ import os
 
 def process_data(split="train"):
 
-    with open("params.yml") as f:
+    with open("data_params.yml") as f:
         params = yaml.safe_load(f)
 
     df = pd.read_csv("data/raw/{}.csv".format(split))
src/models/evaluate_model.py CHANGED
@@ -10,7 +10,7 @@ def evaluate_model():
     """
     Evaluate model using rouge measure
     """
-    with open("params.yml") as f:
+    with open("model_params.yml") as f:
         params = yaml.safe_load(f)
 
     test_df = pd.read_csv("data/processed/test.csv")[:25]
src/models/hf_upload.py ADDED
@@ -0,0 +1,47 @@
+import shutil
+from getpass import getpass
+from pathlib import Path
+import yaml
+
+from model import Summarization
+from huggingface_hub import HfApi, Repository
+
+
+def upload(upload_model, model_name):
+    hf_username = input("Enter your HuggingFace username:")
+    hf_password = getpass("Enter your HuggingFace password:")
+    if Path("./models").exists():
+        shutil.rmtree("./models")
+    token = HfApi().login(username=hf_username, password=hf_password)
+    del hf_password
+    model_url = HfApi().create_repo(token=token, name=model_name, exist_ok=True)
+    model_repo = Repository(
+        "./model",
+        clone_from=model_url,
+        use_auth_token=token,
+        git_email=f"{hf_username}@users.noreply.huggingface.co",
+        git_user=hf_username,
+    )
+
+    readme_txt = f"""
+    ---
+    Summarisation model {model_name}
+    """.strip()
+
+    (Path(model_repo.local_dir) / "README.md").write_text(readme_txt)
+    upload_model.save_model()
+    commit_url = model_repo.push_to_hub()
+
+    print("Check out your model at:")
+    print(commit_url)
+    print(f"https://huggingface.co/{hf_username}/{model_name}")
+
+
+if __name__ == "__main__":
+    with open("model_params.yml") as f:
+        params = yaml.safe_load(f)
+
+    model = Summarization()
+    model.load_model(model_dir="./models")
+
+    upload(upload_model=model, model_name=params["name"])
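
A detail worth flagging in the README generation above: calling .strip() on the triple-quoted f-string only trims its ends, so the inner lines ("---" and the model name) keep the leading indentation of the source code. A minimal sketch of the usual fix with textwrap.dedent, a hypothetical refactor rather than part of this commit:

    import textwrap

    model_name = "summarsiation"  # example value, as in model_params.yml

    # dedent() removes the common leading whitespace from every line;
    # strip() then drops the surrounding blank lines.
    readme_txt = textwrap.dedent(
        f"""
        ---
        Summarisation model {model_name}
        """
    ).strip()

    print(readme_txt)  # lines now come out flush-left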
src/models/model.py CHANGED
@@ -1,10 +1,7 @@
-import shutil
-from getpass import getpass
-from pathlib import Path
+
 
 import torch
 import pandas as pd
-from huggingface_hub import HfApi, Repository
 from transformers import (
     AdamW,
     T5ForConditionalGeneration,
@@ -550,31 +547,3 @@ class Summarization:
             "rougeLsum High F1": results["rougeLsum"].high.fmeasure,
         }
         return output
-
-    def upload(self, hf_username, model_name):
-        hf_password = getpass("Enter your HuggingFace password")
-        if Path("./models").exists():
-            shutil.rmtree("./models")
-        token = HfApi().login(username=hf_username, password=hf_password)
-        del hf_password
-        model_url = HfApi().create_repo(token=token, name=model_name, exist_ok=True)
-        model_repo = Repository(
-            "./model",
-            clone_from=model_url,
-            use_auth_token=token,
-            git_email=f"{hf_username}@users.noreply.huggingface.co",
-            git_user=hf_username,
-        )
-
-        readme_txt = f"""
-        ---
-        Summarisation model {model_name}
-        """.strip()
-
-        (Path(model_repo.local_dir) / "README.md").write_text(readme_txt)
-        self.save_model()
-        commit_url = model_repo.push_to_hub()
-
-        print("Check out your model at:")
-        print(commit_url)
-        print(f"https://huggingface.co/{hf_username}/{model_name}")
src/models/predict_model.py CHANGED
@@ -8,7 +8,7 @@ def predict_model(text):
     """
     Predict the summary of the given text.
     """
-    with open("params.yml") as f:
+    with open("model_params.yml") as f:
         params = yaml.safe_load(f)
 
     model = Summarization()
src/models/train_model.py CHANGED
@@ -8,15 +8,15 @@ def train_model():
     """
     Train the model
     """
-    with open("params.yml") as f:
+    with open("model_params.yml") as f:
         params = yaml.safe_load(f)
 
     # Load the data
     train_df = pd.read_csv("data/processed/train.csv")
     eval_df = pd.read_csv("data/processed/validation.csv")
 
-    train_df = train_df.sample(frac=params["split"], replace=True, random_state=1)
-    eval_df = eval_df.sample(frac=params["split"], replace=True, random_state=1)
+    train_df = train_df.sample(random_state=1)
+    eval_df = eval_df.sample(random_state=1)
 
     model = Summarization()
     model.from_pretrained(
@@ -35,9 +35,6 @@ def train_model():
 
     model.save_model(model_dir=params["model_dir"])
 
-    if params["upload_to_hf"]:
-        model.upload(hf_username=params["hf_username"], model_name=params["name"])
-
 
 if __name__ == "__main__":
     train_model()
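
One behavior worth noting in the new sampling lines: pandas' DataFrame.sample defaults to n=1 when neither n nor frac is given, so train_df.sample(random_state=1) yields a single row rather than the full frame. A standalone illustration (not code from this repo):

    import pandas as pd

    df = pd.DataFrame({"text": ["a", "b", "c", "d"]})

    one_row = df.sample(random_state=1)           # no n or frac: defaults to n=1
    shuffled = df.sample(frac=1, random_state=1)  # frac=1 keeps every row, shuffled

    print(len(one_row), len(shuffled))  # prints: 1 4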
src/visualization/visualize.py CHANGED
@@ -1,7 +1,6 @@
 import streamlit as st
-import yaml
 
-from models import predict_model
+from ..models import predict_model
 
 
 def visualize():
@@ -25,8 +24,4 @@ def visualize():
 
 
 if __name__ == "__main__":
-    with open("params.yml") as f:
-        params = yaml.safe_load(f)
-
-    if params["visualise"]:
-        visualize()
+    visualize()
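
A caveat on the new relative import: "from ..models import predict_model" only resolves when visualize.py runs as part of a package. Launched directly (for example via streamlit run src/visualization/visualize.py), Python raises "ImportError: attempted relative import with no known parent package". A script-friendly alternative, sketched under the assumption that the repo keeps its src/models and src/visualization layout:

    import sys
    from pathlib import Path

    # Put src/ on the module search path so an absolute import works
    # even when this file is executed as a plain script.
    sys.path.append(str(Path(__file__).resolve().parents[1]))

    from models import predict_model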