Spaces:
Runtime error
Runtime error
Dean
committed on
Commit
·
d5a6d18
1
Parent(s):
c6912f8
committing after a bunch of fixes and before re-running with dvc
Browse files
- .gitignore +2 -1
- Makefile +9 -1
- data_params.yml +2 -0
- dvc.lock +13 -13
- dvc.yaml +11 -6
- params.yml → model_params.yml +6 -8
- reports/training_metrics.csv +10 -18
- reports/training_params.yml +1 -0
- requirements.txt +4 -3
- src/data/make_dataset.py +1 -1
- src/data/process_data.py +1 -1
- src/models/evaluate_model.py +1 -1
- src/models/hf_upload.py +47 -0
- src/models/model.py +1 -32
- src/models/predict_model.py +1 -1
- src/models/train_model.py +3 -6
- src/visualization/visualize.py +2 -7
.gitignore
CHANGED
@@ -96,4 +96,5 @@ coverage.xml
|
|
96 |
summarization-dagshub/
|
97 |
/models
|
98 |
default/
|
99 |
-
artifacts/
|
|
|
|
96 |
summarization-dagshub/
|
97 |
/models
|
98 |
default/
|
99 |
+
artifacts/
|
100 |
+
mlruns/
|
Makefile
CHANGED
@@ -48,7 +48,15 @@ pull:
|
|
48 |
|
49 |
## run the DVC pipeline - recompute any modified outputs such as processed data or trained models
|
50 |
run:
|
51 |
-
dvc repro dvc.yaml
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
|
53 |
#################################################################################
|
54 |
# PROJECT RULES #
|
|
|
48 |
|
49 |
## run the DVC pipeline - recompute any modified outputs such as processed data or trained models
|
50 |
run:
|
51 |
+
dvc repro dvc.yaml eval
|
52 |
+
|
53 |
+
## run the visualization using Streamlit
|
54 |
+
visualize:
|
55 |
+
dvc repro dvc.yaml visualize
|
56 |
+
|
57 |
+
## push the trained model to HF model hub
|
58 |
+
push_to_hf_hub:
|
59 |
+
dvc repro dvc.yaml push_to_hf_hub
|
60 |
|
61 |
#################################################################################
|
62 |
# PROJECT RULES #
|
data_params.yml
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
data: cnn_dailymail
|
2 |
+
split: 0.01
|
dvc.lock
CHANGED
@@ -54,30 +54,30 @@ stages:
|
|
54 |
size: 1358833013
|
55 |
nfiles: 3
|
56 |
- path: params.yml
|
57 |
-
md5:
|
58 |
-
size:
|
59 |
- path: src/data/process_data.py
|
60 |
-
md5:
|
61 |
size: 516
|
62 |
outs:
|
63 |
- path: data/processed/test.csv
|
64 |
-
md5:
|
65 |
-
size:
|
66 |
- path: data/processed/train.csv
|
67 |
-
md5:
|
68 |
-
size:
|
69 |
- path: data/processed/validation.csv
|
70 |
-
md5:
|
71 |
-
size:
|
72 |
download_data:
|
73 |
cmd: python src/data/make_dataset.py
|
74 |
deps:
|
75 |
- path: params.yml
|
76 |
-
md5:
|
77 |
-
size:
|
78 |
- path: src/data/make_dataset.py
|
79 |
-
md5:
|
80 |
-
size:
|
81 |
outs:
|
82 |
- path: data/raw
|
83 |
md5: 2ab20ac1b58df875a590b07d0e04eb5b.dir
|
|
|
54 |
size: 1358833013
|
55 |
nfiles: 3
|
56 |
- path: params.yml
|
57 |
+
md5: 160cbfd0ed8f87c9c5cb28fbeef1072d
|
58 |
+
size: 266
|
59 |
- path: src/data/process_data.py
|
60 |
+
md5: 5b6aaadc5a628979956d502b4fb4ebf2
|
61 |
size: 516
|
62 |
outs:
|
63 |
- path: data/processed/test.csv
|
64 |
+
md5: 5f2bfb37d55a13ead3c81564dbee2fd5
|
65 |
+
size: 508508
|
66 |
- path: data/processed/train.csv
|
67 |
+
md5: 707c5ed455a15ec48965daf92fed7df6
|
68 |
+
size: 12653913
|
69 |
- path: data/processed/validation.csv
|
70 |
+
md5: 1e021dc163cc87a32cef74a98e4a0d51
|
71 |
+
size: 558403
|
72 |
download_data:
|
73 |
cmd: python src/data/make_dataset.py
|
74 |
deps:
|
75 |
- path: params.yml
|
76 |
+
md5: 160cbfd0ed8f87c9c5cb28fbeef1072d
|
77 |
+
size: 266
|
78 |
- path: src/data/make_dataset.py
|
79 |
+
md5: 075c6233f8732eedf7915732f9a8ebfd
|
80 |
+
size: 771
|
81 |
outs:
|
82 |
- path: data/raw
|
83 |
md5: 2ab20ac1b58df875a590b07d0e04eb5b.dir
|
dvc.yaml
CHANGED
@@ -2,14 +2,14 @@ stages:
|
|
2 |
download_data:
|
3 |
cmd: python src/data/make_dataset.py
|
4 |
deps:
|
5 |
-
-
|
6 |
- src/data/make_dataset.py
|
7 |
outs:
|
8 |
- data/raw
|
9 |
process_data:
|
10 |
cmd: python src/data/process_data.py
|
11 |
deps:
|
12 |
-
-
|
13 |
- data/raw
|
14 |
- src/data/process_data.py
|
15 |
outs:
|
@@ -25,7 +25,7 @@ stages:
|
|
25 |
train:
|
26 |
cmd: python src/models/train_model.py
|
27 |
deps:
|
28 |
-
-
|
29 |
- data/processed/train.csv
|
30 |
- data/processed/validation.csv
|
31 |
- src/models/train_model.py
|
@@ -38,7 +38,7 @@ stages:
|
|
38 |
eval:
|
39 |
cmd: python src/models/evaluate_model.py
|
40 |
deps:
|
41 |
-
-
|
42 |
- data/processed/test.csv
|
43 |
- models
|
44 |
- src/models/evaluate_model.py
|
@@ -50,8 +50,13 @@ stages:
|
|
50 |
deps:
|
51 |
- models
|
52 |
- src/visualization/visualize.py
|
53 |
-
- params.yml
|
54 |
metrics:
|
55 |
-
- reports/visualization_metrics.
|
56 |
cache: false
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
|
|
|
2 |
download_data:
|
3 |
cmd: python src/data/make_dataset.py
|
4 |
deps:
|
5 |
+
- data_params.yml
|
6 |
- src/data/make_dataset.py
|
7 |
outs:
|
8 |
- data/raw
|
9 |
process_data:
|
10 |
cmd: python src/data/process_data.py
|
11 |
deps:
|
12 |
+
- data_params.yml
|
13 |
- data/raw
|
14 |
- src/data/process_data.py
|
15 |
outs:
|
|
|
25 |
train:
|
26 |
cmd: python src/models/train_model.py
|
27 |
deps:
|
28 |
+
- model_params.yml
|
29 |
- data/processed/train.csv
|
30 |
- data/processed/validation.csv
|
31 |
- src/models/train_model.py
|
|
|
38 |
eval:
|
39 |
cmd: python src/models/evaluate_model.py
|
40 |
deps:
|
41 |
+
- model_params.yml
|
42 |
- data/processed/test.csv
|
43 |
- models
|
44 |
- src/models/evaluate_model.py
|
|
|
50 |
deps:
|
51 |
- models
|
52 |
- src/visualization/visualize.py
|
|
|
53 |
metrics:
|
54 |
+
- reports/visualization_metrics.txt:
|
55 |
cache: false
|
56 |
+
push_to_hf_hub:
|
57 |
+
cmd: python src/models/hf_upload.py
|
58 |
+
deps:
|
59 |
+
- model_params.yml
|
60 |
+
- src/models/hf_upload.py
|
61 |
+
- models
|
62 |
|
params.yml → model_params.yml
RENAMED
@@ -1,16 +1,14 @@
|
|
1 |
name: summarsiation
|
2 |
-
data: cnn_dailymail
|
3 |
-
batch_size: 2
|
4 |
-
num_workers: 2
|
5 |
model_type: t5
|
6 |
model_name: t5-small
|
7 |
-
|
8 |
epochs: 5
|
9 |
-
|
|
|
|
|
10 |
model_dir: models
|
11 |
metric: rouge
|
12 |
-
|
13 |
-
use_gpu: True
|
14 |
visualise: True
|
15 |
hf_username: gagan3012
|
16 |
-
upload_to_hf:
|
|
|
1 |
name: summarsiation
|
|
|
|
|
|
|
2 |
model_type: t5
|
3 |
model_name: t5-small
|
4 |
+
batch_size: 2
|
5 |
epochs: 5
|
6 |
+
use_gpu: True
|
7 |
+
learning_rate: 1e-4
|
8 |
+
num_workers: 2
|
9 |
model_dir: models
|
10 |
metric: rouge
|
11 |
+
source_dir: src
|
|
|
12 |
visualise: True
|
13 |
hf_username: gagan3012
|
14 |
+
upload_to_hf: False
|
reports/training_metrics.csv
CHANGED
@@ -1,19 +1,11 @@
|
|
1 |
Name,Value,Timestamp,Step
|
2 |
-
"
|
3 |
-
"epoch",0,
|
4 |
-
"val_loss",
|
5 |
-
"epoch",
|
6 |
-
"
|
7 |
-
"epoch",
|
8 |
-
"
|
9 |
-
"epoch",
|
10 |
-
"
|
11 |
-
"epoch",
|
12 |
-
"val_loss",2.311669111251831,1627559521015,173
|
13 |
-
"epoch",2,1627559521015,173
|
14 |
-
"train_loss",0.9744294881820679,1627559532066,199
|
15 |
-
"epoch",3,1627559532066,199
|
16 |
-
"val_loss",2.2401840686798096,1627559535896,231
|
17 |
-
"epoch",3,1627559535896,231
|
18 |
-
"train_loss",2.785480260848999,1627559548623,249
|
19 |
-
"epoch",4,1627559548623,249
|
|
|
1 |
Name,Value,Timestamp,Step
|
2 |
+
"val_loss",5.029108047485352,1628177741756,14
|
3 |
+
"epoch",0,1628177741756,14
|
4 |
+
"val_loss",4.757647514343262,1628177893078,29
|
5 |
+
"epoch",1,1628177893078,29
|
6 |
+
"val_loss",4.493412494659424,1628177940684,44
|
7 |
+
"epoch",2,1628177940684,44
|
8 |
+
"train_loss",1.328701138496399,1628178045108,49
|
9 |
+
"epoch",3,1628178045108,49
|
10 |
+
"val_loss",4.228608131408691,1628178200552,59
|
11 |
+
"epoch",3,1628178200552,59
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
reports/training_params.yml
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
status: success
|
requirements.txt
CHANGED
@@ -5,10 +5,11 @@ transformers==4.9.0
|
|
5 |
torch==1.9.0
|
6 |
dagshub==0.1.7
|
7 |
pandas==1.1.5
|
8 |
-
rouge_score
|
|
|
|
|
|
|
9 |
pyyaml
|
10 |
-
dvc
|
11 |
-
mlflow
|
12 |
|
13 |
# external requirements
|
14 |
click
|
|
|
5 |
torch==1.9.0
|
6 |
dagshub==0.1.7
|
7 |
pandas==1.1.5
|
8 |
+
rouge_score==0.0.4
|
9 |
+
dvc==2.5.4
|
10 |
+
mlflow==1.19.0
|
11 |
+
streamlit==0.85.1
|
12 |
pyyaml
|
|
|
|
|
13 |
|
14 |
# external requirements
|
15 |
click
|
src/data/make_dataset.py
CHANGED
@@ -17,7 +17,7 @@ def make_dataset(dataset="cnn_dailymail", split="train"):
|
|
17 |
|
18 |
|
19 |
if __name__ == "__main__":
|
20 |
-
with open("
|
21 |
params = yaml.safe_load(f)
|
22 |
pprint.pprint(params)
|
23 |
make_dataset(dataset=params["data"], split="train")
|
|
|
17 |
|
18 |
|
19 |
if __name__ == "__main__":
|
20 |
+
with open("data_params.yml") as f:
|
21 |
params = yaml.safe_load(f)
|
22 |
pprint.pprint(params)
|
23 |
make_dataset(dataset=params["data"], split="train")
|
src/data/process_data.py
CHANGED
@@ -5,7 +5,7 @@ import os
|
|
5 |
|
6 |
def process_data(split="train"):
|
7 |
|
8 |
-
with open("
|
9 |
params = yaml.safe_load(f)
|
10 |
|
11 |
df = pd.read_csv("data/raw/{}.csv".format(split))
|
|
|
5 |
|
6 |
def process_data(split="train"):
|
7 |
|
8 |
+
with open("data_params.yml") as f:
|
9 |
params = yaml.safe_load(f)
|
10 |
|
11 |
df = pd.read_csv("data/raw/{}.csv".format(split))
|
src/models/evaluate_model.py
CHANGED
@@ -10,7 +10,7 @@ def evaluate_model():
|
|
10 |
"""
|
11 |
Evaluate model using rouge measure
|
12 |
"""
|
13 |
-
with open("
|
14 |
params = yaml.safe_load(f)
|
15 |
|
16 |
test_df = pd.read_csv("data/processed/test.csv")[:25]
|
|
|
10 |
"""
|
11 |
Evaluate model using rouge measure
|
12 |
"""
|
13 |
+
with open("model_params.yml") as f:
|
14 |
params = yaml.safe_load(f)
|
15 |
|
16 |
test_df = pd.read_csv("data/processed/test.csv")[:25]
|
src/models/hf_upload.py
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import shutil
|
2 |
+
from getpass import getpass
|
3 |
+
from pathlib import Path
|
4 |
+
import yaml
|
5 |
+
|
6 |
+
from model import Summarization
|
7 |
+
from huggingface_hub import HfApi, Repository
|
8 |
+
|
9 |
+
|
10 |
+
def upload(upload_model, model_name):
|
11 |
+
hf_username = input("Enter your HuggingFace username:")
|
12 |
+
hf_password = getpass("Enter your HuggingFace password:")
|
13 |
+
if Path("./models").exists():
|
14 |
+
shutil.rmtree("./models")
|
15 |
+
token = HfApi().login(username=hf_username, password=hf_password)
|
16 |
+
del hf_password
|
17 |
+
model_url = HfApi().create_repo(token=token, name=model_name, exist_ok=True)
|
18 |
+
model_repo = Repository(
|
19 |
+
"./model",
|
20 |
+
clone_from=model_url,
|
21 |
+
use_auth_token=token,
|
22 |
+
git_email=f"{hf_username}@users.noreply.huggingface.co",
|
23 |
+
git_user=hf_username,
|
24 |
+
)
|
25 |
+
|
26 |
+
readme_txt = f"""
|
27 |
+
---
|
28 |
+
Summarisation model {model_name}
|
29 |
+
""".strip()
|
30 |
+
|
31 |
+
(Path(model_repo.local_dir) / "README.md").write_text(readme_txt)
|
32 |
+
upload_model.save_model()
|
33 |
+
commit_url = model_repo.push_to_hub()
|
34 |
+
|
35 |
+
print("Check out your model at:")
|
36 |
+
print(commit_url)
|
37 |
+
print(f"https://huggingface.co/{hf_username}/{model_name}")
|
38 |
+
|
39 |
+
|
40 |
+
if __name__ == "__main__":
|
41 |
+
with open("model_params.yml") as f:
|
42 |
+
params = yaml.safe_load(f)
|
43 |
+
|
44 |
+
model = Summarization()
|
45 |
+
model.load_model(model_dir="./models")
|
46 |
+
|
47 |
+
upload(upload_model=model, model_name=params["name"])
|
src/models/model.py
CHANGED
@@ -1,10 +1,7 @@
|
|
1 |
-
|
2 |
-
from getpass import getpass
|
3 |
-
from pathlib import Path
|
4 |
|
5 |
import torch
|
6 |
import pandas as pd
|
7 |
-
from huggingface_hub import HfApi, Repository
|
8 |
from transformers import (
|
9 |
AdamW,
|
10 |
T5ForConditionalGeneration,
|
@@ -550,31 +547,3 @@ class Summarization:
|
|
550 |
"rougeLsum High F1": results["rougeLsum"].high.fmeasure,
|
551 |
}
|
552 |
return output
|
553 |
-
|
554 |
-
def upload(self, hf_username, model_name):
|
555 |
-
hf_password = getpass("Enter your HuggingFace password")
|
556 |
-
if Path("./models").exists():
|
557 |
-
shutil.rmtree("./models")
|
558 |
-
token = HfApi().login(username=hf_username, password=hf_password)
|
559 |
-
del hf_password
|
560 |
-
model_url = HfApi().create_repo(token=token, name=model_name, exist_ok=True)
|
561 |
-
model_repo = Repository(
|
562 |
-
"./model",
|
563 |
-
clone_from=model_url,
|
564 |
-
use_auth_token=token,
|
565 |
-
git_email=f"{hf_username}@users.noreply.huggingface.co",
|
566 |
-
git_user=hf_username,
|
567 |
-
)
|
568 |
-
|
569 |
-
readme_txt = f"""
|
570 |
-
---
|
571 |
-
Summarisation model {model_name}
|
572 |
-
""".strip()
|
573 |
-
|
574 |
-
(Path(model_repo.local_dir) / "README.md").write_text(readme_txt)
|
575 |
-
self.save_model()
|
576 |
-
commit_url = model_repo.push_to_hub()
|
577 |
-
|
578 |
-
print("Check out your model at:")
|
579 |
-
print(commit_url)
|
580 |
-
print(f"https://huggingface.co/{hf_username}/{model_name}")
|
|
|
1 |
+
|
|
|
|
|
2 |
|
3 |
import torch
|
4 |
import pandas as pd
|
|
|
5 |
from transformers import (
|
6 |
AdamW,
|
7 |
T5ForConditionalGeneration,
|
|
|
547 |
"rougeLsum High F1": results["rougeLsum"].high.fmeasure,
|
548 |
}
|
549 |
return output
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/models/predict_model.py
CHANGED
@@ -8,7 +8,7 @@ def predict_model(text):
|
|
8 |
"""
|
9 |
Predict the summary of the given text.
|
10 |
"""
|
11 |
-
with open("
|
12 |
params = yaml.safe_load(f)
|
13 |
|
14 |
model = Summarization()
|
|
|
8 |
"""
|
9 |
Predict the summary of the given text.
|
10 |
"""
|
11 |
+
with open("model_params.yml") as f:
|
12 |
params = yaml.safe_load(f)
|
13 |
|
14 |
model = Summarization()
|
src/models/train_model.py
CHANGED
@@ -8,15 +8,15 @@ def train_model():
|
|
8 |
"""
|
9 |
Train the model
|
10 |
"""
|
11 |
-
with open("
|
12 |
params = yaml.safe_load(f)
|
13 |
|
14 |
# Load the data
|
15 |
train_df = pd.read_csv("data/processed/train.csv")
|
16 |
eval_df = pd.read_csv("data/processed/validation.csv")
|
17 |
|
18 |
-
train_df = train_df.sample(
|
19 |
-
eval_df = eval_df.sample(
|
20 |
|
21 |
model = Summarization()
|
22 |
model.from_pretrained(
|
@@ -35,9 +35,6 @@ def train_model():
|
|
35 |
|
36 |
model.save_model(model_dir=params["model_dir"])
|
37 |
|
38 |
-
if params["upload_to_hf"]:
|
39 |
-
model.upload(hf_username=params["hf_username"], model_name=params["name"])
|
40 |
-
|
41 |
|
42 |
if __name__ == "__main__":
|
43 |
train_model()
|
|
|
8 |
"""
|
9 |
Train the model
|
10 |
"""
|
11 |
+
with open("model_params.yml") as f:
|
12 |
params = yaml.safe_load(f)
|
13 |
|
14 |
# Load the data
|
15 |
train_df = pd.read_csv("data/processed/train.csv")
|
16 |
eval_df = pd.read_csv("data/processed/validation.csv")
|
17 |
|
18 |
+
train_df = train_df.sample(random_state=1)
|
19 |
+
eval_df = eval_df.sample(random_state=1)
|
20 |
|
21 |
model = Summarization()
|
22 |
model.from_pretrained(
|
|
|
35 |
|
36 |
model.save_model(model_dir=params["model_dir"])
|
37 |
|
|
|
|
|
|
|
38 |
|
39 |
if __name__ == "__main__":
|
40 |
train_model()
|
src/visualization/visualize.py
CHANGED
@@ -1,7 +1,6 @@
|
|
1 |
import streamlit as st
|
2 |
-
import yaml
|
3 |
|
4 |
-
from models import predict_model
|
5 |
|
6 |
|
7 |
def visualize():
|
@@ -25,8 +24,4 @@ def visualize():
|
|
25 |
|
26 |
|
27 |
if __name__ == "__main__":
|
28 |
-
|
29 |
-
params = yaml.safe_load(f)
|
30 |
-
|
31 |
-
if params["visualise"]:
|
32 |
-
visualize()
|
|
|
1 |
import streamlit as st
|
|
|
2 |
|
3 |
+
from ..models import predict_model
|
4 |
|
5 |
|
6 |
def visualize():
|
|
|
24 |
|
25 |
|
26 |
if __name__ == "__main__":
|
27 |
+
visualize()
|
|
|
|
|
|
|
|