hakim committed
Commit f2492e6 · Parent: f68f6ad

model trainer added
config/config.yaml CHANGED
@@ -17,4 +17,11 @@ data_validation:
 data_transformation:
   root_dir: artifacts/data_transformation
   data_path: artifacts/data_ingestion/samsum_dataset
-  tokenizer_name: google/pegasus-cnn_dailymail
+  tokenizer_name: google/pegasus-cnn_dailymail
+
+
+
+model_trainer:
+  root_dir: artifacts/model_trainer
+  data_path: artifacts/data_transformation/samsum_dataset
+  model_ckpt: google/pegasus-cnn_dailymail
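The new model_trainer block mirrors the root_dir/data_path layout of the earlier stages. A minimal sketch of how such a block is read back with dotted attribute access, assuming the project's read_yaml helper wraps the parsed YAML in python-box's ConfigBox (which the self.config.model_trainer access in configuration.py below suggests):

import yaml
from box import ConfigBox

# ConfigBox turns nested dicts into attribute access, which is what
# expressions like config.model_trainer.root_dir rely on.
with open("config/config.yaml") as f:
    config = ConfigBox(yaml.safe_load(f))

print(config.model_trainer.root_dir)    # artifacts/model_trainer
print(config.model_trainer.model_ckpt)  # google/pegasus-cnn_dailymail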
main.py CHANGED
@@ -1,6 +1,7 @@
 from textsummarizer.pipeline.stage_01_data_ingestion import DataIngestionPipeline
 from textsummarizer.pipeline.stage_02_data_validation import DataValidationPipeline
 from textsummarizer.pipeline.stage_03_data_transformation import DataTransformationPipeline
+from textsummarizer.pipeline.stage_04_model_trainer import ModelTrainerPipeline
 from textsummarizer.logging import logger

 STAGE_NAME = "Data Ingestion stage"
@@ -31,6 +32,18 @@ try:
     data_transformaion = DataTransformationPipeline()
     data_transformaion.main()
     logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
+except Exception as e:
+    logger.exception(e)
+    raise e
+
+
+
+STAGE_NAME = "Model Trainer stage"
+try:
+    logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
+    model_trainer = ModelTrainerPipeline()
+    model_trainer.main()
+    logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
 except Exception as e:
     logger.exception(e)
     raise e
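The four stage blocks in main.py now share an identical try/log/run/log shape. A hedged sketch of collapsing them into one loop; the validation and transformation stage names are assumed, since the excerpt only shows the ingestion and trainer ones:

from textsummarizer.pipeline.stage_01_data_ingestion import DataIngestionPipeline
from textsummarizer.pipeline.stage_02_data_validation import DataValidationPipeline
from textsummarizer.pipeline.stage_03_data_transformation import DataTransformationPipeline
from textsummarizer.pipeline.stage_04_model_trainer import ModelTrainerPipeline
from textsummarizer.logging import logger

# (stage name, pipeline class) pairs, in execution order
STAGES = [
    ("Data Ingestion stage", DataIngestionPipeline),
    ("Data Validation stage", DataValidationPipeline),
    ("Data Transformation stage", DataTransformationPipeline),
    ("Model Trainer stage", ModelTrainerPipeline),
]

for stage_name, pipeline_cls in STAGES:
    try:
        logger.info(f">>>>>> stage {stage_name} started <<<<<<")
        pipeline_cls().main()
        logger.info(f">>>>>> stage {stage_name} completed <<<<<<\n\nx==========x")
    except Exception as e:
        logger.exception(e)
        raise e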
params.yaml CHANGED
@@ -1 +1,11 @@
-key : val
+TrainingArguments:
+  num_train_epochs: 1
+  warmup_steps: 500
+  per_device_train_batch_size: 1
+  weight_decay: 0.01
+  logging_steps: 10
+  evaluation_strategy: steps
+  eval_steps: 500
+  save_steps: 1e6
+  gradient_accumulation_steps: 16
+
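One YAML subtlety in the hunk above: PyYAML implements YAML 1.1, whose float grammar requires a dot and a signed exponent, so save_steps: 1e6 loads as the string "1e6" rather than a number. A quick check (assuming the project's read_yaml is PyYAML-based):

import yaml

# YAML 1.1 floats need a dot and a signed exponent; "1e6" stays a string.
print(type(yaml.safe_load("save_steps: 1e6")["save_steps"]))      # <class 'str'>
print(type(yaml.safe_load("save_steps: 1.0e+6")["save_steps"]))   # <class 'float'>
print(type(yaml.safe_load("save_steps: 1000000")["save_steps"]))  # <class 'int'>

Writing save_steps: 1000000 sidesteps the cast; notice that the trainer component below hardcodes the value instead of reading it from config.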
research/model_trainer.ipynb ADDED
File without changes
src/textsummarizer/config/configuration.py CHANGED
@@ -2,7 +2,8 @@ from textsummarizer.constants import *
 from textsummarizer.utils.common import read_yaml, create_directories
 from textsummarizer.entity.config_entity import (DataIngestionConfig,
                                                  DataValidationConfig,
-                                                 DataTransformationConfig)
+                                                 DataTransformationConfig,
+                                                 ModelTrainerConfig)

 class ConfigurationManager:
     def __init__(
@@ -58,5 +59,30 @@ class ConfigurationManager:
         )

         return data_transformation_config
+
+
+    def get_model_trainer_config(self) -> ModelTrainerConfig:
+        config = self.config.model_trainer
+        params = self.params.TrainingArguments
+
+        create_directories([config.root_dir])
+
+
+        model_trainer_config = ModelTrainerConfig(
+            root_dir=config.root_dir,
+            data_path=config.data_path,
+            model_ckpt=config.model_ckpt,
+            num_train_epochs=params.num_train_epochs,
+            warmup_steps=params.warmup_steps,
+            per_device_train_batch_size=params.per_device_train_batch_size,
+            weight_decay=params.weight_decay,
+            logging_steps=params.logging_steps,
+            evaluation_strategy=params.evaluation_strategy,
+            eval_steps=params.eval_steps,
+            save_steps=params.save_steps,
+            gradient_accumulation_steps=params.gradient_accumulation_steps
+        )
+
+        return model_trainer_config

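A short usage sketch of the new accessor, assuming config.yaml and params.yaml live at the default paths the ConfigurationManager constructor reads:

from textsummarizer.config.configuration import ConfigurationManager

config_manager = ConfigurationManager()
model_trainer_config = config_manager.get_model_trainer_config()

# root_dir/data_path/model_ckpt come from config/config.yaml; the
# remaining fields come from the TrainingArguments block of params.yaml.
print(model_trainer_config.model_ckpt)        # google/pegasus-cnn_dailymail
print(model_trainer_config.num_train_epochs)  # 1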
src/textsummarizer/conponents/model_trainer.py ADDED
@@ -0,0 +1,49 @@
+from transformers import TrainingArguments, Trainer
+from transformers import DataCollatorForSeq2Seq
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+from datasets import load_from_disk
+from textsummarizer.entity.config_entity import ModelTrainerConfig
+import torch
+import os
+
+
+class ModelTrainer:
+    def __init__(self, config: ModelTrainerConfig):
+        self.config = config
+        # Keep Weights & Biases from attaching to the training run
+        os.environ["WANDB_DISABLED"] = "true"
+
+    def train(self):
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        tokenizer = AutoTokenizer.from_pretrained(self.config.model_ckpt)
+        model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_ckpt).to(device)
+        # Dynamically pads inputs and labels to the batch maximum length
+        seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)
+
+        # Load the tokenized dataset produced by the transformation stage
+        dataset_samsum_pt = load_from_disk(self.config.data_path)
+
+        trainer_args = TrainingArguments(
+            output_dir=self.config.root_dir,
+            num_train_epochs=self.config.num_train_epochs,
+            warmup_steps=self.config.warmup_steps,
+            per_device_train_batch_size=self.config.per_device_train_batch_size,
+            per_device_eval_batch_size=self.config.per_device_train_batch_size,
+            weight_decay=self.config.weight_decay,
+            logging_steps=self.config.logging_steps,
+            evaluation_strategy=self.config.evaluation_strategy,
+            eval_steps=self.config.eval_steps,
+            save_steps=1e6,  # hardcoded; self.config.save_steps is left unused here
+            gradient_accumulation_steps=self.config.gradient_accumulation_steps,
+            report_to="none"
+        )
+
+        trainer = Trainer(model=model_pegasus, args=trainer_args,
+                          tokenizer=tokenizer, data_collator=seq2seq_data_collator,
+                          train_dataset=dataset_samsum_pt["train"],
+                          eval_dataset=dataset_samsum_pt["validation"])
+
+        trainer.train()
+
+        # Save the fine-tuned model and its tokenizer under the stage's root_dir
+        model_pegasus.save_pretrained(os.path.join(self.config.root_dir, "pegasus-samsum-model"))
+        tokenizer.save_pretrained(os.path.join(self.config.root_dir, "tokenizer"))
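The collator is doing real work here, so a self-contained sketch of its padding behaviour may help; the toy feature dicts are invented for illustration:

from transformers import AutoTokenizer, DataCollatorForSeq2Seq

tokenizer = AutoTokenizer.from_pretrained("google/pegasus-cnn_dailymail")
collator = DataCollatorForSeq2Seq(tokenizer)

# Two toy examples of unequal length
features = [
    {"input_ids": [5, 6, 7], "labels": [8, 9]},
    {"input_ids": [5, 6], "labels": [8, 9, 10, 11]},
]
batch = collator(features)

# input_ids are padded with the tokenizer's pad token, labels with -100,
# so padded label positions are ignored by the loss.
print(batch["labels"])

Note also that per_device_eval_batch_size reuses per_device_train_batch_size; a separate eval batch size would need its own field in config and params.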
src/textsummarizer/entity/config_entity.py CHANGED
@@ -23,4 +23,18 @@ class DataTransformationConfig:
     data_path : Path
     tokenizer_name : Path

-
+
+@dataclass(frozen=True)
+class ModelTrainerConfig:
+    root_dir: Path
+    data_path: Path
+    model_ckpt: Path
+    num_train_epochs: int
+    warmup_steps: int
+    per_device_train_batch_size: int
+    weight_decay: float
+    logging_steps: int
+    evaluation_strategy: str
+    eval_steps: int
+    save_steps: float
+    gradient_accumulation_steps: int
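frozen=True makes these config objects read-only after construction, so no stage can silently mutate shared configuration. A small demonstration with a hypothetical two-field dataclass in the same style:

from dataclasses import dataclass, FrozenInstanceError
from pathlib import Path

@dataclass(frozen=True)
class DemoConfig:  # hypothetical, mirrors ModelTrainerConfig
    root_dir: Path
    num_train_epochs: int

cfg = DemoConfig(root_dir=Path("artifacts/model_trainer"), num_train_epochs=1)
try:
    cfg.num_train_epochs = 2
except FrozenInstanceError:
    print("config is immutable")  # this branch runs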
src/textsummarizer/pipeline/stage_04_model_trainer.py ADDED
@@ -0,0 +1,12 @@
+from textsummarizer.conponents.model_trainer import ModelTrainer
+from textsummarizer.config.configuration import ConfigurationManager
+
+class ModelTrainerPipeline:
+    def __init__(self):
+        pass
+
+    def main(self):
+        config = ConfigurationManager()
+        model_trainer_config = config.get_model_trainer_config()
+        model_trainer = ModelTrainer(config=model_trainer_config)
+        model_trainer.train()
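The new stage can also be exercised on its own, outside main.py. A minimal standalone entry point, sketched here rather than part of this commit:

from textsummarizer.pipeline.stage_04_model_trainer import ModelTrainerPipeline
from textsummarizer.logging import logger

if __name__ == "__main__":
    try:
        ModelTrainerPipeline().main()
    except Exception as e:
        logger.exception(e)
        raise e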