gagan3012 committed
Commit c6e4955 · Parent(s): c9eec48

black style added

Makefile CHANGED
@@ -35,6 +35,7 @@ clean:
 ## Lint using flake8
 lint:
 	flake8 src
+	black src
 
 ## Upload Data to default DVC remote
 push:
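The new `black src` step rewrites files in place, and the rest of this commit is simply black's output. A minimal sketch of what black does to this codebase's style, using its programmatic API (assuming a recent black release, which exposes `black.format_str` and `black.Mode`):

import black

# One-statement defs get split onto two lines and single quotes become
# double quotes -- the same churn visible in the Python diffs below.
messy = "def make_dataset(dataset='cnn_dailymail', split='train'): return dataset"
print(black.format_str(messy, mode=black.Mode()))
# def make_dataset(dataset="cnn_dailymail", split="train"):
#     return dataset

In a `lint` target, `black --check src` may be preferable to `black src`, since it fails on unformatted files rather than rewriting them during a lint run.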
src/data/make_dataset.py CHANGED
@@ -5,22 +5,21 @@ import os
 import pprint
 
 
-
-def make_dataset(dataset='cnn_dailymail', split='train'):
+def make_dataset(dataset="cnn_dailymail", split="train"):
     """make dataset for summarisation"""
-    if not os.path.exists('data/raw'):
-        os.makedirs('data/raw')
-    dataset = load_dataset(dataset, '3.0.0', split=split)
+    if not os.path.exists("data/raw"):
+        os.makedirs("data/raw")
+    dataset = load_dataset(dataset, "3.0.0", split=split)
     df = pd.DataFrame()
-    df['article'] = dataset['article']
-    df['highlights'] = dataset['highlights']
-    df.to_csv('data/raw/{}.csv'.format(split))
+    df["article"] = dataset["article"]
+    df["highlights"] = dataset["highlights"]
+    df.to_csv("data/raw/{}.csv".format(split))
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     with open("params.yml") as f:
         params = yaml.safe_load(f)
     pprint.pprint(params)
-    make_dataset(dataset=params['data'], split='train')
-    make_dataset(dataset=params['data'], split='test')
-    make_dataset(dataset=params['data'], split='validation')
+    make_dataset(dataset=params["data"], split="train")
+    make_dataset(dataset=params["data"], split="test")
+    make_dataset(dataset=params["data"], split="validation")
src/data/process_data.py CHANGED
@@ -3,20 +3,20 @@ import yaml
 import os
 
 
-def process_data(split='train'):
+def process_data(split="train"):
 
     with open("params.yml") as f:
         params = yaml.safe_load(f)
 
-    df = pd.read_csv('data/raw/{}.csv'.format(split))
-    df.columns = ['Unnamed: 0', 'input_text', 'output_text']
-    df = df.sample(frac=params['split'], replace=True, random_state=1)
+    df = pd.read_csv("data/raw/{}.csv".format(split))
+    df.columns = ["Unnamed: 0", "input_text", "output_text"]
+    df = df.sample(frac=params["split"], replace=True, random_state=1)
     if os.path.exists("data/raw/{}.csv".format(split)):
         os.remove("data/raw/{}.csv".format(split))
-    df.to_csv('data/processed/{}.csv'.format(split))
+    df.to_csv("data/processed/{}.csv".format(split))
 
 
-if __name__ == '__main__':
-    process_data(split='train')
-    process_data(split='test')
-    process_data(split='validation')
+if __name__ == "__main__":
+    process_data(split="train")
+    process_data(split="test")
+    process_data(split="validation")
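A side note on the `Unnamed: 0` column renamed above: it exists because `make_dataset` calls `to_csv` without `index=False`, so pandas writes the row index as an unnamed leading column. A small sketch of the behavior (the file name is illustrative):

import pandas as pd

df = pd.DataFrame({"article": ["a"], "highlights": ["b"]})
df.to_csv("demo.csv")  # default index=True writes the row index
print(pd.read_csv("demo.csv").columns.tolist())
# ['Unnamed: 0', 'article', 'highlights']
df.to_csv("demo.csv", index=False)  # would drop the extra column at the source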
src/models/__init__.py CHANGED
@@ -1 +1 @@
-from .model import Summarization
+from .model import Summarization
src/models/evaluate_model.py CHANGED
@@ -13,14 +13,14 @@ def evaluate_model():
     with open("params.yml") as f:
         params = yaml.safe_load(f)
 
-    test_df = pd.read_csv('data/processed/test.csv')[:25]
+    test_df = pd.read_csv("data/processed/test.csv")[:25]
     model = Summarization()
-    model.load_model(model_type=params['model_type'], model_dir=params['model_dir'])
-    results = model.evaluate(test_df=test_df, metrics=params['metric'])
+    model.load_model(model_type=params["model_type"], model_dir=params["model_dir"])
+    results = model.evaluate(test_df=test_df, metrics=params["metric"])
 
-    with open('reports/metrics.txt', 'w') as fp:
+    with open("reports/metrics.txt", "w") as fp:
         json.dump(results, fp)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     evaluate_model()
src/models/model.py CHANGED
@@ -3,7 +3,10 @@ import pandas as pd
 from transformers import (
     AdamW,
     T5ForConditionalGeneration,
-    T5TokenizerFast as T5Tokenizer, MT5Tokenizer, MT5ForConditionalGeneration, ByT5Tokenizer,
+    T5TokenizerFast as T5Tokenizer,
+    MT5Tokenizer,
+    MT5ForConditionalGeneration,
+    ByT5Tokenizer,
 )
 from torch.utils.data import Dataset, DataLoader
 import pytorch_lightning as pl
@@ -28,11 +31,11 @@ class DataModule(Dataset):
     """
 
     def __init__(
-            self,
-            data: pd.DataFrame,
-            tokenizer: T5Tokenizer,
-            source_max_token_len: int = 512,
-            target_max_token_len: int = 512,
+        self,
+        data: pd.DataFrame,
+        tokenizer: T5Tokenizer,
+        source_max_token_len: int = 512,
+        target_max_token_len: int = 512,
     ):
         """
         :param data:
@@ -72,9 +75,7 @@ class DataModule(Dataset):
         )
 
         labels = output_encoding["input_ids"]
-        labels[
-            labels == 0
-        ] = -100
+        labels[labels == 0] = -100
 
         return dict(
             keywords=data_row["input_text"],
@@ -88,15 +89,15 @@ class DataModule(Dataset):
 
 class PLDataModule(LightningDataModule):
     def __init__(
-            self,
-            train_df: pd.DataFrame,
-            test_df: pd.DataFrame,
-            tokenizer: T5Tokenizer,
-            source_max_token_len: int = 512,
-            target_max_token_len: int = 512,
-            batch_size: int = 4,
-            split: float = 0.1,
-            num_workers: int = 2
+        self,
+        train_df: pd.DataFrame,
+        test_df: pd.DataFrame,
+        tokenizer: T5Tokenizer,
+        source_max_token_len: int = 512,
+        target_max_token_len: int = 512,
+        batch_size: int = 4,
+        split: float = 0.1,
+        num_workers: int = 2,
     ):
         """
         :param data_df:
@@ -131,28 +132,45 @@ class PLDataModule(LightningDataModule):
         )
 
     def train_dataloader(self):
-        """ training dataloader """
+        """training dataloader"""
         return DataLoader(
-            self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=self.num_workers
+            self.train_dataset,
+            batch_size=self.batch_size,
+            shuffle=True,
+            num_workers=self.num_workers,
         )
 
     def test_dataloader(self):
-        """ test dataloader """
+        """test dataloader"""
         return DataLoader(
-            self.test_dataset, batch_size=self.batch_size, shuffle=False, num_workers=self.num_workers
+            self.test_dataset,
+            batch_size=self.batch_size,
+            shuffle=False,
+            num_workers=self.num_workers,
        )
 
     def val_dataloader(self):
-        """ validation dataloader """
+        """validation dataloader"""
         return DataLoader(
-            self.test_dataset, batch_size=self.batch_size, shuffle=False, num_workers=self.num_workers
+            self.test_dataset,
+            batch_size=self.batch_size,
+            shuffle=False,
+            num_workers=self.num_workers,
         )
 
 
 class LightningModel(LightningModule):
-    """ PyTorch Lightning Model class"""
+    """PyTorch Lightning Model class"""
 
-    def __init__(self, tokenizer, model, learning_rate, adam_epsilon, weight_decay, output: str = "outputs"):
+    def __init__(
+        self,
+        tokenizer,
+        model,
+        learning_rate,
+        adam_epsilon,
+        weight_decay,
+        output: str = "outputs",
+    ):
         """
         initiates a PyTorch Lightning Model
         Args:
@@ -169,7 +187,7 @@ class LightningModel(LightningModule):
         self.weight_decay = weight_decay
 
     def forward(self, input_ids, attention_mask, decoder_attention_mask, labels=None):
-        """ forward step """
+        """forward step"""
         output = self.model(
             input_ids,
             attention_mask=attention_mask,
@@ -180,7 +198,7 @@ class LightningModel(LightningModule):
         return output.loss, output.logits
 
     def training_step(self, batch, batch_size):
-        """ training step """
+        """training step"""
         input_ids = batch["keywords_input_ids"]
         attention_mask = batch["keywords_attention_mask"]
         labels = batch["labels"]
@@ -196,7 +214,7 @@ class LightningModel(LightningModule):
         return loss
 
     def validation_step(self, batch, batch_size):
-        """ validation step """
+        """validation step"""
         input_ids = batch["keywords_input_ids"]
         attention_mask = batch["keywords_attention_mask"]
         labels = batch["labels"]
@@ -212,7 +230,7 @@ class LightningModel(LightningModule):
         return loss
 
     def test_step(self, batch, batch_size):
-        """ test step """
+        """test step"""
         input_ids = batch["keywords_input_ids"]
         attention_mask = batch["keywords_attention_mask"]
         labels = batch["labels"]
@@ -229,29 +247,39 @@ class LightningModel(LightningModule):
         return loss
 
     def configure_optimizers(self):
-        """ configure optimizers """
+        """configure optimizers"""
         model = self.model
         no_decay = ["bias", "LayerNorm.weight"]
         optimizer_grouped_parameters = [
             {
-                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+                "params": [
+                    p
+                    for n, p in model.named_parameters()
+                    if not any(nd in n for nd in no_decay)
+                ],
                 "weight_decay": self.weight_decay,
             },
             {
-                "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
+                "params": [
+                    p
+                    for n, p in model.named_parameters()
+                    if any(nd in n for nd in no_decay)
+                ],
                 "weight_decay": 0.0,
             },
         ]
-        optimizer = AdamW(optimizer_grouped_parameters, lr=self.learning_rate, eps=self.adam_epsilon)
+        optimizer = AdamW(
+            optimizer_grouped_parameters, lr=self.learning_rate, eps=self.adam_epsilon
+        )
         self.opt = optimizer
         return [optimizer]
 
 
 class Summarization:
-    """ Custom Summarization class """
+    """Custom Summarization class"""
 
     def __init__(self) -> None:
-        """ initiates Summarization class """
+        """initiates Summarization class"""
         pass
 
     def from_pretrained(self, model_type="t5", model_name="t5-base") -> None:
@@ -278,20 +306,20 @@ class Summarization:
         )
 
     def train(
-            self,
-            train_df: pd.DataFrame,
-            eval_df: pd.DataFrame,
-            source_max_token_len: int = 512,
-            target_max_token_len: int = 512,
-            batch_size: int = 8,
-            max_epochs: int = 5,
-            use_gpu: bool = True,
-            outputdir: str = "models",
-            early_stopping_patience_epochs: int = 0,  # 0 to disable early stopping feature
-            learning_rate: float = 0.0001,
-            adam_epsilon: float = 0.01,
-            num_workers: int = 2,
-            weight_decay: float = 0.0001
+        self,
+        train_df: pd.DataFrame,
+        eval_df: pd.DataFrame,
+        source_max_token_len: int = 512,
+        target_max_token_len: int = 512,
+        batch_size: int = 8,
+        max_epochs: int = 5,
+        use_gpu: bool = True,
+        outputdir: str = "models",
+        early_stopping_patience_epochs: int = 0,  # 0 to disable early stopping feature
+        learning_rate: float = 0.0001,
+        adam_epsilon: float = 0.01,
+        num_workers: int = 2,
+        weight_decay: float = 0.0001,
     ):
         """
         trains T5/MT5 model on custom dataset
@@ -323,12 +351,18 @@ class Summarization:
         )
 
         self.T5Model = LightningModel(
-            tokenizer=self.tokenizer, model=self.model, output=outputdir,
-            learning_rate=learning_rate, adam_epsilon=adam_epsilon, weight_decay=weight_decay
+            tokenizer=self.tokenizer,
+            model=self.model,
+            output=outputdir,
+            learning_rate=learning_rate,
+            adam_epsilon=adam_epsilon,
+            weight_decay=weight_decay,
         )
 
-        MLlogger = MLFlowLogger(experiment_name="Summarization",
-                                tracking_uri="https://dagshub.com/gagan3012/summarization.mlflow")
+        MLlogger = MLFlowLogger(
+            experiment_name="Summarization",
+            tracking_uri="https://dagshub.com/gagan3012/summarization.mlflow",
+        )
 
         WandLogger = WandbLogger(project="summarization-dagshub")
 
@@ -361,7 +395,7 @@ class Summarization:
         trainer.fit(self.T5Model, self.data_module)
 
     def load_model(
-            self, model_type: str = 't5', model_dir: str = "models", use_gpu: bool = False
+        self, model_type: str = "t5", model_dir: str = "models", use_gpu: bool = False
     ):
         """
         loads a checkpoint for inferencing/prediction
@@ -390,16 +424,15 @@ class Summarization:
             if torch.cuda.is_available():
                 self.device = torch.device("cuda")
             else:
-                raise Exception("exception ---> no gpu found. set use_gpu=False, to use CPU")
+                raise Exception(
+                    "exception ---> no gpu found. set use_gpu=False, to use CPU"
+                )
         else:
             self.device = torch.device("cpu")
 
         self.model = self.model.to(self.device)
 
-    def save_model(
-            self,
-            model_dir="models"
-    ):
+    def save_model(self, model_dir="models"):
         """
         Save model to dir
         :param model_dir:
@@ -410,19 +443,19 @@ class Summarization:
         self.model.save_pretrained(path)
 
     def predict(
-            self,
-            source_text: str,
-            max_length: int = 512,
-            num_return_sequences: int = 1,
-            num_beams: int = 2,
-            top_k: int = 50,
-            top_p: float = 0.95,
-            do_sample: bool = True,
-            repetition_penalty: float = 2.5,
-            length_penalty: float = 1.0,
-            early_stopping: bool = True,
-            skip_special_tokens: bool = True,
-            clean_up_tokenization_spaces: bool = True,
+        self,
+        source_text: str,
+        max_length: int = 512,
+        num_return_sequences: int = 1,
+        num_beams: int = 2,
+        top_k: int = 50,
+        top_p: float = 0.95,
+        do_sample: bool = True,
+        repetition_penalty: float = 2.5,
+        length_penalty: float = 1.0,
+        early_stopping: bool = True,
+        skip_special_tokens: bool = True,
+        clean_up_tokenization_spaces: bool = True,
    ):
         """
         generates prediction for T5/MT5 model
@@ -465,14 +498,10 @@ class Summarization:
         )
         return preds
 
-    def evaluate(
-            self,
-            test_df: pd.DataFrame,
-            metrics: str = "rouge"
-    ):
+    def evaluate(self, test_df: pd.DataFrame, metrics: str = "rouge"):
         metric = load_metric(metrics)
-        input_text = test_df['input_text']
-        references = test_df['output_text']
+        input_text = test_df["input_text"]
+        references = test_df["output_text"]
         references = references.to_list()
 
         predictions = [self.predict(x) for x in tqdm(input_text)]
@@ -480,49 +509,49 @@ class Summarization:
         results = metric.compute(predictions=predictions, references=references)
 
         output = {
-            'Rouge 1': {
-                'Rouge_1 Low Precision': results["rouge1"].low.precision,
-                'Rouge_1 Low recall': results["rouge1"].low.recall,
-                'Rouge_1 Low F1': results["rouge1"].low.fmeasure,
-                'Rouge_1 Mid Precision': results["rouge1"].mid.precision,
-                'Rouge_1 Mid recall': results["rouge1"].mid.recall,
-                'Rouge_1 Mid F1': results["rouge1"].mid.fmeasure,
-                'Rouge_1 High Precision': results["rouge1"].high.precision,
-                'Rouge_1 High recall': results["rouge1"].high.recall,
-                'Rouge_1 High F1': results["rouge1"].high.fmeasure,
-            },
-            'Rouge 2': {
-                'Rouge_2 Low Precision': results["rouge2"].low.precision,
-                'Rouge_2 Low recall': results["rouge2"].low.recall,
-                'Rouge_2 Low F1': results["rouge2"].low.fmeasure,
-                'Rouge_2 Mid Precision': results["rouge2"].mid.precision,
-                'Rouge_2 Mid recall': results["rouge2"].mid.recall,
-                'Rouge_2 Mid F1': results["rouge2"].mid.fmeasure,
-                'Rouge_2 High Precision': results["rouge2"].high.precision,
-                'Rouge_2 High recall': results["rouge2"].high.recall,
-                'Rouge_2 High F1': results["rouge2"].high.fmeasure,
-            },
-            'Rouge L': {
-                'Rouge_L Low Precision': results["rougeL"].low.precision,
-                'Rouge_L Low recall': results["rougeL"].low.recall,
-                'Rouge_L Low F1': results["rougeL"].low.fmeasure,
-                'Rouge_L Mid Precision': results["rougeL"].mid.precision,
-                'Rouge_L Mid recall': results["rougeL"].mid.recall,
-                'Rouge_L Mid F1': results["rougeL"].mid.fmeasure,
-                'Rouge_L High Precision': results["rougeL"].high.precision,
-                'Rouge_L High recall': results["rougeL"].high.recall,
-                'Rouge_L High F1': results["rougeL"].high.fmeasure,
-            },
-            'rougeLsum': {
-                'rougeLsum Low Precision': results["rougeLsum"].low.precision,
-                'rougeLsum Low recall': results["rougeLsum"].low.recall,
-                'rougeLsum Low F1': results["rougeLsum"].low.fmeasure,
-                'rougeLsum Mid Precision': results["rougeLsum"].mid.precision,
-                'rougeLsum Mid recall': results["rougeLsum"].mid.recall,
-                'rougeLsum Mid F1': results["rougeLsum"].mid.fmeasure,
-                'rougeLsum High Precision': results["rougeLsum"].high.precision,
-                'rougeLsum High recall': results["rougeLsum"].high.recall,
-                'rougeLsum High F1': results["rougeLsum"].high.fmeasure,
-            }
+            "Rouge 1": {
+                "Rouge_1 Low Precision": results["rouge1"].low.precision,
+                "Rouge_1 Low recall": results["rouge1"].low.recall,
+                "Rouge_1 Low F1": results["rouge1"].low.fmeasure,
+                "Rouge_1 Mid Precision": results["rouge1"].mid.precision,
+                "Rouge_1 Mid recall": results["rouge1"].mid.recall,
+                "Rouge_1 Mid F1": results["rouge1"].mid.fmeasure,
+                "Rouge_1 High Precision": results["rouge1"].high.precision,
+                "Rouge_1 High recall": results["rouge1"].high.recall,
+                "Rouge_1 High F1": results["rouge1"].high.fmeasure,
+            },
+            "Rouge 2": {
+                "Rouge_2 Low Precision": results["rouge2"].low.precision,
+                "Rouge_2 Low recall": results["rouge2"].low.recall,
+                "Rouge_2 Low F1": results["rouge2"].low.fmeasure,
+                "Rouge_2 Mid Precision": results["rouge2"].mid.precision,
+                "Rouge_2 Mid recall": results["rouge2"].mid.recall,
+                "Rouge_2 Mid F1": results["rouge2"].mid.fmeasure,
+                "Rouge_2 High Precision": results["rouge2"].high.precision,
+                "Rouge_2 High recall": results["rouge2"].high.recall,
+                "Rouge_2 High F1": results["rouge2"].high.fmeasure,
+            },
+            "Rouge L": {
+                "Rouge_L Low Precision": results["rougeL"].low.precision,
+                "Rouge_L Low recall": results["rougeL"].low.recall,
+                "Rouge_L Low F1": results["rougeL"].low.fmeasure,
+                "Rouge_L Mid Precision": results["rougeL"].mid.precision,
+                "Rouge_L Mid recall": results["rougeL"].mid.recall,
+                "Rouge_L Mid F1": results["rougeL"].mid.fmeasure,
+                "Rouge_L High Precision": results["rougeL"].high.precision,
+                "Rouge_L High recall": results["rougeL"].high.recall,
+                "Rouge_L High F1": results["rougeL"].high.fmeasure,
+            },
+            "rougeLsum": {
+                "rougeLsum Low Precision": results["rougeLsum"].low.precision,
+                "rougeLsum Low recall": results["rougeLsum"].low.recall,
+                "rougeLsum Low F1": results["rougeLsum"].low.fmeasure,
+                "rougeLsum Mid Precision": results["rougeLsum"].mid.precision,
+                "rougeLsum Mid recall": results["rougeLsum"].mid.recall,
+                "rougeLsum Mid F1": results["rougeLsum"].mid.fmeasure,
+                "rougeLsum High Precision": results["rougeLsum"].high.precision,
+                "rougeLsum High recall": results["rougeLsum"].high.recall,
+                "rougeLsum High F1": results["rougeLsum"].high.fmeasure,
+            },
         }
         return output
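One line worth unpacking from the hunks above: `labels[labels == 0] = -100` masks padding before the loss is computed. T5's pad token id is 0, and Hugging Face models compute cross-entropy with ignore_index -100, so padded label positions do not contribute to the loss. A self-contained illustration:

import torch

pad_token_id = 0  # T5's pad token id
labels = torch.tensor([42, 7, 0, 0])
labels[labels == pad_token_id] = -100  # positions the loss should skip
print(labels)  # tensor([  42,    7, -100, -100])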
src/models/predict_model.py CHANGED
@@ -11,14 +11,13 @@ def predict_model(text):
     with open("params.yml") as f:
         params = yaml.safe_load(f)
 
-
     model = Summarization()
-    model.load_model(model_type=params['model_type'], model_dir=params['model_dir'])
+    model.load_model(model_type=params["model_type"], model_dir=params["model_dir"])
     pre_summary = model.predict(text)
     return pre_summary
 
 
-if __name__ == '__main__':
-    text = pd.load_csv('data/processed/test.csv')['input_text'][0]
+if __name__ == "__main__":
+    text = pd.load_csv("data/processed/test.csv")["input_text"][0]
     pre_summary = predict_model(text)
     print(pre_summary)
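Note that `pd.load_csv`, present both before and after this commit, is not a pandas function; reading a CSV is `pd.read_csv`, as the other scripts use. A corrected sketch of the `__main__` block:

import pandas as pd

if __name__ == "__main__":
    # pandas exposes read_csv, not load_csv
    text = pd.read_csv("data/processed/test.csv")["input_text"][0]
    pre_summary = predict_model(text)
    print(pre_summary)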
src/models/train_model.py CHANGED
@@ -14,28 +14,35 @@ def train_model():
     params = yaml.safe_load(f)
 
     # Load the data
-    train_df = pd.read_csv('data/processed/train.csv')
-    eval_df = pd.read_csv('data/processed/validation.csv')
+    train_df = pd.read_csv("data/processed/train.csv")
+    eval_df = pd.read_csv("data/processed/validation.csv")
 
-    train_df = train_df.sample(frac=params['split'], replace=True, random_state=1)
-    eval_df = eval_df.sample(frac=params['split'], replace=True, random_state=1)
+    train_df = train_df.sample(frac=params["split"], replace=True, random_state=1)
+    eval_df = eval_df.sample(frac=params["split"], replace=True, random_state=1)
 
     model = Summarization()
-    model.from_pretrained(model_type=params['model_type'], model_name=params['model_name'])
-
-    model.train(train_df=train_df, eval_df=eval_df,
-                batch_size=params['batch_size'], max_epochs=params['epochs'],
-                use_gpu=params['use_gpu'], learning_rate=float(params['learning_rate']),
-                num_workers=int(params['num_workers']))
-
-    model.save_model(model_dir=params['model_dir'])
-
-    with open('wandb/latest-run/files/wandb-summary.json') as json_file:
+    model.from_pretrained(
+        model_type=params["model_type"], model_name=params["model_name"]
+    )
+
+    model.train(
+        train_df=train_df,
+        eval_df=eval_df,
+        batch_size=params["batch_size"],
+        max_epochs=params["epochs"],
+        use_gpu=params["use_gpu"],
+        learning_rate=float(params["learning_rate"]),
+        num_workers=int(params["num_workers"]),
+    )
+
+    model.save_model(model_dir=params["model_dir"])
+
+    with open("wandb/latest-run/files/wandb-summary.json") as json_file:
         data = json.load(json_file)
 
-    with open('reports/training_metrics.txt', 'w') as fp:
+    with open("reports/training_metrics.txt", "w") as fp:
         json.dump(data, fp)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     train_model()
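Taken together, the touched scripts imply the order make_dataset → process_data → train_model → evaluate_model. A hypothetical driver under that assumption (the `src.*` import paths mirror the file paths in this commit and are not part of the repository):

# Hypothetical end-to-end run; module paths are assumptions based on the
# file layout shown in this commit.
from src.data.make_dataset import make_dataset
from src.data.process_data import process_data
from src.models.train_model import train_model
from src.models.evaluate_model import evaluate_model

for split in ("train", "test", "validation"):
    make_dataset(dataset="cnn_dailymail", split=split)  # default dataset name
    process_data(split=split)

train_model()
evaluate_model()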