AyushSoni14 committed on
Commit
39623cd
·
verified ·
1 Parent(s): 2d67ec4

Upload train_model.py

Browse files
Files changed (1) hide show
  1. train_model.py +67 -0
train_model.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_dataset
2
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
3
+ from transformers import TrainingArguments, Trainer
4
+ import os
5
+ import torch
6
+
7
# Dataset: load the "knkarthick/dialogsum" dataset by its identifier.
ds = load_dataset(path="knkarthick/dialogsum")

# Tokenizer and seq2seq model are both taken from the same pretrained
# checkpoint so that token ids line up with the model's vocabulary.
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
13
+
14
+ # Preprocessing function
15
def preprocess_function(batch):
    """Tokenize a batch of dialogues and summaries into model inputs.

    Args:
        batch: Mapping with ``'dialogue'`` and ``'summary'`` entries; both
            are passed as-is to the module-level ``tokenizer``.

    Returns:
        A dict with ``'input_ids'`` and ``'attention_mask'`` taken from the
        tokenized dialogues, and ``'labels'`` built from the tokenized
        summaries with every pad-token id replaced by ``-100``.
    """
    dialogues = batch['dialogue']
    summaries = batch['summary']

    # Both sides are padded/truncated to the same fixed length of 128 tokens.
    encoded_dialogues = tokenizer(
        dialogues, padding='max_length', truncation=True, max_length=128
    )
    encoded_summaries = tokenizer(
        summaries, padding='max_length', truncation=True, max_length=128
    )

    # Swap padding ids for -100 in the labels. NOTE(review): presumably so
    # the loss skips padded positions (-100 is the default ``ignore_index``
    # of torch's CrossEntropyLoss) -- confirm against the model's loss.
    pad_id = tokenizer.pad_token_id
    masked_labels = []
    for summary_ids in encoded_summaries['input_ids']:
        masked_labels.append(
            [-100 if token_id == pad_id else token_id for token_id in summary_ids]
        )

    return {
        'input_ids': encoded_dialogues['input_ids'],
        'attention_mask': encoded_dialogues['attention_mask'],
        'labels': masked_labels,
    }
27
+
28
# Run preprocess_function over the dataset; batched=True means the function
# receives lists of examples rather than one example at a time.
df_source = ds.map(function=preprocess_function, batched=True)

# Configuration for the fine-tuning run.
training_args = TrainingArguments(
    # Output locations for checkpoints and logs.
    output_dir="/content/TextSummarizer_output",
    logging_dir="/content/logs",
    # Training schedule.
    num_train_epochs=2,
    per_device_train_batch_size=8,
    # Checkpoint at the end of each epoch, keeping at most one on disk.
    save_strategy="epoch",
    save_total_limit=1,
    # Emit a log entry every 50 steps.
    logging_steps=50,
    # Let the Trainer drop dataset columns the model does not consume.
    remove_unused_columns=True,
)

# Wire the model, configuration and tokenized splits together.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=df_source["test"],
    train_dataset=df_source["train"],
)
50
+
51
# Fine-tune the model on the training split handed to the Trainer.
trainer.train()

# Compute metrics on the Trainer's eval_dataset and print them.
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)
57
+
58
# Destination directory (a Google Drive path); create it if it is missing.
save_path = "/content/drive/MyDrive/TextSummarizer2/model_directory"
os.makedirs(save_path, exist_ok=True)

# Write the model weights (safetensors format) followed by the tokenizer
# files into the same directory.
model.save_pretrained(
    save_path,
    safe_serialization=True,
)
tokenizer.save_pretrained(save_path)

# Report where everything went and what ended up on disk.
print(f"✅ Model and tokenizer saved to: {save_path}")
print("📦 Files saved:", os.listdir(save_path))