darpanaswal committed on
Commit
4fe7b26
verified · 1 parent: 00c8c7b

Upload 7 files

Files changed (7)
  1. download_model.py +37 -0
  2. finetune.py +190 -0
  3. llama.py +119 -0
  4. load_finetuned.py +61 -0
  5. main.py +233 -0
  6. metrics.py +22 -0
  7. requirements.txt +12 -0
download_model.py ADDED
@@ -0,0 +1,37 @@
+ import os
+ import torch
+ import huggingface_hub
+ from transformers import (AutoTokenizer,
+                           BitsAndBytesConfig,
+                           MBart50TokenizerFast,
+                           AutoModelForSeq2SeqLM,
+                           AutoModelForCausalLM,
+                           MBartForConditionalGeneration)
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ def download_model(model_name: str):
+     """Downloads the specified model and its tokenizer."""
+     if model_name == "mT5":
+         bnb_config = BitsAndBytesConfig(
+             load_in_4bit=True,
+             bnb_4bit_compute_dtype=torch.float16,
+             bnb_4bit_use_double_quant=True,
+         )
+         # device_map="auto" already places the quantized weights; calling
+         # .to(device) on a 4-bit model is unsupported, so it is omitted here.
+         model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-xl",
+                                                       quantization_config=bnb_config,
+                                                       device_map="auto")
+         tokenizer = AutoTokenizer.from_pretrained("google/mt5-xl")
+         return model, tokenizer
+     elif model_name == "mBART50":
+         model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50").to(device)
+         tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50", src_lang="en_XX", tgt_lang="en_XX")
+         return model, tokenizer
+     elif model_name == "Llama-3.2-1B-Instruct":
+         # Read the access token from the environment rather than hardcoding
+         # credentials in source (the original embedded an obfuscated token).
+         huggingface_hub.login(token=os.environ["HF_TOKEN"])
+         model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B-Instruct").to(device)
+         tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
+         return model, tokenizer
+     else:
+         raise ValueError("Invalid model name. Choose from 'mT5', 'mBART50', 'Llama-3.2-1B-Instruct'.")
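
A minimal usage sketch for this module (assumes a CUDA-capable machine, and an HF_TOKEN environment variable with access to the gated Llama repo if that branch is taken):

    from download_model import download_model

    model, tokenizer = download_model("mBART50")
    inputs = tokenizer("Summarize the text: The quick brown fox...", return_tensors="pt").to(model.device)
    summary_ids = model.generate(inputs.input_ids, max_length=128, num_beams=4, early_stopping=True)
    print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True)[0])
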
finetune.py ADDED
@@ -0,0 +1,190 @@
+ import os
+ import csv
+ import torch
+ import argparse
+ import numpy as np
+ import pandas as pd
+ import huggingface_hub
+ from datasets import Dataset
+ from download_model import download_model
+ from huggingface_hub import HfApi, HfFolder
+ from transformers import (AutoTokenizer, BitsAndBytesConfig, MBart50TokenizerFast,
+                           AutoModelForSeq2SeqLM, AutoModelForCausalLM, Trainer,
+                           MBartForConditionalGeneration, TrainingArguments,
+                           DataCollatorForSeq2Seq, EarlyStoppingCallback)
+ from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
+
+ # Get the absolute path of the current script
+ BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+
+ MODELS = {
+     "mT5": "mT5",
+     "mBART50": "mBART50"
+ }
+
+ def summarize_text_mt5(texts, model, tokenizer):
+     inputs = tokenizer(texts, return_tensors="pt",
+                        max_length=512, truncation=True,
+                        padding=True).to(model.device)
+     summary_ids = model.generate(inputs.input_ids,
+                                  max_length=128,
+                                  num_beams=4, length_penalty=2.0,
+                                  early_stopping=True)
+     summaries = tokenizer.batch_decode(summary_ids, skip_special_tokens=True)
+     return summaries
+
+ def summarize_text_mbart50(texts, model, tokenizer):
+     inputs = tokenizer(texts, return_tensors="pt",
+                        max_length=1024, truncation=True,
+                        padding=True).to(model.device)
+     summary_ids = model.generate(inputs.input_ids, max_length=128,
+                                  num_beams=4, length_penalty=2.0,
+                                  early_stopping=True)
+     summaries = tokenizer.batch_decode(summary_ids, skip_special_tokens=True)
+     return summaries
+
+ def experiments(model_name, finetune_type):
+     """Runs an experiment with the given model and dataset."""
+     # Read the access token from the environment rather than hardcoding
+     # credentials in source (the original embedded an obfuscated token).
+     huggingface_hub.login(token=os.environ["HF_TOKEN"])
+     print(f"Starting Experiment on {model_name}")
+
+     # Construct dataset paths dynamically
+     train = pd.read_csv(os.path.join(BASE_DIR, "datasets/train.csv"))
+     train_fr = pd.read_csv(os.path.join(BASE_DIR, "datasets/train_fr.csv"))
+     train_cross = pd.read_csv(os.path.join(BASE_DIR, "datasets/train_cross.csv"))
+     val = pd.read_csv(os.path.join(BASE_DIR, "datasets/val.csv"))
+     val_fr = pd.read_csv(os.path.join(BASE_DIR, "datasets/val_fr.csv"))
+     val_cross = pd.read_csv(os.path.join(BASE_DIR, "datasets/val_cross.csv"))
+     test = pd.read_csv(os.path.join(BASE_DIR, "datasets/test.csv"))
+     test_fr = pd.read_csv(os.path.join(BASE_DIR, "datasets/test_fr.csv"))
+     test_cross = pd.read_csv(os.path.join(BASE_DIR, "datasets/test_cross.csv"))
+
+     model, tokenizer = download_model(model_name)
+     print(f"Model {model_name} loaded successfully.")
+
+     if model_name == "mT5":
+         summarize_text = summarize_text_mt5
+     elif model_name == "mBART50":
+         summarize_text = summarize_text_mbart50
+
+     if finetune_type == "english":
+         fine_tune(model_name, "english", model, tokenizer, summarize_text, train, val)
+     elif finetune_type == "multilingual":
+         fine_tune(model_name, "multilingual", model, tokenizer, summarize_text, train_fr, val_fr)
+     else:
+         fine_tune(model_name, "crosslingual", model, tokenizer, summarize_text, train_cross, val_cross)
+
+
+ def fine_tune(model_name, finetune_type, model, tokenizer, summarize_text, train, val):
+     print("Starting Fine-tuning...")
+     if model_name == "mT5":
+         max_input = 512
+         max_output = 128
+     else:
+         max_input = 1024
+         max_output = 128
+
+     if finetune_type == "multilingual":
+         train_dataset = Dataset.from_pandas(train.sample(1200))
+         eval_dataset = Dataset.from_pandas(val.sample(150))
+     else:
+         train_dataset = Dataset.from_pandas(train.sample(1500))
+         eval_dataset = Dataset.from_pandas(val.sample(200))
+
+     def preprocess_function(examples):
+         inputs = [f"Summarize the text: {ex}" for ex in examples["source"]]
+         targets = [f"Summary: {ex}" for ex in examples["target"]]
+         model_inputs = tokenizer(inputs, max_length=max_input, truncation=True)
+
+         with tokenizer.as_target_tokenizer():
+             labels = tokenizer(targets, max_length=max_output, truncation=True)
+
+         model_inputs["labels"] = labels["input_ids"]
+
+         print("Input:", inputs[0])
+         print("Output:", targets[0])
+
+         return model_inputs
+
+     tokenized_train = train_dataset.map(preprocess_function, batched=True)
+     tokenized_eval = eval_dataset.map(preprocess_function, batched=True)
+
+     # Apply QLoRA only for mT5
+     if model_name == "mT5":
+         # PEFT configuration for quantized fine-tuning
+         lora_config = LoraConfig(
+             r=8,                # Rank of the LoRA update matrices
+             lora_alpha=32,      # Scaling factor for the LoRA updates
+             lora_dropout=0.05,  # Dropout probability on the LoRA layers
+             bias="none",        # Do not train bias parameters
+             task_type=TaskType.SEQ_2_SEQ_LM
+         )
+
+         # Prepare model for k-bit training and apply LoRA
+         model = prepare_model_for_kbit_training(model)
+         model = get_peft_model(model, lora_config)
+
+     # Use DataCollatorForSeq2Seq for dynamic padding
+     data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
+
+     training_args = TrainingArguments(
+         output_dir=f"./{model_name}-{finetune_type}-finetuned",
+         evaluation_strategy="epoch",
+         save_total_limit=1,
+         learning_rate=2e-5,
+         per_device_train_batch_size=4,
+         per_device_eval_batch_size=4,
+         num_train_epochs=3,
+         weight_decay=0.01,
+         push_to_hub=True,  # Automatically push at the end
+         fp16=True,
+         report_to="none",
+     )
+
+     trainer = Trainer(
+         model=model,
+         args=training_args,
+         train_dataset=tokenized_train,
+         eval_dataset=tokenized_eval,
+         data_collator=data_collator,
+     )
+
+     trainer.train()
+
+     # Save the tokenizer and push it to the Hub alongside the model
+     print("Saving model to Hugging Face Hub...")
+     tokenizer.save_pretrained(training_args.output_dir)
+     tokenizer.push_to_hub(f"{model_name}-{finetune_type}-finetuned")
+
+
+ def main():
+     parser = argparse.ArgumentParser(description="Run experiments with different models.")
+     parser.add_argument("--model", type=str, required=True, choices=list(MODELS.values()), help="The model to use.")
+     parser.add_argument("--finetune_type", type=str, required=True, choices=["english", "multilingual", "crosslingual"], help="The type of fine-tuning to apply.")
+     args = parser.parse_args()
+
+     experiments(args.model, args.finetune_type)
+
+ if __name__ == "__main__":
+     main()
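
Example invocations, using the flags defined in main() above (assumes the datasets/ folder is present and HF_TOKEN has write access to the Hub for push_to_hub):

    python finetune.py --model mT5 --finetune_type english
    python finetune.py --model mBART50 --finetune_type crosslingual
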
llama.py ADDED
@@ -0,0 +1,119 @@
+ import os
+ import csv
+ import torch
+ import argparse
+ import numpy as np
+ import pandas as pd
+ import huggingface_hub
+ from transformers import pipeline
+ from metrics import compute_scores, save_scores
+
+ # Get the absolute path of the current script
+ BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+
+ # Read the access token from the environment rather than hardcoding
+ # credentials in source (the original embedded an obfuscated token).
+ huggingface_hub.login(token=os.environ["HF_TOKEN"])
+
+ MODEL_ID = "meta-llama/Llama-3.2-1B-Instruct"
+ pipe = pipeline(
+     "text-generation",
+     model=MODEL_ID,
+     torch_dtype=torch.bfloat16,
+     device_map="auto",
+ )
+
+ EXPERIMENTS = ["zero-shot", "1-shot", "2-shot"]
+
+
+ def generate_summary(texts):
+     # Each prompt must be its own conversation (a list of messages); a flat
+     # list of dicts would be interpreted as a single multi-turn chat.
+     messages = [[{"role": "user", "content": text}] for text in texts]
+     outputs = pipe(messages, max_new_tokens=128)
+     return outputs
+
+ def run_experiment(experiment_type, num_examples):
+     print(f"Starting {experiment_type} Experiment with Llama-3.2-1B-Instruct")
+
+     test = pd.read_csv(os.path.join(BASE_DIR, "datasets/test.csv"))
+     test_fr = pd.read_csv(os.path.join(BASE_DIR, "datasets/test_fr.csv"))
+     test_cross = pd.read_csv(os.path.join(BASE_DIR, "datasets/test_cross.csv"))
+
+     test = test.sample(num_examples) if num_examples else test
+     test_fr = test_fr.sample(num_examples) if num_examples else test_fr
+     test_cross = test_cross.sample(num_examples) if num_examples else test_cross
+
+     if experiment_type == "zero-shot":
+         run_zero_shot(test, test_fr, test_cross)
+     elif experiment_type == "1-shot":
+         train = pd.read_csv(os.path.join(BASE_DIR, "datasets/train.csv"))
+         train_fr = pd.read_csv(os.path.join(BASE_DIR, "datasets/train_fr.csv"))
+         train_cross = pd.read_csv(os.path.join(BASE_DIR, "datasets/train_cross.csv"))
+         run_n_shot(test, test_fr, test_cross, train, train_fr, train_cross, shots=1)
+     elif experiment_type == "2-shot":
+         train = pd.read_csv(os.path.join(BASE_DIR, "datasets/train.csv"))
+         train_fr = pd.read_csv(os.path.join(BASE_DIR, "datasets/train_fr.csv"))
+         train_cross = pd.read_csv(os.path.join(BASE_DIR, "datasets/train_cross.csv"))
+         run_n_shot(test, test_fr, test_cross, train, train_fr, train_cross, shots=2)
+     else:
+         raise ValueError("Invalid experiment type.")
+
+
+ def run_zero_shot(test, test_fr, test_cross, batch_size=16):
+     print("Running Zero-Shot Evaluation...")
+     for dataset, name in [(test, "English"), (test_fr, "French"), (test_cross, "Cross-lingual")]:
+         prefix = "Summarize in English: " if name == "Cross-lingual" else "Summarize the text: "
+         texts = [f"{prefix}{row['source']}\n Summary: " for _, row in dataset.iterrows()]
+
+         reference_summaries = dataset["target"].tolist()
+
+         generated_summaries = []
+         for i in range(0, len(texts), batch_size):
+             batch_texts = texts[i:i + batch_size]
+             batch_outputs = generate_summary(batch_texts)
+             # With chat-formatted input, "generated_text" holds the full
+             # message list; the assistant's reply is the final message.
+             batch_summaries = [output[0]["generated_text"][-1]["content"] for output in batch_outputs]
+             generated_summaries.extend(batch_summaries)
+
+         scores = compute_scores(generated_summaries, reference_summaries)
+         save_scores(scores, "Llama-3.2-1B-Instruct", "zero-shot", name)
+         print(f"{name} Scores:", scores)
+
+
+ def run_n_shot(test, test_fr, test_cross, train, train_fr, train_cross, shots, batch_size=16):
+     print(f"Running {shots}-Shot Evaluation...")
+     for dataset, train_data, name in [(test, train, "English"), (test_fr, train_fr, "French"), (test_cross, train_cross, "Cross-lingual")]:
+         generated_summaries = []
+         reference_summaries = []
+
+         texts = []
+         for _, sample in dataset.iterrows():
+             shot_examples = train_data.sample(shots)
+             shot_prompt = "\n\n".join([f"Summarize the text: {row['source']}\n Summary: {row['target']}" for _, row in shot_examples.iterrows()])
+             prefix = "Summarize in English: " if name == "Cross-lingual" else "Summarize the text: "
+             prompt = f"{shot_prompt}\n\n{prefix}{sample['source']}\n Summary: "
+             texts.append(prompt)
+             reference_summaries.append(sample["target"])
+
+         for i in range(0, len(texts), batch_size):
+             batch_texts = texts[i:i + batch_size]
+             batch_outputs = generate_summary(batch_texts)
+             batch_summaries = [output[0]["generated_text"][-1]["content"] for output in batch_outputs]
+             generated_summaries.extend(batch_summaries)
+
+         scores = compute_scores(generated_summaries, reference_summaries)
+         save_scores(scores, "Llama-3.2-1B-Instruct", f"{shots}-shot", name)
+         print(f"{name} Scores:", scores)
+
+ def main():
+     parser = argparse.ArgumentParser(description="Run experiments with Llama-3.2-1B-Instruct.")
+     parser.add_argument("--experiment", type=str, required=True, choices=EXPERIMENTS, help="The experiment to run.")
+     parser.add_argument("--num_examples", type=int, default=None, help="Number of examples to generate summaries on (optional).")
+     args = parser.parse_args()
+
+     run_experiment(args.experiment, args.num_examples)
+
+
+ if __name__ == "__main__":
+     main()
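
Example invocations (a small --num_examples keeps the run cheap; HF_TOKEN must grant access to the gated Llama repo):

    python llama.py --experiment zero-shot --num_examples 50
    python llama.py --experiment 2-shot --num_examples 50
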
load_finetuned.py ADDED
@@ -0,0 +1,61 @@
+ import torch
+ from transformers import (
+     AutoModelForSeq2SeqLM, AutoTokenizer, BitsAndBytesConfig
+ )
+ from peft import PeftModel
+
+ # Defined before load_model so the module reads top-to-bottom.
+ MODEL_REPOS = {
+     "mT5": {
+         "english": "darpanaswal/mT5-english-finetuned",
+         "multilingual": "darpanaswal/mT5-multilingual-finetuned",
+         "crosslingual": "darpanaswal/mT5-crosslingual-finetuned",
+     },
+     "mBART50": {
+         "english": "darpanaswal/mBART50-english-finetuned",
+         "multilingual": "darpanaswal/mBART50-multilingual-finetuned",
+         "crosslingual": "darpanaswal/mBART50-crosslingual-finetuned",
+     },
+ }
+
+ def load_model(model_name, finetune_type):
+     """Loads a fine-tuned model from the Hugging Face repository based on its type."""
+     if model_name not in MODEL_REPOS:
+         raise ValueError(f"Invalid model name. Choose from: {list(MODEL_REPOS.keys())}")
+
+     if finetune_type not in MODEL_REPOS[model_name]:
+         raise ValueError(f"Invalid finetune type. Choose from: {list(MODEL_REPOS[model_name].keys())}")
+
+     repo_name = MODEL_REPOS[model_name][finetune_type]
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+
+     # Load tokenizer
+     tokenizer = AutoTokenizer.from_pretrained(repo_name)
+
+     if model_name == "mT5":  # 4-bit quantized + QLoRA fine-tuned
+         print(f"Loading {model_name} with {finetune_type} finetuning, 4-bit quantization, and QLoRA...")
+
+         # Load model with 4-bit quantization settings
+         quant_config = BitsAndBytesConfig(
+             load_in_4bit=True,
+             bnb_4bit_compute_dtype=torch.float16,
+             bnb_4bit_use_double_quant=True,
+             bnb_4bit_quant_type="nf4"
+         )
+
+         base_model_name = "google/mt5-xl"  # The base model the adapter was trained on
+         model = AutoModelForSeq2SeqLM.from_pretrained(base_model_name, quantization_config=quant_config, device_map="auto")
+
+         # Apply fine-tuned LoRA adapter
+         model = PeftModel.from_pretrained(model, repo_name)
+
+     elif model_name == "mBART50":  # Fully fine-tuned
+         print(f"Loading {model_name} with {finetune_type} fine-tuning...")
+
+         model = AutoModelForSeq2SeqLM.from_pretrained(repo_name)
+         model.to(device)
+
+     else:
+         raise ValueError(f"Unknown model: {model_name}")
+
+     print(f"{model_name} ({finetune_type}) loaded successfully!")
+     return model, tokenizer
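
A minimal usage sketch (repo names come from MODEL_REPOS above; assumes those Hub checkpoints are accessible):

    from load_finetuned import load_model

    model, tokenizer = load_model("mBART50", "english")
    inputs = tokenizer("Summarize the text: ...", return_tensors="pt").to(model.device)
    summary_ids = model.generate(inputs.input_ids, max_length=128, num_beams=4, early_stopping=True)
    print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True)[0])
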
main.py ADDED
@@ -0,0 +1,233 @@
+ import os
+ import csv
+ import torch
+ import argparse
+ import numpy as np
+ import pandas as pd
+ import huggingface_hub
+ from datasets import Dataset
+ from load_finetuned import load_model
+ from metrics import compute_scores, save_scores
+ from transformers import (AutoTokenizer, BitsAndBytesConfig, MBart50TokenizerFast,
+                           AutoModelForSeq2SeqLM, AutoModelForCausalLM, Trainer,
+                           MBartForConditionalGeneration, TrainingArguments,
+                           DataCollatorForSeq2Seq)
+ from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
+
+ # Get the absolute path of the current script
+ BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+
+ MODELS = {
+     "mT5": "mT5",
+     "mBART50": "mBART50"
+ }
+ LANGUAGE_CODES = {
+     "English": "en_XX",
+     "French": "fr_XX"
+ }
+ EXPERIMENTS = {
+     "mT5": ["zero-shot"],
+     "mBART50": ["zero-shot", "1-shot"]
+ }
+
+ def summarize_text_mt5(texts, model, tokenizer):
+     inputs = tokenizer(texts, return_tensors="pt",
+                        max_length=512, truncation=True,
+                        padding=True).to(model.device)
+     summary_ids = model.generate(input_ids=inputs.input_ids,
+                                  max_length=128,
+                                  num_beams=4, length_penalty=2.0,
+                                  early_stopping=True)
+     summaries = tokenizer.batch_decode(summary_ids, skip_special_tokens=True)
+     return summaries
+
+ def summarize_text_mbart50(texts, model, tokenizer):
+     inputs = tokenizer(texts, return_tensors="pt",
+                        max_length=1024, truncation=True,
+                        padding=True).to(model.device)
+     summary_ids = model.generate(inputs.input_ids, max_length=128,
+                                  num_beams=4, length_penalty=2.0,
+                                  early_stopping=True)
+     summaries = tokenizer.batch_decode(summary_ids, skip_special_tokens=True)
+     return summaries
+
+
+ def summarize_text_llama(texts, model, tokenizer):
+     tokenizer.pad_token = tokenizer.eos_token
+     inputs = tokenizer(texts, return_tensors="pt",
+                        max_length=1024, truncation=True,
+                        padding=True).to(model.device)
+
+     summary_ids = model.generate(
+         inputs.input_ids,
+         max_new_tokens=128,
+         temperature=0.7,
+         top_p=0.9,
+         num_beams=4,
+         length_penalty=2.0,
+         early_stopping=True
+     )
+
+     summaries = tokenizer.batch_decode(summary_ids, skip_special_tokens=True)
+     return summaries
+
+
+ def experiments(model_name, experiment_type, num_examples, finetune_type):
+     """Runs an experiment with the given model and dataset."""
+     print(f"Starting Experiment on {model_name}")
+
+     # Construct dataset paths dynamically
+     train = pd.read_csv(os.path.join(BASE_DIR, "datasets/train.csv"))
+     train_fr = pd.read_csv(os.path.join(BASE_DIR, "datasets/train_fr.csv"))
+     train_cross = pd.read_csv(os.path.join(BASE_DIR, "datasets/train_cross.csv"))
+     val = pd.read_csv(os.path.join(BASE_DIR, "datasets/val.csv"))
+     val_fr = pd.read_csv(os.path.join(BASE_DIR, "datasets/val_fr.csv"))
+     val_cross = pd.read_csv(os.path.join(BASE_DIR, "datasets/val_cross.csv"))
+     test = pd.read_csv(os.path.join(BASE_DIR, "datasets/test.csv"))
+     test_fr = pd.read_csv(os.path.join(BASE_DIR, "datasets/test_fr.csv"))
+     test_cross = pd.read_csv(os.path.join(BASE_DIR, "datasets/test_cross.csv"))
+
+     test = test.sample(num_examples) if num_examples else test
+     test_fr = test_fr.sample(num_examples) if num_examples else test_fr
+     test_cross = test_cross.sample(num_examples) if num_examples else test_cross
+
+     model, tokenizer = load_model(model_name, finetune_type)
+     print(f"Model {model_name} loaded successfully.")
+
+     if model_name == "mT5":
+         summarize_text = summarize_text_mt5
+     elif model_name == "mBART50":
+         summarize_text = summarize_text_mbart50
+
+     # Call the appropriate function based on experiment type
+     if experiment_type == "zero-shot":
+         run_zero_shot(model_name, model, tokenizer, summarize_text, test, test_fr, test_cross)
+     elif experiment_type == "1-shot":
+         run_1_shot(model_name, model, tokenizer, summarize_text, train, train_fr, train_cross, test, test_fr, test_cross)
+     elif experiment_type == "2-shot":
+         run_2_shot(model_name, model, tokenizer, summarize_text, train, train_fr, train_cross, test, test_fr, test_cross)
+     else:
+         raise ValueError("Invalid experiment type.")
+
+ def run_zero_shot(model_name, model, tokenizer, summarize_text, test, test_fr, test_cross, batch_size=16):
+     print("Running Zero-Shot Evaluation...")
+
+     for dataset, name in [(test, "English"), (test_fr, "French"), (test_cross, "Cross-lingual")]:
+         if model_name == "mBART50":
+             tokenizer.src_lang = "en_XX" if name == "English" else "fr_XX"
+         prefix = "Summarize in English: " if name == "Cross-lingual" else "Summarize the text: "
+         texts = [f"{prefix}{row['source']}\n Summary: " for _, row in dataset.iterrows()]
+         reference_summaries = dataset["target"].tolist()
+
+         # Process in batches
+         generated_summaries = []
+         for i in range(0, len(texts), batch_size):
+             batch_texts = texts[i:i + batch_size]
+             batch_summaries = summarize_text(batch_texts, model, tokenizer)
+             generated_summaries.extend(batch_summaries)
+
+         scores = compute_scores(generated_summaries, reference_summaries)
+         save_scores(scores, model_name, "zero-shot", name)
+         print(f"{name} Scores:", scores)
+
+
+ def run_1_shot(model_name, model, tokenizer, summarize_text, train, train_fr, train_cross, test, test_fr, test_cross, batch_size=16):
+     print("Running 1-Shot Evaluation...")
+
+     for dataset, train_data, name in [(test, train, "English"), (test_fr, train_fr, "French"), (test_cross, train_cross, "Cross-lingual")]:
+         if model_name == "mBART50":
+             tokenizer.src_lang = "en_XX" if name == "English" else "fr_XX"
+         generated_summaries = []
+         reference_summaries = []
+
+         texts = []
+         for _, sample in dataset.iterrows():
+             one_shot = train_data.sample(1)
+             source = one_shot["source"].iloc[0]
+             target = one_shot["target"].iloc[0]
+             prefix = "Summarize in English: " if name == "Cross-lingual" else "Summarize the text: "
+             prompt = (
+                 f"{prefix}{source}\n Summary: {target}\n\n"
+                 f"{prefix}{sample['source']}\n Summary: "
+             )
+             texts.append(prompt)
+             reference_summaries.append(sample["target"])
+
+         # Process in batches
+         for i in range(0, len(texts), batch_size):
+             batch_texts = texts[i:i + batch_size]
+             batch_summaries = summarize_text(batch_texts, model, tokenizer)
+             generated_summaries.extend(batch_summaries)
+
+         scores = compute_scores(generated_summaries, reference_summaries)
+         save_scores(scores, model_name, "1-shot", name)
+         print(f"{name} Scores:", scores)
+
+ def run_2_shot(model_name, model, tokenizer, summarize_text, train, train_fr, train_cross, test, test_fr, test_cross, batch_size=16):
+     print("Running 2-Shot Evaluation...")
+
+     for dataset, train_data, name in [(test, train, "English"), (test_fr, train_fr, "French"), (test_cross, train_cross, "Cross-lingual")]:
+         if model_name == "mBART50":
+             tokenizer.src_lang = "en_XX" if name == "English" else "fr_XX"
+         generated_summaries = []
+         reference_summaries = []
+
+         texts = []
+         for _, sample in dataset.iterrows():
+             two_shots = train_data.sample(2)
+             # two_shots.iloc[i] is a row (Series), so fields are indexed
+             # directly; the original's extra .iloc[0] calls would fail.
+             two_shot1, two_shot2 = two_shots.iloc[0], two_shots.iloc[1]
+             prefix = "Summarize in English: " if name == "Cross-lingual" else "Summarize the text: "
+             prompt = (
+                 f"{prefix}{two_shot1['source']}\n Summary: {two_shot1['target']}\n\n"
+                 f"{prefix}{two_shot2['source']}\n Summary: {two_shot2['target']}\n\n"
+                 f"{prefix}{sample['source']}\n Summary: "
+             )
+             texts.append(prompt)
+             reference_summaries.append(sample["target"])
+
+         # Process in batches
+         for i in range(0, len(texts), batch_size):
+             batch_texts = texts[i:i + batch_size]
+             batch_summaries = summarize_text(batch_texts, model, tokenizer)
+             generated_summaries.extend(batch_summaries)
+
+         scores = compute_scores(generated_summaries, reference_summaries)
+         save_scores(scores, model_name, "2-shot", name)
+         print(f"{name} Scores:", scores)
+
+
+ def main():
+     parser = argparse.ArgumentParser(description="Run experiments with different models.")
+     parser.add_argument("--model", type=str, required=True, choices=list(MODELS.values()), help="The model to use.")
+     parser.add_argument("--experiment", type=str, required=True, choices=sorted(set(sum(EXPERIMENTS.values(), []))), help="The experiment to run.")
+     parser.add_argument("--num_examples", type=int, default=None, help="Number of examples to generate summaries on (optional).")
+     parser.add_argument("--finetune_type", type=str, required=True, choices=["english", "multilingual", "crosslingual"], help="The type of fine-tuning that was applied.")
+     args = parser.parse_args()
+
+     experiments(args.model, args.experiment, args.num_examples, args.finetune_type)
+
+ if __name__ == "__main__":
+     main()
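
Example invocation, combining the flags defined in main() above (the checkpoint for the chosen --finetune_type must exist on the Hub):

    python main.py --model mBART50 --experiment zero-shot --finetune_type english --num_examples 100
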
metrics.py ADDED
@@ -0,0 +1,22 @@
+ import csv
+ import numpy as np
+ from rouge_score import rouge_scorer
+ from bert_score import score as bert_score
+
+ def compute_scores(predictions, references):
+     scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
+     scores = {"ROUGE-1": [], "ROUGE-2": [], "ROUGE-L": []}
+
+     for pred, ref in zip(predictions, references):
+         # RougeScorer.score expects (target, prediction), in that order.
+         rouge_scores = scorer.score(ref, pred)
+         scores["ROUGE-1"].append(rouge_scores["rouge1"].fmeasure)
+         scores["ROUGE-2"].append(rouge_scores["rouge2"].fmeasure)
+         scores["ROUGE-L"].append(rouge_scores["rougeL"].fmeasure)
+
+     return {key: np.mean(value) for key, value in scores.items()}
+
+ def save_scores(scores, model_name, experiment_type, dataset_name):
+     # Appends one row per (model, experiment, dataset) evaluation.
+     with open("rouge_results.csv", mode="a", newline="") as file:
+         writer = csv.writer(file)
+         writer.writerow([model_name, experiment_type, dataset_name, scores["ROUGE-1"], scores["ROUGE-2"], scores["ROUGE-L"]])
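
A quick sanity check of the metrics helpers (the strings here are illustrative only):

    from metrics import compute_scores, save_scores

    scores = compute_scores(["the cat sat on the mat"], ["a cat sat on a mat"])
    print(scores)  # mean ROUGE-1/2/L F1 over the batch
    save_scores(scores, "mT5", "zero-shot", "English")  # appends a row to rouge_results.csv
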
requirements.txt ADDED
@@ -0,0 +1,12 @@
+ peft
+ numpy
+ torch
+ pandas
+ seaborn
+ datasets
+ bert_score
+ matplotlib
+ accelerate
+ rouge_score
+ bitsandbytes
+ transformers
+ sentencepiece  # likely needed by the mT5 and mBART50 tokenizers