remiai3 committed
Commit fae0b89 · verified · 1 Parent(s): 383ca4f

Update all_in_one.py

Files changed (1)
  1. all_in_one.py +114 -114
all_in_one.py CHANGED
@@ -1,115 +1,115 @@
- import os
- import torch
- from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling, TrainerCallback
- from datasets import load_from_disk
- import matplotlib.pyplot as plt
-
- # Set Hugging Face token (replace with your actual token)
- os.environ["HF_TOKEN"] = "hf_XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX" # Replace with your HF_TOKEN
-
- # Download model and tokenizer
- model_name = "Salesforce/codegen-350M-multi"
- local_model_path = "./codegen_model"
- tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=local_model_path)
- model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, cache_dir=local_model_path)
-
- # Set padding token
- tokenizer.pad_token = tokenizer.eos_token
-
- # Move model to CPU
- device = torch.device("cpu")
- model.to(device)
-
- # Load custom dataset
- dataset_path = "./custom_dataset"
- dataset = load_from_disk(dataset_path)
-
- # Tokenize dataset
- def tokenize_function(examples):
-     inputs = [f"{prompt}\n{code}" for prompt, code in zip(examples["prompt"], examples["code"])]
-     return tokenizer(inputs, truncation=True, padding="max_length", max_length=128)
-
- tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["prompt", "code"])
-
- # Data collator for language modeling
- data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
-
- # Define training arguments
- training_args = TrainingArguments(
-     output_dir="./finetuned_codegen",
-     overwrite_output_dir=True,
-     num_train_epochs=5, # Increased epochs for better fine-tuning
-     per_device_train_batch_size=1,
-     gradient_accumulation_steps=4,
-     save_steps=500,
-     save_total_limit=2,
-     logging_steps=100,
-     learning_rate=5e-5,
-     fp16=False,
-     no_cuda=True,
-     dataloader_pin_memory=False,
- )
-
- # Custom callback to store training loss
- class LossCallback(TrainerCallback):
-     def __init__(self):
-         self.losses = []
-
-     def on_log(self, args, state, control, logs=None, **kwargs):
-         if logs and "loss" in logs:
-             self.losses.append(logs["loss"])
-
- loss_callback = LossCallback()
-
- # Initialize Trainer
- trainer = Trainer(
-     model=model,
-     args=training_args,
-     train_dataset=tokenized_dataset,
-     data_collator=data_collator,
-     callbacks=[loss_callback],
- )
-
- # Start fine-tuning
- print("Starting fine-tuning...")
- trainer.train()
-
- # Save fine-tuned model
- model.save_pretrained("./finetuned_codegen")
- tokenizer.save_pretrained("./finetuned_codegen")
-
- # Plot training loss
- plt.plot(loss_callback.losses, label="Training Loss")
- plt.xlabel("Steps")
- plt.ylabel("Loss")
- plt.title("Fine-Tuning Loss Curve")
- plt.legend()
- plt.savefig("./finetuned_codegen/loss_plot.png")
- plt.show()
-
- print("Fine-tuning completed. Model saved to ./finetuned_codegen. Loss plot saved to ./finetuned_codegen/loss_plot.png")
-
- # Test fine-tuned model
- print("\nTesting fine-tuned model...")
- prompts = [
-     "Write a Python program to print 'Hello, World!'",
-     "Write a Python function to add two numbers.",
-     "Write a Python function to subtract two numbers.",
-     "Write a Python function to calculate factorial of a number",
-     "Write a Python function to check if a number is prime",
-     "Write a Python function to reverse a string"
- ]
-
- for prompt in prompts:
-     inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
-     outputs = model.generate(
-         **inputs,
-         max_length=200,
-         num_return_sequences=1,
-         pad_token_id=tokenizer.eos_token_id,
-         do_sample=True,
-         temperature=0.7,
-         top_p=0.9
-     )
-     generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)
+ import os
+ import torch
+ import json
+ from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling, TrainerCallback
+ from datasets import Dataset
+ import matplotlib.pyplot as plt
+
+ # Set Hugging Face token (replace with your actual token)
+ os.environ["HF_TOKEN"] = "hf_XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX" # Replace with your HF_TOKEN
+
+ # Download model and tokenizer
+ model_name = "Salesforce/codegen-350M-multi"
+ local_model_path = "./codegen_model"
+ tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=local_model_path)
+ model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32, cache_dir=local_model_path)
+
+ # Set padding token
+ tokenizer.pad_token = tokenizer.eos_token
+
+ # Move model to CPU
+ device = torch.device("cpu")
+ model.to(device)
+
+ # Load custom dataset from JSONL
+ dataset_path = "./custom_dataset.jsonl"
+ data = []
+ with open(dataset_path, 'r', encoding='utf-8') as f:
+     for line in f:
+         data.append(json.loads(line.strip()))
+ dataset = Dataset.from_list(data)
+
+ # Tokenize dataset
+ def tokenize_function(examples):
+     inputs = [f"{prompt}\n{code}" for prompt, code in zip(examples["prompt"], examples["code"])]
+     return tokenizer(inputs, truncation=True, padding="max_length", max_length=128)
+
+ tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["prompt", "code"])
+
+ # Data collator for language modeling
+ data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+
+ # Define training arguments
+ training_args = TrainingArguments(
+     output_dir="./finetuned_codegen",
+     overwrite_output_dir=True,
+     num_train_epochs=3,
+     per_device_train_batch_size=1,
+     gradient_accumulation_steps=4,
+     save_steps=500,
+     save_total_limit=2,
+     logging_steps=100,
+     learning_rate=5e-5,
+     fp16=False,
+     no_cuda=True,
+     dataloader_pin_memory=False,
+ )
+
+ # Custom callback to store training loss
+ class LossCallback(TrainerCallback):
+     def __init__(self):
+         self.losses = []
+
+     def on_log(self, args, state, control, logs=None, **kwargs):
+         if logs and "loss" in logs:
+             self.losses.append(logs["loss"])
+
+ loss_callback = LossCallback()
+
+ # Initialize Trainer
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=tokenized_dataset,
+     data_collator=data_collator,
+     callbacks=[loss_callback],
+ )
+
+ # Start fine-tuning
+ print("Starting fine-tuning...")
+ trainer.train()
+
+ # Save fine-tuned model
+ model.save_pretrained("./finetuned_codegen")
+ tokenizer.save_pretrained("./finetuned_codegen")
+
+ # Plot training loss
+ plt.plot(loss_callback.losses, label="Training Loss")
+ plt.xlabel("Steps")
+ plt.ylabel("Loss")
+ plt.title("Fine-Tuning Loss Curve")
+ plt.legend()
+ plt.savefig("./finetuned_codegen/loss_plot.png")
+ plt.show()
+
+ print("Fine-tuning completed. Model saved to ./finetuned_codegen. Loss plot saved to ./finetuned_codegen/loss_plot.png")
+
+ # Test fine-tuned model
+ print("\nTesting fine-tuned model...")
+ prompts = [
+     "Write a Python program to print 'Hello, World!'"
+ ]
+
+ for prompt in prompts:
+     inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
+     outputs = model.generate(
+         **inputs,
+         max_length=200,
+         num_return_sequences=1,
+         pad_token_id=tokenizer.eos_token_id,
+         do_sample=True,
+         temperature=0.7,
+         top_p=0.9
+     )
+     generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)
      print(f"Prompt: {prompt}\nGenerated Code:\n{generated_code}\n{'-'*50}")
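For reference, the updated loader expects ./custom_dataset.jsonl to contain one JSON object per line with "prompt" and "code" fields, since those are the columns tokenize_function reads and dataset.map removes. A minimal sketch of producing such a file; the two sample records are hypothetical:

import json

# Hypothetical training records; the real dataset supplies its own pairs.
records = [
    {"prompt": "Write a Python program to print 'Hello, World!'",
     "code": "print('Hello, World!')"},
    {"prompt": "Write a Python function to add two numbers.",
     "code": "def add(a, b):\n    return a + b"},
]

# Write one JSON object per line, matching what the script's loader parses.
with open("custom_dataset.jsonl", "w", encoding="utf-8") as f:
    for record in records:
        f.write(json.dumps(record) + "\n")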
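Once training completes, the checkpoint saved to ./finetuned_codegen can be reloaded for inference with the standard from_pretrained calls. A minimal sketch, with an illustrative prompt:

from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the fine-tuned weights and tokenizer from the script's output directory.
tokenizer = AutoTokenizer.from_pretrained("./finetuned_codegen")
model = AutoModelForCausalLM.from_pretrained("./finetuned_codegen")

inputs = tokenizer("Write a Python function to add two numbers.", return_tensors="pt")
outputs = model.generate(
    **inputs,
    max_length=200,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    pad_token_id=tokenizer.eos_token_id,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))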