vsagar100 committed · verified
Commit ac0aa1f · 1 Parent(s): 0df4cd0

Update new_review_code.py

Files changed (1):
  1. new_review_code.py +67 -58
new_review_code.py CHANGED
@@ -1,74 +1,83 @@
- import os
  import torch
- 
- from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
+ from transformers import (
+     AutoModelForCausalLM,
+     AutoTokenizer,
+     TrainingArguments,
+     Trainer,
+     DataCollatorForLanguageModeling
+ )
  from datasets import load_dataset
- from trl import SFTTrainer
- from peft import AutoPeftModelForCausalLM, LoraConfig, get_peft_model, prepare_model_for_kbit_training
- from utils import find_all_linear_names, print_trainable_parameters
+ import pandas as pd
 
- output_dir = "./results"
- model_name = "codellama/CodeLlama-7b-hf"
+ # Check GPU availability
+ print("CUDA Available:", torch.cuda.is_available())
+ print("Current Device:", torch.cuda.current_device())
+ print("Device Name:", torch.cuda.get_device_name(0))
 
- dataset = load_dataset('timdettmers/openassistant-guanaco', split="train")
+ # Load and prepare dataset
+ def load_custom_dataset(file_path):
+     # Read CSV
+     df = pd.read_csv(file_path)
+ 
+     # Ensure 'text' column exists
+     if 'text' not in df.columns:
+         raise ValueError("CSV must have a 'text' column")
+ 
+     # Convert to Hugging Face dataset
+     dataset = load_dataset('csv', data_files=file_path, split='train')
+     return dataset
 
- bnb_config = BitsAndBytesConfig(
-     load_in_4bit=True,
-     bnb_4bit_quant_type="nf4",
-     bnb_4bit_compute_dtype=torch.bfloat16,
-     bnb_4bit_use_double_quant=True,
+ # Model and Tokenizer Setup
+ model_name = "codellama/CodeLlama-7b-hf"
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ model = AutoModelForCausalLM.from_pretrained(
+     model_name,
+     torch_dtype=torch.float16,  # Use float16 for memory efficiency
+     device_map="auto"  # Automatic device mapping
  )
 
- base_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, quantization_config=bnb_config)
- base_model.config.use_cache = False
- base_model = prepare_model_for_kbit_training(base_model)
+ # Tokenization function
+ def tokenize_function(examples):
+     return tokenizer(examples['text'], truncation=True, max_length=1024)
 
- tokenizer = AutoTokenizer.from_pretrained(model_name)
- tokenizer.pad_token = tokenizer.eos_token
- tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training
+ # Prepare dataset
+ dataset = load_custom_dataset('instructions.csv')
+ tokenized_dataset = dataset.map(tokenize_function, batched=True)
 
- # Change the LORA hyperparameters accordingly to fit your use case
- peft_config = LoraConfig(
-     r=32,
-     lora_alpha=16,
-     target_modules=find_all_linear_names(base_model),
-     lora_dropout=0.05,
-     bias="none",
-     task_type="CAUSAL_LM",
+ # Training Arguments
+ training_args = TrainingArguments(
+     output_dir="./ansible-review-model",
+     overwrite_output_dir=True,
+     num_train_epochs=4,
+     per_device_train_batch_size=2,
+     save_steps=10_000,
+     save_total_limit=2,
+     prediction_loss_only=True,
+     learning_rate=2e-4,
+     warmup_ratio=0.1,
+     fp16=True,  # Use mixed precision
+     logging_dir='./logs',
  )
 
- base_model = get_peft_model(base_model, peft_config)
- print_trainable_parameters(base_model)
- 
- # Parameters for training arguments details => https://github.com/huggingface/transformers/blob/main/src/transformers/training_args.py#L158
- training_args = TrainingArguments(
-     per_device_train_batch_size=1,
-     gradient_accumulation_steps=1,
-     gradient_checkpointing=True,
-     max_grad_norm=0.3,
-     num_train_epochs=3,
-     learning_rate=1e-4,
-     bf16=True,
-     save_total_limit=3,
-     logging_steps=300,
-     output_dir=output_dir,
-     optim="paged_adamw_32bit",
-     lr_scheduler_type="constant",
-     warmup_ratio=0.05,
+ # Data Collator
+ data_collator = DataCollatorForLanguageModeling(
+     tokenizer=tokenizer,
+     mlm=False  # For causal language modeling
  )
 
- trainer = SFTTrainer(
-     base_model,
-     train_dataset=dataset,
-     dataset_text_field="text",
-     tokenizer=tokenizer,
-     max_seq_length=512,
-     args=training_args
+ # Trainer
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=tokenized_dataset,
+     data_collator=data_collator,
  )
 
- trainer.train()
- trainer.save_model(output_dir)
+ # Start Training
+ trainer.train()
+ 
+ # Save Model and Tokenizer
+ trainer.save_model("./ansible-review-model")
+ tokenizer.save_pretrained("./ansible-review-model")
 
- output_dir = os.path.join(output_dir, "final_checkpoint")
- trainer.model.save_pretrained(output_dir)
- tokenizer.save_pretrained(output_dir)
+ print("Training Complete!")
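Note on the training data: the added script reads instructions.csv and only requires a text column; the file itself is not part of this commit. Below is a minimal sketch of a compatible CSV, with hypothetical example rows and a made-up prompt layout (the real data format may differ):

import pandas as pd

# Hypothetical rows -- the real instructions.csv is not included in this commit.
# load_custom_dataset() only checks for a 'text' column.
rows = [
    {"text": "### Playbook:\n- name: install nginx\n  yum: name=nginx state=latest\n### Review:\nPin the package version instead of 'latest'."},
    {"text": "### Playbook:\n- name: copy config\n  copy: src=app.conf dest=/etc/app.conf\n### Review:\nSet mode/owner and notify a restart handler."},
]
pd.DataFrame(rows).to_csv("instructions.csv", index=False)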
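Note on padding: the removed version set tokenizer.pad_token = tokenizer.eos_token, and the CodeLlama tokenizer does not ship with a pad token, while DataCollatorForLanguageModeling pads each batch. If batching fails for that reason, a minimal fix (reusing the EOS token, as the old script did) would be:

# Assumption: reuse the EOS token for padding, mirroring the removed script.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token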
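Usage sketch: the script saves the fine-tuned model and tokenizer to ./ansible-review-model, so they can be reloaded for inference with the standard transformers API. The prompt below is a placeholder, not something defined in this commit:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_dir = "./ansible-review-model"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForCausalLM.from_pretrained(model_dir, torch_dtype=torch.float16, device_map="auto")

prompt = "### Playbook:\n- name: install nginx\n  yum: name=nginx state=latest\n### Review:\n"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=200)
print(tokenizer.decode(output[0], skip_special_tokens=True))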