kalekarnn committed
Commit 24a6b10 · verified · 1 Parent(s): dc52bd5

Upload 3 files

Files changed (3):
  1. app.py +143 -0
  2. requirements.txt +12 -0
  3. train.py +143 -0
app.py ADDED
@@ -0,0 +1,143 @@
from datasets import load_dataset, Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig, prepare_model_for_kbit_training
import torch

# Configure 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# Load the base model with quantization
model_name = "microsoft/phi-2"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)
model.config.use_cache = False

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

# Prepare model for k-bit training
model = prepare_model_for_kbit_training(model)

# Configure LoRA
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "dense"]
)

# Load and preprocess dataset
ds = load_dataset("OpenAssistant/oasst1")
train_dataset = ds["train"]

def format_conversation(example):
    """Format a conversation thread for instruction fine-tuning."""
    # Only process root messages (start of conversations)
    if example["role"] == "prompter" and example["parent_id"] is None:
        conversation = []
        current_msg = example
        conversation.append(("Human", current_msg["text"]))

        # Follow the conversation thread via the module-level message_children map
        current_id = current_msg["message_id"]
        while current_id in message_children:
            # Get the next message in the conversation
            next_msg = message_children[current_id]
            if next_msg["role"] == "assistant":
                conversation.append(("Assistant", next_msg["text"]))
            elif next_msg["role"] == "prompter":
                conversation.append(("Human", next_msg["text"]))
            current_id = next_msg["message_id"]

        if len(conversation) >= 2:  # at least one Human -> Assistant exchange
            formatted_text = ""
            for speaker, text in conversation:
                formatted_text += f"{speaker}: {text}\n\n"
            return {"text": formatted_text.strip()}
    return {"text": None}

# Build message relationships; keeping only the last reply seen per parent
# reduces each conversation tree to a single path
print("Building conversation threads...")
message_children = {}
for example in train_dataset:
    if example["parent_id"] is not None:
        message_children[example["parent_id"]] = example

# Format complete conversations
print("\nFormatting conversations...")
processed_dataset = []
for example in train_dataset:
    result = format_conversation(example)
    if result["text"] is not None:
        processed_dataset.append(result)
    if len(processed_dataset) % 100 == 0 and len(processed_dataset) > 0:
        print(f"Found {len(processed_dataset)} valid conversations")

print(f"Final dataset size: {len(processed_dataset)} conversations")

# Convert the formatted conversations to a Dataset for training
train_dataset = Dataset.from_list(processed_dataset)

# Configure SFT parameters
sft_config = SFTConfig(
    output_dir="phi2-finetuned",
    num_train_epochs=1,
    max_steps=500,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    learning_rate=2e-4,
    weight_decay=0.001,
    logging_steps=1,
    logging_strategy="steps",
    save_strategy="steps",
    save_steps=100,
    save_total_limit=3,
    push_to_hub=False,
    max_seq_length=512,
    report_to="none",
)

# Initialize trainer
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    peft_config=peft_config,
    args=sft_config,
)

# Train the model
trainer.train()

# Save the trained LoRA adapter in Hugging Face format
trainer.save_model("phi2-finetuned-final")

# Also save a raw PyTorch checkpoint
model_save_path = "phi2-finetuned-final/model.pt"
torch.save({
    'model_state_dict': trainer.model.state_dict(),
    'config': trainer.model.config,
    'peft_config': peft_config,
}, model_save_path)
print(f"Model saved in PyTorch format at: {model_save_path}")
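gradio is listed in requirements.txt, so a minimal inference UI for the adapter saved above might look like the sketch below. This is an illustrative assumption rather than part of this commit; it only reuses the "phi2-finetuned-final" save path and the Human/Assistant prompt format from the training script.

# Hypothetical Gradio demo (not part of this commit): serves the saved LoRA adapter.
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-2", torch_dtype=torch.float16, device_map="auto", trust_remote_code=True
)
model = PeftModel.from_pretrained(base, "phi2-finetuned-final")
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

def chat(prompt):
    # Use the same Human/Assistant format the model was fine-tuned on
    inputs = tokenizer(f"Human: {prompt}\n\nAssistant:", return_tensors="pt").to(model.device)
    output = model.generate(**inputs, max_new_tokens=256, do_sample=True, temperature=0.7)
    # Return only the newly generated tokens
    return tokenizer.decode(output[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)

gr.Interface(fn=chat, inputs="text", outputs="text", title="phi-2 fine-tuned on OASST1").launch()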
requirements.txt ADDED
@@ -0,0 +1,12 @@
transformers>=4.34.0
datasets>=2.14.0
peft>=0.5.0
bitsandbytes>=0.41.1
accelerate>=0.23.0
torch>=2.0.0
trl
gradio
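All of the above can be installed in one step with pip install -r requirements.txt; trl and gradio are left unpinned, so pip resolves them to the latest compatible releases.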
train.py ADDED
@@ -0,0 +1,143 @@
from datasets import load_dataset, Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig, prepare_model_for_kbit_training
import torch

# Configure 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# Load the base model with quantization
model_name = "microsoft/phi-2"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)
model.config.use_cache = False

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

# Prepare model for k-bit training
model = prepare_model_for_kbit_training(model)

# Configure LoRA
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "dense"]
)

# Load and preprocess dataset
ds = load_dataset("OpenAssistant/oasst1")
train_dataset = ds["train"]

def format_conversation(example):
    """Format a conversation thread for instruction fine-tuning."""
    # Only process root messages (start of conversations)
    if example["role"] == "prompter" and example["parent_id"] is None:
        conversation = []
        current_msg = example
        conversation.append(("Human", current_msg["text"]))

        # Follow the conversation thread via the module-level message_children map
        current_id = current_msg["message_id"]
        while current_id in message_children:
            # Get the next message in the conversation
            next_msg = message_children[current_id]
            if next_msg["role"] == "assistant":
                conversation.append(("Assistant", next_msg["text"]))
            elif next_msg["role"] == "prompter":
                conversation.append(("Human", next_msg["text"]))
            current_id = next_msg["message_id"]

        if len(conversation) >= 2:  # at least one Human -> Assistant exchange
            formatted_text = ""
            for speaker, text in conversation:
                formatted_text += f"{speaker}: {text}\n\n"
            return {"text": formatted_text.strip()}
    return {"text": None}

# Build message relationships; keeping only the last reply seen per parent
# reduces each conversation tree to a single path
print("Building conversation threads...")
message_children = {}
for example in train_dataset:
    if example["parent_id"] is not None:
        message_children[example["parent_id"]] = example

# Format complete conversations
print("\nFormatting conversations...")
processed_dataset = []
for example in train_dataset:
    result = format_conversation(example)
    if result["text"] is not None:
        processed_dataset.append(result)
    if len(processed_dataset) % 100 == 0 and len(processed_dataset) > 0:
        print(f"Found {len(processed_dataset)} valid conversations")

print(f"Final dataset size: {len(processed_dataset)} conversations")

# Convert the formatted conversations to a Dataset for training
train_dataset = Dataset.from_list(processed_dataset)

# Configure SFT parameters
sft_config = SFTConfig(
    output_dir="phi2-finetuned",
    num_train_epochs=1,
    max_steps=500,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    learning_rate=2e-4,
    weight_decay=0.001,
    logging_steps=1,
    logging_strategy="steps",
    save_strategy="steps",
    save_steps=100,
    save_total_limit=3,
    push_to_hub=False,
    max_seq_length=512,
    report_to="none",
)

# Initialize trainer
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    peft_config=peft_config,
    args=sft_config,
)

# Train the model
trainer.train()

# Save the trained LoRA adapter in Hugging Face format
trainer.save_model("phi2-finetuned-final")

# Also save a raw PyTorch checkpoint
model_save_path = "phi2-finetuned-final/model.pt"
torch.save({
    'model_state_dict': trainer.model.state_dict(),
    'config': trainer.model.config,
    'peft_config': peft_config,
}, model_save_path)
print(f"Model saved in PyTorch format at: {model_save_path}")
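Because the model is wrapped with a LoRA config, trainer.save_model writes the adapter weights rather than a merged full model to phi2-finetuned-final. Below is a minimal sketch, assuming the base model is re-downloaded in fp16, of how that adapter could be reloaded and folded into the base weights for standalone deployment; the output directory name is hypothetical.

# Sketch (assumption): reload the saved adapter and merge it into the base weights.
import torch
from transformers import AutoModelForCausalLM
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-2", torch_dtype=torch.float16, trust_remote_code=True
)
model = PeftModel.from_pretrained(base, "phi2-finetuned-final")
merged = model.merge_and_unload()  # folds the LoRA deltas into the base weights
merged.save_pretrained("phi2-finetuned-merged")  # hypothetical output directory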