Spaces:

kalekarnn
/

fine-tuned-phi-2-model

Sleeping

App Files Files Community

kalekarnn committed on Mar 22

Commit

24a6b10

verified ·

1 Parent(s): dc52bd5

Upload 3 files

Browse files

Files changed (3) hide show

app.py +143 -0
requirements.txt +12 -0
train.py +143 -0

app.py ADDED Viewed

	@@ -0,0 +1,143 @@

+from datasets import load_dataset, Dataset
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+from transformers import TrainingArguments
+from trl import SFTTrainer, SFTConfig
+from peft import LoraConfig, prepare_model_for_kbit_training
+import torch
+# Configure 4-bit quantization (NF4, double quantization, float16 compute) used when loading the base model
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.float16,
+    bnb_4bit_use_double_quant=True,
+)
+# Load the base model with the quantization config defined above
+model_name = "microsoft/phi-2"
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    quantization_config=bnb_config,
+    device_map="auto",
+    trust_remote_code=True
+)
+model.config.use_cache = False  # presumably disabled because the cache only helps generation, not training - TODO confirm
+# Load the tokenizer for the same checkpoint and reuse its EOS token as the padding token
+tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+tokenizer.pad_token = tokenizer.eos_token  # NOTE(review): this tokenizer is never passed to SFTTrainer in this script - confirm the pad token takes effect
+# Prepare the quantized model for k-bit training
+model = prepare_model_for_kbit_training(model)
+# Configure the LoRA adapter (rank 16, alpha 32, dropout 0.05, bias "none") for causal language modelling
+peft_config = LoraConfig(
+    r=16,
+    lora_alpha=32,
+    lora_dropout=0.05,
+    bias="none",
+    task_type="CAUSAL_LM",
+    target_modules=["q_proj", "k_proj", "v_proj", "dense"]  # NOTE(review): must match module names of the loaded phi-2 implementation - verify
+)
+# Load the OpenAssistant dataset and keep its train split for preprocessing
+ds = load_dataset("OpenAssistant/oasst1")
+train_dataset = ds['train']
+def format_conversation(example):
+    """Format the conversation for instruction fine-tuning"""
+    # Only process root messages (start of conversations)
+    if example["role"] == "prompter" and example["parent_id"] is None:
+        conversation = []
+        current_msg = example
+        conversation.append(("Human", current_msg["text"]))
+        # Follow the conversation thread
+        current_id = current_msg["message_id"]
+        while current_id in message_children:
+            # Get the next message in conversation
+            next_msg = message_children[current_id]
+            if next_msg["role"] == "assistant":
+                conversation.append(("Assistant", next_msg["text"]))
+            elif next_msg["role"] == "prompter":
+                conversation.append(("Human", next_msg["text"]))
+            current_id = next_msg["message_id"]
+        if len(conversation) >= 2:  # At least one exchange (human->assistant)
+            formatted_text = ""
+            for speaker, text in conversation:
+                formatted_text += f"{speaker}: {text}\n\n"
+            return {"text": formatted_text.strip()}
+    return {"text": None}
+# Build message relationships
+print("Building conversation threads...")
+message_children = {}
+for example in train_dataset:
+    if example["parent_id"] is not None:
+        message_children[example["parent_id"]] = example
+# Format complete conversations
+print("\nFormatting conversations...")
+processed_dataset = []
+for example in train_dataset:
+    result = format_conversation(example)
+    if result["text"] is not None:
+        processed_dataset.append(result)
+    if len(processed_dataset) % 100 == 0 and len(processed_dataset) > 0:
+        print(f"Found {len(processed_dataset)} valid conversations")
+print(f"Final dataset size: {len(processed_dataset)} conversations")
+# Convert to Dataset format
+train_dataset = Dataset.from_list(processed_dataset)
+# Remove the redundant conversion
+# train_dataset = list(train_dataset)
+# train_dataset = Dataset.from_list(train_dataset)
+# Convert to standard dataset for training
+train_dataset = list(train_dataset)
+train_dataset = Dataset.from_list(train_dataset)
+# Configure SFT parameters: checkpoints go to "phi2-finetuned" every 100 steps, keeping at most 3
+sft_config = SFTConfig(
+    output_dir="phi2-finetuned",
+    num_train_epochs=1,  # NOTE(review): the Transformers docs say a positive max_steps overrides this - confirm
+    max_steps=500,
+    per_device_train_batch_size=4,
+    gradient_accumulation_steps=1,
+    learning_rate=2e-4,
+    weight_decay=0.001,
+    logging_steps=1,
+    logging_strategy="steps",
+    save_strategy="steps",
+    save_steps=100,
+    save_total_limit=3,
+    push_to_hub=False,
+    max_seq_length=512,
+    report_to="none",
+)
+# Initialize the trainer with the quantized model, the formatted conversations and the LoRA config
+trainer = SFTTrainer(
+    model=model,
+    train_dataset=train_dataset,  # single "text" column built from the conversation threads
+    peft_config=peft_config,
+    args=sft_config,
+)
+# Train the model
+trainer.train()
+# Save the trained model in Hugging Face format
+trainer.save_model("phi2-finetuned-final")
+# Additionally save one PyTorch checkpoint holding the state dict plus the model and LoRA config objects
+model_save_path = "phi2-finetuned-final/model.pt"
+torch.save({
+    'model_state_dict': trainer.model.state_dict(),
+    'config': trainer.model.config,  # NOTE(review): stored as a pickled object, so loading needs the same classes importable - confirm consumers
+    'peft_config': peft_config,
+}, model_save_path)
+print(f"Model saved in PyTorch format at: {model_save_path}")

requirements.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+transformers>=4.34.0
+datasets>=2.14.0
+peft>=0.5.0
+bitsandbytes>=0.41.1
+accelerate>=0.23.0
+torch>=2.0.0
+trl
+gradio

train.py ADDED Viewed

	@@ -0,0 +1,143 @@

+from datasets import load_dataset, Dataset
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+from transformers import TrainingArguments
+from trl import SFTTrainer, SFTConfig
+from peft import LoraConfig, prepare_model_for_kbit_training
+import torch
+# Configure 4-bit quantization (NF4, double quantization, float16 compute) used when loading the base model
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.float16,
+    bnb_4bit_use_double_quant=True,
+)
+# Load the base model with the quantization config defined above
+model_name = "microsoft/phi-2"
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    quantization_config=bnb_config,
+    device_map="auto",
+    trust_remote_code=True
+)
+model.config.use_cache = False  # presumably disabled because the cache only helps generation, not training - TODO confirm
+# Load the tokenizer for the same checkpoint and reuse its EOS token as the padding token
+tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+tokenizer.pad_token = tokenizer.eos_token  # NOTE(review): this tokenizer is never passed to SFTTrainer in this script - confirm the pad token takes effect
+# Prepare the quantized model for k-bit training
+model = prepare_model_for_kbit_training(model)
+# Configure the LoRA adapter (rank 16, alpha 32, dropout 0.05, bias "none") for causal language modelling
+peft_config = LoraConfig(
+    r=16,
+    lora_alpha=32,
+    lora_dropout=0.05,
+    bias="none",
+    task_type="CAUSAL_LM",
+    target_modules=["q_proj", "k_proj", "v_proj", "dense"]  # NOTE(review): must match module names of the loaded phi-2 implementation - verify
+)
+# Load the OpenAssistant dataset and keep its train split for preprocessing
+ds = load_dataset("OpenAssistant/oasst1")
+train_dataset = ds['train']
+def format_conversation(example):
+    """Format the conversation for instruction fine-tuning"""
+    # Only process root messages (start of conversations)
+    if example["role"] == "prompter" and example["parent_id"] is None:
+        conversation = []
+        current_msg = example
+        conversation.append(("Human", current_msg["text"]))
+        # Follow the conversation thread
+        current_id = current_msg["message_id"]
+        while current_id in message_children:
+            # Get the next message in conversation
+            next_msg = message_children[current_id]
+            if next_msg["role"] == "assistant":
+                conversation.append(("Assistant", next_msg["text"]))
+            elif next_msg["role"] == "prompter":
+                conversation.append(("Human", next_msg["text"]))
+            current_id = next_msg["message_id"]
+        if len(conversation) >= 2:  # At least one exchange (human->assistant)
+            formatted_text = ""
+            for speaker, text in conversation:
+                formatted_text += f"{speaker}: {text}\n\n"
+            return {"text": formatted_text.strip()}
+    return {"text": None}
+# Build message relationships
+print("Building conversation threads...")
+message_children = {}
+for example in train_dataset:
+    if example["parent_id"] is not None:
+        message_children[example["parent_id"]] = example
+# Format complete conversations
+print("\nFormatting conversations...")
+processed_dataset = []
+for example in train_dataset:
+    result = format_conversation(example)
+    if result["text"] is not None:
+        processed_dataset.append(result)
+    if len(processed_dataset) % 100 == 0 and len(processed_dataset) > 0:
+        print(f"Found {len(processed_dataset)} valid conversations")
+print(f"Final dataset size: {len(processed_dataset)} conversations")
+# Convert to Dataset format
+train_dataset = Dataset.from_list(processed_dataset)
+# Remove the redundant conversion
+# train_dataset = list(train_dataset)
+# train_dataset = Dataset.from_list(train_dataset)
+# Convert to standard dataset for training
+train_dataset = list(train_dataset)
+train_dataset = Dataset.from_list(train_dataset)
+# Configure SFT parameters: checkpoints go to "phi2-finetuned" every 100 steps, keeping at most 3
+sft_config = SFTConfig(
+    output_dir="phi2-finetuned",
+    num_train_epochs=1,  # NOTE(review): the Transformers docs say a positive max_steps overrides this - confirm
+    max_steps=500,
+    per_device_train_batch_size=4,
+    gradient_accumulation_steps=1,
+    learning_rate=2e-4,
+    weight_decay=0.001,
+    logging_steps=1,
+    logging_strategy="steps",
+    save_strategy="steps",
+    save_steps=100,
+    save_total_limit=3,
+    push_to_hub=False,
+    max_seq_length=512,
+    report_to="none",
+)
+# Initialize the trainer with the quantized model, the formatted conversations and the LoRA config
+trainer = SFTTrainer(
+    model=model,
+    train_dataset=train_dataset,  # single "text" column built from the conversation threads
+    peft_config=peft_config,
+    args=sft_config,
+)
+# Train the model
+trainer.train()
+# Save the trained model in Hugging Face format
+trainer.save_model("phi2-finetuned-final")
+# Additionally save one PyTorch checkpoint holding the state dict plus the model and LoRA config objects
+model_save_path = "phi2-finetuned-final/model.pt"
+torch.save({
+    'model_state_dict': trainer.model.state_dict(),
+    'config': trainer.model.config,  # NOTE(review): stored as a pickled object, so loading needs the same classes importable - confirm consumers
+    'peft_config': peft_config,
+}, model_save_path)
+print(f"Model saved in PyTorch format at: {model_save_path}")