syubraj committed
Commit e275f6c · verified · 1 Parent(s): b6e3a4f

Update README.md

Files changed (1)
  1. README.md +49 -15
README.md CHANGED
@@ -24,28 +24,62 @@ This is a **LoRA adapter** for the `microsoft/Phi-3.5-mini-instruct` model, fine
  To use this adapter, you need the base model **`microsoft/Phi-3.5-mini-instruct`**. Load it with `peft`:

  ```python
- from transformers import AutoModelForCausalLM, AutoTokenizer
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
  from peft import PeftModel

- base_model_name = "microsoft/Phi-3.5-mini-instruct" # Base model
- lora_model_path = "syubraj/Phi-3.5-mini-instruct-MedicalChat-QLoRA" # QLoRA adapter
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ print(device)

- # Load base model
- base_model = AutoModelForCausalLM.from_pretrained(base_model_name)
+ # Define base model and your fine-tuned LoRA checkpoint
+ base_model_name = "microsoft/Phi-3.5-mini-instruct"
+ lora_model_path = "syubraj/Phi-3.5-mini-instruct-MedicalChat-QLoRA"
+
+ # Load tokenizer
  tokenizer = AutoTokenizer.from_pretrained(base_model_name)

- # Load LoRA adapter
- lora_model = PeftModel.from_pretrained(base_model, lora_model_path)
- lora_model.to("cuda") # Move to GPU if available
+ # Load model with proper 4-bit quantization settings
+ bnb_config = BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_compute_dtype=torch.float16,
+     bnb_4bit_use_double_quant=True,
+     bnb_4bit_quant_type="nf4"
+ )
+
+ base_model = AutoModelForCausalLM.from_pretrained(
+     "microsoft/Phi-3.5-mini-instruct",
+     quantization_config=bnb_config,
+     device_map="auto"
+ )
+
+ model = PeftModel.from_pretrained(base_model, lora_model_path)
+
+ model = model.merge_and_unload()
+ model.to(device)
+
+ print("Model successfully loaded!")
+
+ # Inference function
+ def generate_response(user_query, system_message=None, max_length=1024):
+     if system_message is None:
+         system_message = ("You are a trusted AI-powered medical assistant. "
+                           "Analyze patient queries carefully and provide accurate, professional, and empathetic responses. "
+                           "Prioritize patient safety, adhere to medical best practices, and recommend consulting a healthcare provider when necessary.")
+
+     # Prepare input prompt
+     prompt = f"<|system|> {system_message} <|end|>\n<|user|> {user_query} <|end|>\n<|assistant|>"
+
+     inputs = tokenizer(prompt, return_tensors="pt").to(device)
+     outputs = model.generate(**inputs, max_length=max_length)
+
+     # Decode response
+     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+     return response.split("<|assistant|>")[-1].strip().split("<|end|>")[0].strip()

- # Generate a response
- def generate_response(prompt, max_length=256):
-     inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
-     outputs = lora_model.generate(**inputs, max_length=max_length)
-     return tokenizer.decode(outputs[0], skip_special_tokens=True)
+ if __name__ == "__main__":
+     res = generate_response("Hi, How can someone let go of fever?")
+     print(res)

- # Example
- print(generate_response("Hi Doctor, what are the symptoms of flu?"))
  ```

  ---
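
For readers adapting the committed example, here is a minimal sketch (not part of the commit) that builds the same style of prompt with the tokenizer's built-in chat template instead of hand-formatting the `<|system|>`/`<|user|>`/`<|assistant|>` markers. It assumes a recent `transformers` release whose Phi-3.5 tokenizer ships a chat template, and it reuses the `tokenizer`, `model`, and `device` objects defined in the example above:

```python
# Sketch only: assumes `tokenizer`, `model`, and `device` already exist as in the
# README example, and that the tokenizer provides a chat template (recent transformers).
messages = [
    {"role": "system", "content": "You are a trusted AI-powered medical assistant."},
    {"role": "user", "content": "Hi, How can someone let go of fever?"},
]

# Build a prompt with the same role structure the README formats by hand.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

inputs = tokenizer(prompt, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=512)

# Decode only the newly generated tokens, skipping the echoed prompt.
new_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
print(tokenizer.decode(new_tokens, skip_special_tokens=True))
```

Both routes should produce equivalent role formatting; the chat-template version simply avoids keeping the special-token strings in sync with the model's expected format by hand.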