Braszczynski committed on
Commit ed36972
1 Parent(s): 8700197

Update app.py

Files changed (1)
  1. app.py +9 -11
app.py CHANGED

@@ -1,6 +1,7 @@
 import gradio as gr
 import torch
 from transformers import AutoTokenizer, AutoAdapterModel, TextStreamer
+from unsloth import FastLanguageModel
 
 # Configuration Variables
 model_name = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit" # Replace with your actual model name
@@ -10,17 +11,14 @@ max_seq_length = 512 # Adjust as needed
 dtype = None # Example dtype, adjust based on your setup
 load_in_4bit = True # Set to True if you want to use 4-bit quantization
 
-# Dynamically select device
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-print(f"Using device: {device}")
 
-# Conditional import based on GPU availability
-if device.type == "cuda":
-    from unsloth import FastLanguageModel
-    model = AutoAdapterModel.from_pretrained(model_name, low_cpu_mem_usage=True).to(device)
-    model.load_adapter(lora_adapter)
-else:
-    raise RuntimeError("No CUDA GPU available. Please ensure your Space has GPU enabled.")
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name = lora_adapter,
+    max_seq_length = max_seq_length,
+    dtype = dtype,
+    load_in_4bit = load_in_4bit,
+)
+FastLanguageModel.for_inference(model) # Enable native 2x faster inference
 
 def respond(message, history, system_message, max_tokens, temperature, top_p):
     # Combine system message and chat history
@@ -35,7 +33,7 @@ def respond(message, history, system_message, max_tokens, temperature, top_p):
         return_tensors="pt",
         truncation=True,
         max_length=max_seq_length,
-    ).to(device)
+    ).to("cuda")
 
     # Generate the response
     with torch.no_grad():
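
The diff only shows fragments of respond(), so for context here is a minimal sketch of how the model and tokenizer loaded via FastLanguageModel.from_pretrained above might be used inside that function. The chat-template formatting, the (user, assistant) history format, and the generation arguments are illustrative assumptions, not the committed code; only model, tokenizer, max_seq_length, and the .to("cuda") call appear in the diff.

import torch

def respond(message, history, system_message, max_tokens, temperature, top_p):
    # model, tokenizer, and max_seq_length are assumed to be the globals
    # created by the FastLanguageModel.from_pretrained call shown in the diff.

    # Combine system message and chat history into a chat-template prompt.
    # The (user, assistant) tuple format of `history` is an assumption.
    messages = [{"role": "system", "content": system_message}]
    for user_turn, assistant_turn in history:
        messages.append({"role": "user", "content": user_turn})
        messages.append({"role": "assistant", "content": assistant_turn})
    messages.append({"role": "user", "content": message})

    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Tokenize and move to the GPU, mirroring the .to("cuda") change in the diff
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=max_seq_length,
    ).to("cuda")

    # Generate the response
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
        )

    # Decode only the newly generated tokens, dropping the prompt
    new_tokens = output_ids[0][inputs["input_ids"].shape[-1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)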