Braszczynski committed
Commit 94d5aca · verified · 1 parent: 32ab136

Update app.py

Files changed (1): app.py (+17 -12)
app.py CHANGED
@@ -1,24 +1,26 @@
 import gradio as gr
 import torch
-from unsloth import FastLanguageModel
-from transformers import TextStreamer
+from transformers import AutoTokenizer, AutoAdapterModel, TextStreamer
 
 # Configuration Variables
 model_name = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit" # Replace with your actual model name
 lora_adapter = "Braszczynski/Llama-3.2-3B-Instruct-bnb-4bit-460steps"
 
 max_seq_length = 512 # Adjust as needed
-dtype = None # Example dtype, adjust based on your setup
-load_in_4bit = True # Set to True if you want to use 4-bit quantization
-
-# Load the tokenizer
-tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
-
-# Load the base model with adapters
-model = AutoAdapterModel.from_pretrained(model_name, low_cpu_mem_usage=True).to("cuda")
-model.load_adapter(lora_adapter)
+dtype = None # Example dtype, adjust based on your setup
+load_in_4bit = True # Set to True if you want to use 4-bit quantization
 
+# Dynamically select device
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+print(f"Using device: {device}")
 
+# Conditional import based on GPU availability
+if device.type == "cuda":
+    from unsloth import FastLanguageModel
+    model = AutoAdapterModel.from_pretrained(model_name, low_cpu_mem_usage=True).to(device)
+    model.load_adapter(lora_adapter)
+else:
+    raise RuntimeError("No CUDA GPU available. Please ensure your Space has GPU enabled.")
 
 def respond(message, history, system_message, max_tokens, temperature, top_p):
     # Combine system message and chat history
@@ -51,6 +53,9 @@ def respond(message, history, system_message, max_tokens, temperature, top_p):
     response = response[len(chat_history):].strip() # Remove the input context
     return response
 
+# Load the tokenizer
+tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
+
 # Define the Gradio interface
 demo = gr.ChatInterface(
     respond,
@@ -63,4 +68,4 @@ demo = gr.ChatInterface(
 )
 
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch()
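A caveat on the new import line: AutoAdapterModel is not part of core transformers (it ships with the separate adapters library, formerly adapter-transformers), so "from transformers import AutoTokenizer, AutoAdapterModel, TextStreamer" will raise an ImportError on a stock install. For a LoRA adapter repository like the one configured here, one common alternative is peft. The sketch below is a hedged illustration under those assumptions (the adapter repo is in PEFT/LoRA format, and bitsandbytes plus accelerate are installed for the pre-quantized 4-bit base); it is not the loading path this commit actually uses:

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from peft import PeftModel  # assumption: adapter repo is saved in PEFT/LoRA format

    model_name = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit"
    lora_adapter = "Braszczynski/Llama-3.2-3B-Instruct-bnb-4bit-460steps"

    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    # device_map="auto" lets accelerate place the 4-bit weights; calling .to("cuda")
    # on a bitsandbytes-quantized model is rejected by recent transformers versions.
    base = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", low_cpu_mem_usage=True)
    model = PeftModel.from_pretrained(base, lora_adapter)  # attach the LoRA weights
    model.eval()

A smaller ordering note on the second hunk: the commit moves the tokenizer load below the respond definition. That is safe in Python, because a function body resolves global names at call time, not at definition time; any reference to tokenizer inside respond is only looked up once a chat message arrives, after the whole module has finished executing, so demo.launch() never sees an undefined name.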