Grandediw committed
Commit 1279ab7 · verified · 1 Parent(s): 5baa435

Update app.py

Files changed (1):
  1. app.py  +17 -10
app.py CHANGED
@@ -1,7 +1,7 @@
 import os
 import gradio as gr
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 
 # Load the Hugging Face API token from environment variable
 token = os.getenv("HUGGINGFACE_API_TOKEN")
@@ -12,19 +12,26 @@ if not token:
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
 # Load the tokenizer and model using the token
-model_repo = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit"
-tokenizer = AutoTokenizer.from_pretrained(model_repo, use_auth_token=token)
+model_repo = "unsloth/llama-3.2-3b-instruct-bnb-4bit"
+tokenizer = AutoTokenizer.from_pretrained(model_repo, token=token)
+
+# Configure 4-bit quantization
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.float16
+)
+
+# Load the model with quantization configuration
 model = AutoModelForCausalLM.from_pretrained(
     model_repo,
-    use_auth_token=token,
+    token=token,
     device_map="auto",
-    torch_dtype=torch.float16,
-    load_in_4bit=True,
-    quantization_config={"bnb_4bit_use_double_quant": True, "bnb_4bit_quant_type": "nf4"}
+    quantization_config=quantization_config
 )
 
-# Move the model to the device
-model.to(device)
+# Ensure the model is in evaluation mode
 model.eval()
 
 # Define the inference function
@@ -37,7 +44,7 @@ def infer(prompt):
 
 # Gradio interface
 with gr.Blocks() as demo:
-    gr.Markdown("## Llama 3.2 3B Instruct Model Inference")
+    gr.Markdown("## LLaMA 3.2 3B Instruct Model Inference")
 
     with gr.Row():
         prompt = gr.Textbox(label="Prompt", placeholder="Enter your prompt here...")
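For reference, a minimal sketch of the loading-and-generation path as it stands after this commit. The model loading mirrors the new version of app.py; the body of infer() is not shown in this diff, so the generation code below (generate with max_new_tokens, decode with skip_special_tokens) is an assumed, hypothetical implementation, not the author's actual function.

import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

token = os.getenv("HUGGINGFACE_API_TOKEN")
model_repo = "unsloth/llama-3.2-3b-instruct-bnb-4bit"

# Tokenizer and 4-bit quantized model, as configured in the updated app.py
tokenizer = AutoTokenizer.from_pretrained(model_repo, token=token)
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)
model = AutoModelForCausalLM.from_pretrained(
    model_repo,
    token=token,
    device_map="auto",               # accelerate places the weights; no manual model.to(device)
    quantization_config=quantization_config,
)
model.eval()

# Hypothetical infer() body -- the diff does not include it
def infer(prompt: str) -> str:
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output_ids = model.generate(**inputs, max_new_tokens=256)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

Passing quantization_config=BitsAndBytesConfig(...) replaces the removed load_in_4bit / dict-style arguments, and token= replaces the deprecated use_auth_token= keyword in recent transformers releases.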