Grandediw committed
Commit 1279ab7 · verified · 1 Parent(s): 5baa435

Update app.py

Files changed (1):
  1. app.py  +17 -10
app.py CHANGED
@@ -1,7 +1,7 @@
 import os
 import gradio as gr
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 
 # Load the Hugging Face API token from environment variable
 token = os.getenv("HUGGINGFACE_API_TOKEN")
@@ -12,19 +12,26 @@ if not token:
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
 # Load the tokenizer and model using the token
-model_repo = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit"
-tokenizer = AutoTokenizer.from_pretrained(model_repo, use_auth_token=token)
+model_repo = "unsloth/llama-3.2-3b-instruct-bnb-4bit"
+tokenizer = AutoTokenizer.from_pretrained(model_repo, token=token)
+
+# Configure 4-bit quantization
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.float16
+)
+
+# Load the model with quantization configuration
 model = AutoModelForCausalLM.from_pretrained(
     model_repo,
-    use_auth_token=token,
+    token=token,
     device_map="auto",
-    torch_dtype=torch.float16,
-    load_in_4bit=True,
-    quantization_config={"bnb_4bit_use_double_quant": True, "bnb_4bit_quant_type": "nf4"}
+    quantization_config=quantization_config
 )
 
-# Move the model to the device
-model.to(device)
+# Ensure the model is in evaluation mode
 model.eval()
 
 # Define the inference function
@@ -37,7 +44,7 @@ def infer(prompt):
 
 # Gradio interface
 with gr.Blocks() as demo:
-    gr.Markdown("## Llama 3.2 3B Instruct Model Inference")
+    gr.Markdown("## LLaMA 3.2 3B Instruct Model Inference")
 
     with gr.Row():
         prompt = gr.Textbox(label="Prompt", placeholder="Enter your prompt here...")
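For reference, a minimal sketch of the loading-and-generation path as it stands after this commit. The model loading mirrors the new version of app.py; the body of infer() is not shown in this diff, so the generation code below (generate with max_new_tokens, decode with skip_special_tokens) is an assumed, hypothetical implementation, not the author's actual function.

import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

token = os.getenv("HUGGINGFACE_API_TOKEN")
model_repo = "unsloth/llama-3.2-3b-instruct-bnb-4bit"

# Tokenizer and 4-bit quantized model, as configured in the updated app.py
tokenizer = AutoTokenizer.from_pretrained(model_repo, token=token)
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)
model = AutoModelForCausalLM.from_pretrained(
    model_repo,
    token=token,
    device_map="auto",               # accelerate places the weights; no manual model.to(device)
    quantization_config=quantization_config,
)
model.eval()

# Hypothetical infer() body -- the diff does not include it
def infer(prompt: str) -> str:
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output_ids = model.generate(**inputs, max_new_tokens=256)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

Passing quantization_config=BitsAndBytesConfig(...) replaces the removed load_in_4bit / dict-style arguments, and token= replaces the deprecated use_auth_token= keyword in recent transformers releases.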