ahmedbasemdev committed
Commit 5266797 · verified · 1 Parent(s): 2fcb420

Update app.py

Files changed (1)
  1. app.py +16 -14
app.py CHANGED

@@ -1,20 +1,22 @@
-from transformers import AutoModelForCausalLM, AutoTokenizer
-import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 import gradio as gr
 
 # Model and tokenizer paths
 model_name = "ahmedbasemdev/llama-3.2-3b-ChatBot"
 
-# Load the model
-print("Loading the model...")
-model = AutoModelForCausalLM.from_pretrained(model_name)
+# Configure 4-bit quantization
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,               # Enable 4-bit quantization
+    bnb_4bit_use_double_quant=True,  # Use double quantization
+    bnb_4bit_quant_type="nf4",       # Use the NF4 quantization type for better accuracy
+)
 
-# Apply dynamic quantization to reduce model size and improve CPU performance
-print("Applying quantization...")
-model = torch.quantization.quantize_dynamic(
-    model,              # Model to quantize
-    {torch.nn.Linear},  # Layers to quantize (e.g., Linear layers)
-    dtype=torch.qint8,  # Quantized data type
+# Load the model with 4-bit quantization
+print("Loading the quantized model...")
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    quantization_config=bnb_config,
+    device_map="auto",  # Automatically map the model to the available device
 )
 
 # Load the tokenizer
@@ -30,7 +32,7 @@ def single_inference(question):
         messages,
         add_generation_prompt=True,
         return_tensors="pt"
-    ).to("cpu")  # Ensure everything runs on CPU
+    ).to(model.device)  # Ensure it runs on the correct device
 
     # Generate a response
     terminators = [
@@ -55,9 +57,9 @@ interface = gr.Interface(
     fn=single_inference,
     inputs="text",
     outputs="text",
-    title="Chatbot",
+    title="Quantized Chatbot",
     description="Ask me anything!"
 )
 
 # Launch the Gradio app
-interface.launch()
+interface.launch()
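
A note on the new loading path, with a minimal sketch that is not part of the commit: BitsAndBytesConfig 4-bit loading depends on the bitsandbytes and accelerate packages and, in practice, a CUDA GPU, since bitsandbytes 4-bit kernels do not run on CPU. The snippet below only reuses names from the diff plus the standard transformers get_memory_footprint() method, as a rough check that the quantized checkpoint loads as intended.

# Minimal sketch (assumes bitsandbytes + accelerate are installed and a CUDA
# GPU is available; bitsandbytes 4-bit quantization does not run on CPU)
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "ahmedbasemdev/llama-3.2-3b-ChatBot"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Rough sanity check: a 3B-parameter model is ~6 GB in fp16, so the 4-bit
# load should report a substantially smaller footprint.
print(f"Memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB")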