ahmedbasemdev committed
Commit 5266797 · verified · 1 Parent(s): 2fcb420

Update app.py

Files changed (1)
  1. app.py +16 -14
app.py CHANGED

@@ -1,20 +1,22 @@
-from transformers import AutoModelForCausalLM, AutoTokenizer
-import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 import gradio as gr
 
 # Model and tokenizer paths
 model_name = "ahmedbasemdev/llama-3.2-3b-ChatBot"
 
-# Load the model
-print("Loading the model...")
-model = AutoModelForCausalLM.from_pretrained(model_name)
+# Configure 4-bit quantization
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,               # Enable 4-bit quantization
+    bnb_4bit_use_double_quant=True,  # Use double quantization
+    bnb_4bit_quant_type="nf4",       # Use the NF4 quantization type for better accuracy
+)
 
-# Apply dynamic quantization to reduce model size and improve CPU performance
-print("Applying quantization...")
-model = torch.quantization.quantize_dynamic(
-    model,              # Model to quantize
-    {torch.nn.Linear},  # Layers to quantize (e.g., Linear layers)
-    dtype=torch.qint8,  # Quantized data type
+# Load the model with 4-bit quantization
+print("Loading the quantized model...")
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    quantization_config=bnb_config,
+    device_map="auto",  # Automatically map the model to the available device
 )
 
 # Load the tokenizer
@@ -30,7 +32,7 @@ def single_inference(question):
         messages,
         add_generation_prompt=True,
         return_tensors="pt"
-    ).to("cpu")  # Ensure everything runs on CPU
+    ).to(model.device)  # Ensure it runs on the correct device
 
     # Generate a response
     terminators = [
@@ -55,9 +57,9 @@ interface = gr.Interface(
     fn=single_inference,
     inputs="text",
     outputs="text",
-    title="Chatbot",
+    title="Quantized Chatbot",
     description="Ask me anything!"
 )
 
 # Launch the Gradio app
-interface.launch()
+interface.launch()
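
A note on the new loading path, with a minimal sketch that is not part of the commit: BitsAndBytesConfig 4-bit loading depends on the bitsandbytes and accelerate packages and, in practice, a CUDA GPU, since bitsandbytes 4-bit kernels do not run on CPU. The snippet below only reuses names from the diff plus the standard transformers get_memory_footprint() method, as a rough check that the quantized checkpoint loads as intended.

# Minimal sketch (assumes bitsandbytes + accelerate are installed and a CUDA
# GPU is available; bitsandbytes 4-bit quantization does not run on CPU)
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "ahmedbasemdev/llama-3.2-3b-ChatBot"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Rough sanity check: a 3B-parameter model is ~6 GB in fp16, so the 4-bit
# load should report a substantially smaller footprint.
print(f"Memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB")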