ahmedbasemdev committed
Commit 2fcb420 · verified · 1 Parent(s): c9746e0

Update app.py

Files changed (1)
  1. app.py +25 -21
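
Note on the change: the old app.py loaded the checkpoint with bitsandbytes 8-bit quantization (load_in_8bit=True, device_map="auto"), which generally assumes a CUDA GPU, while the new version loads the model in full precision and then applies PyTorch post-training dynamic quantization, which converts nn.Linear weights to int8 for CPU inference. Below is a minimal sketch of that technique in isolation, using a small stand-in module instead of the 3B checkpoint; the layer sizes and the size_on_disk helper are illustrative, not part of app.py.

# Minimal sketch: PyTorch dynamic int8 quantization as adopted in the new app.py,
# demonstrated on a small stand-in module (sizes and helper are illustrative).
import os
import torch

float_model = torch.nn.Sequential(
    torch.nn.Linear(4096, 4096),
    torch.nn.ReLU(),
    torch.nn.Linear(4096, 4096),
)

# quantize_dynamic swaps nn.Linear layers for int8-weight versions;
# activations are quantized on the fly at runtime, so it is CPU-friendly.
quantized_model = torch.quantization.quantize_dynamic(
    float_model,
    {torch.nn.Linear},
    dtype=torch.qint8,
)

def size_on_disk(module, path="tmp_model.pt"):
    # Illustrative helper: serialize the weights and report the file size in MB.
    torch.save(module.state_dict(), path)
    size_mb = os.path.getsize(path) / 1e6
    os.remove(path)
    return size_mb

print(f"fp32 weights: {size_on_disk(float_model):.1f} MB")
print(f"int8 weights: {size_on_disk(quantized_model):.1f} MB")

On the full checkpoint this roughly quarters the storage for the Linear weights (fp32 to int8); embeddings and layer norms stay in floating point.
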
app.py CHANGED
@@ -1,34 +1,38 @@
-import gradio as gr
-
-# Load your model and tokenizer
 from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
+import gradio as gr
 
-# Specify the model name
+# Model and tokenizer paths
 model_name = "ahmedbasemdev/llama-3.2-3b-ChatBot"
 
-# Load the model with 8-bit quantization
-model = AutoModelForCausalLM.from_pretrained(
-    model_name,
-    device_map="auto",         # Automatically map the model to the available device (CPU)
-    load_in_8bit=True,         # Enable 8-bit quantization
-    torch_dtype=torch.float16  # Use mixed precision
+# Load the model
+print("Loading the model...")
+model = AutoModelForCausalLM.from_pretrained(model_name)
+
+# Apply dynamic quantization to reduce model size and improve CPU performance
+print("Applying quantization...")
+model = torch.quantization.quantize_dynamic(
+    model,              # Model to quantize
+    {torch.nn.Linear},  # Layers to quantize (e.g., Linear layers)
+    dtype=torch.qint8,  # Quantized data type
 )
 
 # Load the tokenizer
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 
+# Define the inference function
 def single_inference(question):
     messages = []
-
     messages.append({"role": "user", "content": question})
 
+    # Tokenize the input
     input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
-    ).to(model.device)
+    ).to("cpu")  # Ensure everything runs on CPU
 
+    # Generate a response
     terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>")
@@ -45,15 +49,15 @@ def single_inference(question):
     output = tokenizer.decode(response, skip_special_tokens=True)
     return output
 
-# Create the Gradio interface
+# Gradio interface
+print("Setting up Gradio app...")
 interface = gr.Interface(
-    fn=single_inference,                                           # Function to wrap
-    inputs=gr.Textbox(lines=2, placeholder="Ask a question..."),   # Input type
-    outputs=gr.Textbox(label="Response"),                          # Output type
-    title="Chat with Your Model",                                  # App title
-    description="Enter a question, and the model will generate a response.",  # App description
+    fn=single_inference,
+    inputs="text",
+    outputs="text",
+    title="Chatbot",
+    description="Ask me anything!"
 )
 
-# Launch the app
-if __name__ == "__main__":
-    interface.launch()
+# Launch the Gradio app
+interface.launch()
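
Both hunks skip the middle of single_inference (old lines 35-44 / new lines 39-48), so the call that actually produces response from input_ids and terminators is not shown in this commit. The following is a hypothetical sketch of the usual Llama-3 chat generation pattern that the shown lines appear to rely on; every argument here is an assumption, not code taken from app.py.

# Hypothetical reconstruction of the elided generation step (not part of this diff).
outputs = model.generate(
    input_ids,
    max_new_tokens=256,        # assumed cap on reply length
    eos_token_id=terminators,  # stop on EOS or <|eot_id|>, as defined above
    do_sample=True,            # assumed sampling settings
    temperature=0.6,
    top_p=0.9,
)
# Drop the prompt tokens so only the newly generated reply is decoded; this is the
# "response" that tokenizer.decode(..., skip_special_tokens=True) receives.
response = outputs[0][input_ids.shape[-1]:]

On a CPU-only Space, even a dynamically quantized 3B model is likely slow per token, so a modest max_new_tokens helps keep the Gradio request from timing out.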