davnas committed (verified)
Commit b3990cf · 1 Parent(s): 584e514

Update app.py

Files changed (1): app.py (+12 -6)
app.py CHANGED
@@ -1,17 +1,23 @@
 import gradio as gr
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 import torch
 
 # Load model and tokenizer with CPU-compatible settings
 model_name = "davnas/Italian_Cousine_2.1"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+# Configure quantization properly
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=False,
+    load_in_8bit=False,
+    bnb_4bit_quant_type=None
+)
+
 model = AutoModelForCausalLM.from_pretrained(
     model_name,
-    device_map="auto",
+    device_map="cpu",  # Explicitly set to CPU
     torch_dtype=torch.float32,
-    load_in_8bit=False,  # Disable quantization
-    load_in_4bit=False,  # Disable quantization
-    quantization_config=None,
+    quantization_config=quantization_config,
     use_safetensors=True,
     low_cpu_mem_usage=True,
 )
@@ -34,7 +40,7 @@ def respond(message, history, system_message, max_tokens, temperature, top_p):
         tokenize=True,
         add_generation_prompt=True,
         return_tensors="pt"
-    ).to(model.device)
+    )
 
     # Generate response
     with torch.no_grad():
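
For context, a minimal sketch of the load-and-generate path after this commit, run outside the Gradio app. The prompt text and max_new_tokens value are illustrative, and the BitsAndBytesConfig from the diff is left out of the sketch; the key point is that with device_map="cpu" the tokenized inputs are created on the same device the model lives on, which is why the .to(model.device) call could be dropped.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "davnas/Italian_Cousine_2.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="cpu",           # everything stays on the CPU
    torch_dtype=torch.float32,
    use_safetensors=True,
    low_cpu_mem_usage=True,
)

# Build the prompt with the model's chat template (message text is illustrative)
inputs = tokenizer.apply_chat_template(
    [{"role": "user", "content": "Suggerisci una ricetta veloce."}],
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
)  # already a CPU tensor, so no .to(model.device) is needed

with torch.no_grad():
    output_ids = model.generate(inputs, max_new_tokens=128)

print(tokenizer.decode(output_ids[0], skip_special_tokens=True))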