davnas committed
Commit 584e514 · verified · 1 Parent(s): 0ace396

Update app.py

Files changed (1):
  1. app.py (+8 -5)
app.py CHANGED
@@ -2,14 +2,18 @@ import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
 
-# Load model and tokenizer
+# Load model and tokenizer with CPU-compatible settings
 model_name = "davnas/Italian_Cousine_2.1"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModelForCausalLM.from_pretrained(
     model_name,
-    torch_dtype=torch.float32,  # Use float32 for CPU
+    device_map="auto",
+    torch_dtype=torch.float32,
+    load_in_8bit=False,  # Disable quantization
+    load_in_4bit=False,  # Disable quantization
+    quantization_config=None,
+    use_safetensors=True,
     low_cpu_mem_usage=True,
-    device_map="auto"
 )
 
 def respond(message, history, system_message, max_tokens, temperature, top_p):
@@ -30,7 +34,7 @@ def respond(message, history, system_message, max_tokens, temperature, top_p):
         tokenize=True,
         add_generation_prompt=True,
         return_tensors="pt"
-    )
+    ).to(model.device)
 
     # Generate response
     with torch.no_grad():
@@ -41,7 +45,6 @@ def respond(message, history, system_message, max_tokens, temperature, top_p):
         temperature=temperature,
         top_p=top_p,
         pad_token_id=tokenizer.pad_token_id,
-        streaming=True
     )
 
     # Decode and return the response
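Note on the device change: apply_chat_template(..., return_tensors="pt") creates its tensors on the CPU, so the new .to(model.device) moves the prompt onto whichever device device_map="auto" selected before generate() runs; under a GPU placement the old code would fail with a device-mismatch error. Below is a hedged smoke-test sketch of the loading and device contract after this commit, not code from the Space itself; the example message is illustrative only.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "davnas/Italian_Cousine_2.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",          # resolves to the CPU on a CPU-only Space
    torch_dtype=torch.float32,  # CPUs lack fast half-precision kernels
    low_cpu_mem_usage=True,
)
print(model.device, model.dtype)  # expect: cpu torch.float32

# Illustrative prompt, not taken from the Space's UI
messages = [{"role": "user", "content": "Suggest a simple pasta recipe."}]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
).to(model.device)  # the placement fix exercised by this commit

with torch.no_grad():
    output = model.generate(
        inputs,
        max_new_tokens=32,
        pad_token_id=tokenizer.pad_token_id,
    )
print(tokenizer.decode(output[0], skip_special_tokens=True))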
 
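Note on the dropped argument: streaming=True is not a generate() parameter, and current transformers versions reject unknown generation kwargs with a validation error, which is presumably why it was removed. If token streaming is wanted later, the supported route is a TextIteratorStreamer driving generate() on a worker thread. A hedged sketch under that assumption — respond_streaming and its arguments are illustrative names, not code from this Space:

from threading import Thread
from transformers import TextIteratorStreamer

def respond_streaming(model, tokenizer, inputs, max_tokens, temperature, top_p):
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    generation_kwargs = dict(
        inputs=inputs,                     # prompt already on model.device
        streamer=streamer,
        max_new_tokens=max_tokens,
        do_sample=True,                    # temperature/top_p require sampling
        temperature=temperature,
        top_p=top_p,
        pad_token_id=tokenizer.pad_token_id,
    )
    Thread(target=model.generate, kwargs=generation_kwargs).start()
    partial = ""
    for piece in streamer:  # yields decoded text as tokens arrive
        partial += piece
        yield partial       # a Gradio ChatInterface renders each partial reply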