File size: 2,750 Bytes
1b43757
b3990cf
8c45748
1b43757
584e514
8c45748
 
b3990cf
 
 
 
 
 
 
 
8c45748
 
b3990cf
584e514
b3990cf
584e514
8c45748
 
da721b5
8c45748
 
 
c0b7ba2
8c45748
7d3f0d0
8c45748
 
7d3f0d0
c0b7ba2
8c45748
 
 
 
 
 
 
 
b3990cf
8c45748
 
 
 
 
 
 
 
 
 
 
7d3f0d0
8c45748
 
 
d7828e5
c0b7ba2
1b43757
 
 
c0b7ba2
8c45748
c0b7ba2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1b43757
 
 
 
 
c0b7ba2
1b43757
8c45748
 
 
1b43757
 
 
8c45748
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

# Load model and tokenizer with CPU-compatible settings
model_name = "davnas/Italian_Cousine_2.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Configure quantization properly
quantization_config = BitsAndBytesConfig(
    load_in_4bit=False,
    load_in_8bit=False,
    bnb_4bit_quant_type=None
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="cpu",  # Explicitly set to CPU
    torch_dtype=torch.float32,
    quantization_config=quantization_config,
    use_safetensors=True,
    low_cpu_mem_usage=True,
)

def respond(message, history, system_message, max_tokens, temperature, top_p):
    # Format the conversation
    messages = [{"role": "system", "content": system_message}]
    
    # Add history
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})
    
    # Add current message
    messages.append({"role": "user", "content": message})
    
    # Create the prompt using the tokenizer's chat template
    input_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt"
    )
    
    # Generate response
    with torch.no_grad():
        output_ids = model.generate(
            input_ids,
            max_new_tokens=max_tokens,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            pad_token_id=tokenizer.pad_token_id,
        )
    
    # Decode and return the response
    response = tokenizer.decode(output_ids[0][len(input_ids[0]):], skip_special_tokens=True)
    return response

# Create the interface
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(
            value="You are a professional chef assistant who provides accurate and detailed recipes.",
            label="System message"
        ),
        gr.Slider(
            minimum=1,
            maximum=2048,
            value=512,
            step=1,
            label="Max new tokens"
        ),
        gr.Slider(
            minimum=0.1,
            maximum=4.0,
            value=0.7,
            step=0.1,
            label="Temperature"
        ),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)"
        ),
    ],
    title="Italian Cuisine Chatbot",
    description="Ask me anything about Italian cuisine or cooking!"
)

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)