import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load model and tokenizer with CPU-compatible settings
model_name = "davnas/Italian_Cousine_2.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# No quantization config is passed: bitsandbytes 4-/8-bit loading generally
# requires a CUDA GPU, so the model is loaded in plain float32 on CPU instead.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="cpu",  # Explicitly set to CPU
    torch_dtype=torch.float32,
    use_safetensors=True,
    low_cpu_mem_usage=True,
)

def respond(message, history, system_message, max_tokens, temperature, top_p):
    # Format the conversation
    messages = [{"role": "system", "content": system_message}]
    
    # Add history
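    # (history is assumed to arrive in Gradio's tuple format: a list of
    #  (user_message, assistant_reply) pairs, the ChatInterface default here)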
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})
    
    # Add current message
    messages.append({"role": "user", "content": message})
    
    # Create the prompt using the tokenizer's chat template
    input_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt"
    )
    
    # Generate response
    with torch.no_grad():
        output_ids = model.generate(
            input_ids,
            max_new_tokens=max_tokens,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            # Fall back to EOS if the tokenizer defines no pad token
            pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id,
        )
    
    # Decode only the newly generated tokens, skipping the prompt that was fed in
    response = tokenizer.decode(output_ids[0][len(input_ids[0]):], skip_special_tokens=True)
    return response

# Create the interface
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(
            value="You are a professional chef assistant who provides accurate and detailed recipes.",
            label="System message"
        ),
        gr.Slider(
            minimum=1,
            maximum=2048,
            value=512,
            step=1,
            label="Max new tokens"
        ),
        gr.Slider(
            minimum=0.1,
            maximum=4.0,
            value=0.7,
            step=0.1,
            label="Temperature"
        ),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)"
        ),
    ],
    title="Italian Cuisine Chatbot",
    description="Ask me anything about Italian cuisine or cooking!"
)

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)
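# launch() binds on all interfaces at port 7860 (the port Hugging Face Spaces
# expects); when run locally, the UI is reachable at http://localhost:7860.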