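"""Gradio chat demo that serves an INT4-compressed LLM through OpenVINO.

Model choices come from llm_config.SUPPORTED_LLM_MODELS; replies are
streamed token-by-token into the chat window.
"""
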
from pathlib import Path
from threading import Thread

from transformers import AutoConfig, AutoTokenizer, TextIteratorStreamer
from optimum.intel.openvino import OVModelForCausalLM
import openvino.properties as props
import openvino.properties.hint as hints
import openvino.properties.streams as streams
import gradio as gr

from llm_config import SUPPORTED_LLM_MODELS

# Initialize model language options
model_languages = list(SUPPORTED_LLM_MODELS)

# Define Gradio interface within a Blocks context
with gr.Blocks(title="OpenVINO Chatbot") as iface:
    # Dropdown for model language selection
    model_language = gr.Dropdown(
        choices=model_languages,
        value=model_languages[0],
        label="Model Language"
    )

    # Dropdown for model ID, repopulated whenever the language changes
    model_id = gr.Dropdown(
        choices=list(SUPPORTED_LLM_MODELS[model_languages[0]]),
        label="Model",
        value=None
    )

    # Function to update model_id dropdown choices based on model_language
    def update_model_id(model_language_value):
        model_ids = list(SUPPORTED_LLM_MODELS[model_language_value])
        return gr.update(choices=model_ids, value=model_ids[0])

    # Update model_id choices when model_language changes
    model_language.change(update_model_id, inputs=model_language, outputs=model_id)

    # Checkbox for INT4 model preparation
    prepare_int4_model = gr.Checkbox(
        value=True,
        label="Prepare INT4 Model"
    )

    # Checkbox for enabling AWQ (shown conditionally)
    enable_awq = gr.Checkbox(
        value=False,
        label="Enable AWQ",
        visible=False  # visibility can be controlled in the UI logic
    )
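
    # Sketch of the conditional visibility mentioned above: show the AWQ
    # option only while INT4 preparation is selected
    prepare_int4_model.change(
        lambda checked: gr.update(visible=checked),
        inputs=prepare_int4_model,
        outputs=enable_awq,
    )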

    # Dropdown for device selection
    device = gr.Dropdown(
        choices=["CPU", "GPU"],
        value="CPU",
        label="Device"
    )
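    # Note: "GPU" targets an Intel GPU through OpenVINO's GPU plugin and
    # requires the corresponding drivers to be installed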

    # Function to retrieve model configuration and path
    def get_model_path(model_language_value, model_id_value):
        model_configuration = SUPPORTED_LLM_MODELS[model_language_value][model_id_value]
        pt_model_name = model_id_value.split("-")[0]
        int4_model_dir = Path(model_id_value) / "INT4_compressed_weights"
        return model_configuration, int4_model_dir, pt_model_name

    # Function to download the model if not already present
    def download_model_if_needed(model_language_value, model_id_value):
        model_configuration, int4_model_dir, pt_model_name = get_model_path(model_language_value, model_id_value)
        int4_weights = int4_model_dir / "openvino_model.bin"
        if not int4_weights.exists():
            print(f"Downloading model {model_id_value}...")
            # Download logic (e.g., requests.get(model_configuration["model_url"])) can go here
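            # Alternatively, INT4 weights can be exported ahead of time with:
            #   optimum-cli export openvino --model <hub_id> --weight-format int4 <output_dir>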
        return int4_model_dir

    # Load the tokenizer and OpenVINO model for the selected options
    def load_model(model_language_value, model_id_value, device_value="CPU"):
        int4_model_dir = download_model_if_needed(model_language_value, model_id_value)
        # Runtime hints: optimize for latency, use a single inference stream,
        # and disable the OpenVINO model cache
        ov_config = {
            hints.performance_mode(): hints.PerformanceMode.LATENCY,
            streams.num(): "1",
            props.cache_dir(): ""
        }
        tok = AutoTokenizer.from_pretrained(int4_model_dir, trust_remote_code=True)
        ov_model = OVModelForCausalLM.from_pretrained(
            int4_model_dir,
            device=device_value,
            ov_config=ov_config,
            config=AutoConfig.from_pretrained(int4_model_dir, trust_remote_code=True),
            trust_remote_code=True
        )
        return tok, ov_model

    # Gradio sliders for model generation parameters
    temperature = gr.Slider(minimum=0.0, maximum=1.0, value=0.7, label="Temperature")
    top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.9, label="Top P")
    top_k = gr.Slider(minimum=0, maximum=50, value=50, label="Top K")
    repetition_penalty = gr.Slider(minimum=1.0, maximum=2.0, value=1.1, label="Repetition Penalty")

    # Conversation history state
    history = gr.State([])

    # Generate a streaming response for the latest user turn
    def generate_response(history, temperature, top_p, top_k, repetition_penalty,
                          model_language_value, model_id_value, device_value):
        # NOTE: the model is reloaded on every request; cache tok/ov_model in
        # production to avoid the reload cost
        tok, ov_model = load_model(model_language_value, model_id_value, device_value)

        def convert_history_to_token(history):
            # Naive prompt: concatenate all user turns. Chat-tuned models
            # should use their own prompt template instead.
            return tok(" ".join(msg[0] for msg in history), return_tensors="pt").input_ids

        input_ids = convert_history_to_token(history)

        # Stream tokens as they are produced instead of waiting for the full output
        streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)
        generate_kwargs = dict(
            input_ids=input_ids,
            max_new_tokens=256,
            temperature=temperature,
            do_sample=temperature > 0.0,
            top_p=top_p,
            top_k=top_k,
            repetition_penalty=repetition_penalty,
            streamer=streamer,
        )

        # Run generation in a background thread so partial text can be yielded
        Thread(target=ov_model.generate, kwargs=generate_kwargs).start()

        response = ""
        for new_text in streamer:
            response += new_text
            history[-1][1] = response
            yield history

    # Chat UI: conversation display and user input box
    chatbot = gr.Chatbot(label="Conversation History")
    msg = gr.Textbox(label="Your Message")

    # Append the user's turn to history, then stream the model's reply
    def add_user_message(message, history):
        return "", history + [[message, None]]

    msg.submit(
        add_user_message, [msg, history], [msg, history]
    ).then(
        generate_response,
        [history, temperature, top_p, top_k, repetition_penalty, model_language, model_id, device],
        chatbot,
    )

# Launch the Gradio app
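# share=True creates a temporary public gradio.live link; server_name="0.0.0.0"
# exposes the app to other machines on the local network.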
if __name__ == "__main__":
    iface.launch(debug=True, share=True, server_name="0.0.0.0", server_port=7860)