from pathlib import Path
from threading import Event, Thread

import requests  # used by the commented-out download example below
from transformers import AutoConfig, AutoTokenizer, TextIteratorStreamer
from optimum.intel.openvino import OVModelForCausalLM
import openvino as ov
import openvino.properties as props
import openvino.properties.hint as hints
import openvino.properties.streams as streams
import gradio as gr

from llm_config import SUPPORTED_LLM_MODELS
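# SUPPORTED_LLM_MODELS is assumed to map language -> model id -> configuration dict
# (see how it is indexed in get_model_path below)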

# Initialize model language options
model_languages = list(SUPPORTED_LLM_MODELS)

# Gradio Interface inside Blocks
with gr.Blocks() as iface:
    gr.Markdown("# OpenVINO Chatbot")
    model_language = gr.Dropdown(
        choices=model_languages,
        value=model_languages[0],
        label="Model Language"
    )

    default_model_ids = list(SUPPORTED_LLM_MODELS[model_languages[0]])
    model_id = gr.Dropdown(
        choices=default_model_ids,  # populated for the default language; refreshed on language change
        label="Model",
        value=default_model_ids[0]
    )

    # Function to update model_id dropdown choices based on model_language
    def update_model_id(model_language_value):
        model_ids = list(SUPPORTED_LLM_MODELS[model_language_value])
        return gr.update(value=model_ids[0], choices=model_ids)

    model_language.change(update_model_id, inputs=model_language, outputs=model_id)

    # Gradio checkbox for preparing INT4 model
    prepare_int4_model = gr.Checkbox(
        value=True,
        label="Prepare INT4 Model"
    )

    # Gradio checkbox for enabling AWQ (depends on INT4 checkbox)
    enable_awq = gr.Checkbox(
        value=False,
        label="Enable AWQ",
        visible=False
    )

    # Show the AWQ option only when INT4 compression is selected
    prepare_int4_model.change(
        lambda int4: gr.update(visible=int4),
        inputs=prepare_int4_model,
        outputs=enable_awq,
    )

    # Gradio dropdown for device selection
    device = gr.Dropdown(
        choices=["CPU", "GPU"],
        value="CPU",
        label="Device"
    )

    # Model directory and setup based on selections
    def get_model_path(model_language_value, model_id_value):
        model_configuration = SUPPORTED_LLM_MODELS[model_language_value][model_id_value]
        pt_model_name = model_id_value.split("-")[0]
        int4_model_dir = Path(model_id_value) / "INT4_compressed_weights"
        return model_configuration, int4_model_dir, pt_model_name

    # Function to download the model if not already present
    def download_model_if_needed(model_language_value, model_id_value):
        model_configuration, int4_model_dir, pt_model_name = get_model_path(model_language_value, model_id_value)
        
        int4_weights = int4_model_dir / "openvino_model.bin"
        
        if not int4_weights.exists():
            int4_model_dir.mkdir(parents=True, exist_ok=True)
            print(f"Downloading model {model_id_value}...")
            # Add your download logic here (e.g., from a URL).
            # Example, assuming the model configuration provides a direct "model_url":
            # r = requests.get(model_configuration["model_url"])
            # with open(int4_weights, "wb") as f:
            #     f.write(r.content)

        return int4_model_dir

    # Load the model
    def load_model(model_language_value, model_id_value):
        int4_model_dir = download_model_if_needed(model_language_value, model_id_value)
        
        # Load the OpenVINO model
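        # Performance hints: optimize for latency, use a single inference stream,
        # and pass an empty cache_dir to avoid the OpenVINO model cache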
        ov_config = {hints.performance_mode(): hints.PerformanceMode.LATENCY, streams.num(): "1", props.cache_dir(): ""}
        core = ov.Core()
        
        model_dir = int4_model_dir
        model_configuration = SUPPORTED_LLM_MODELS[model_language_value][model_id_value]
        
        tok = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
        ov_model = OVModelForCausalLM.from_pretrained(
            model_dir,
            device=device.value,  # reads the dropdown's default; a live selection would need to be passed in as an input
            ov_config=ov_config,
            config=AutoConfig.from_pretrained(model_dir, trust_remote_code=True),
            trust_remote_code=True
        )
        
        return tok, ov_model, model_configuration

    # Gradio UI for temperature and other model parameters
    temperature = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, label="Temperature")  # 0.0 is invalid when sampling
    top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.9, label="Top P")
    top_k = gr.Slider(minimum=0, maximum=50, value=50, label="Top K")
    repetition_penalty = gr.Slider(minimum=1.0, maximum=2.0, value=1.1, label="Repetition Penalty")

    # Conversation history input/output
    history = gr.State([])  # conversation history: a list of [user_message, bot_response] pairs

    # Gradio function for generating responses
    def generate_response(history, temperature, top_p, top_k, repetition_penalty, model_language_value, model_id_value):
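        # NOTE: this sketch reloads the tokenizer and model on every request; cache them for real use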
        tok, ov_model, model_configuration = load_model(model_language_value, model_id_value)
        
        # Naive prompt construction: concatenates the user turns only.
        # A real chat model expects its own prompt template here.
        def convert_history_to_token(history):
            input_tokens = tok(" ".join([msg[0] for msg in history]), return_tensors="pt").input_ids
            return input_tokens
        
        input_ids = convert_history_to_token(history)
        # TextIteratorStreamer yields decoded tokens as generate() produces them
        streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)

        generate_kwargs = dict(
            input_ids=input_ids,
            max_new_tokens=256,
            do_sample=True,  # sampling must be enabled for temperature/top_p/top_k to take effect
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            repetition_penalty=repetition_penalty,
            streamer=streamer
        )
        
        event = Event()  # signals completion; not consumed here, kept as a hook for cancellation logic
        def generate_and_signal_complete():
            ov_model.generate(**generate_kwargs)
            event.set()
        
        t1 = Thread(target=generate_and_signal_complete)
        t1.start()

        partial_text = ""
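        # Append each streamed chunk to the pending bot turn and re-render the chat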
        for new_text in streamer:
            partial_text += new_text
            history[-1][1] = partial_text
            yield history

    # Chat UI wiring. A nested gr.Interface cannot reuse components that already
    # live inside this Blocks, so the chat is wired with Blocks events instead.
    chatbot = gr.Chatbot(label="Conversation History")
    msg = gr.Textbox(label="Your Message")

    def add_user_message(message, history):
        # Append the new user turn with an empty bot slot that streaming will fill
        return "", history + [[message, ""]]

    msg.submit(
        add_user_message,
        inputs=[msg, history],
        outputs=[msg, history],
    ).then(
        generate_response,
        inputs=[history, temperature, top_p, top_k, repetition_penalty, model_language, model_id],
        outputs=chatbot,
    )

# Launch Gradio app
if __name__ == "__main__":
    iface.launch(debug=True, share=True, server_name="0.0.0.0", server_port=7860)