import os
import torch
from transformers import AutoTokenizer, AutoConfig, TextIteratorStreamer
from optimum.intel.openvino import OVModelForCausalLM
import openvino as ov
import gradio as gr
from typing import List, Tuple
from threading import Thread
from gradio_helper import make_demo
from llm_config import SUPPORTED_LLM_MODELS
# Define the model configuration
model_language = "English"  # For example, set the model language to English
model_id = "qwen2.5-0.5b-instruct"  # For example, select a model ID

# Load the model configuration
model_configuration = SUPPORTED_LLM_MODELS[model_language][model_id]
pt_model_id = model_configuration["model_id"]
int4_model_dir = os.path.join(model_id, "INT4_compressed_weights")
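# The INT4 directory is expected to hold a model exported with optimum-cli
# (a sketch; the Hub model name is inferred from pt_model_id and may differ):
#
#   optimum-cli export openvino --model Qwen/Qwen2.5-0.5B-Instruct \
#       --weight-format int4 qwen2.5-0.5b-instruct/INT4_compressed_weights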
# Load the tokenizer and the OpenVINO model from the INT4 weights directory
device = "CPU"  # Or "GPU" if available
core = ov.Core()
tok = AutoTokenizer.from_pretrained(int4_model_dir, trust_remote_code=True)

ov_model = OVModelForCausalLM.from_pretrained(
    int4_model_dir,
    device=device,
    config=AutoConfig.from_pretrained(int4_model_dir, trust_remote_code=True),
    trust_remote_code=True,
)
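# Optional sanity check (a minimal sketch, not part of the original flow):
# run one short generation to confirm the compiled model responds.
test_ids = tok("Hello!", return_tensors="pt").input_ids
test_out = ov_model.generate(test_ids, max_new_tokens=20)
print(tok.decode(test_out[0], skip_special_tokens=True))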
def convert_history_to_token(history: List[Tuple[str, str]]):
    """
    Convert the conversation history to input token IDs.

    Simplified example: only the last user message is tokenized; a production
    chatbot should encode the full history with the model's chat template.
    """
    input_ids = tok.encode(history[-1][0])
    return torch.LongTensor([input_ids])
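# A fuller variant (sketch): build the prompt from the whole history with
# apply_chat_template, assuming the tokenizer ships a chat template, as the
# qwen2.5-instruct tokenizers do. The function name is illustrative.
def convert_history_to_token_with_template(history: List[Tuple[str, str]]):
    messages = []
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        if assistant_msg:  # the last turn has no assistant reply yet
            messages.append({"role": "assistant", "content": assistant_msg})
    return tok.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")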
def bot(history, temperature, top_p, top_k, repetition_penalty, conversation_id):
    """
    Generate the next assistant reply and stream it into the chat history.
    """
    input_ids = convert_history_to_token(history)
    streamer = TextIteratorStreamer(tok, timeout=3600.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        input_ids=input_ids,
        max_new_tokens=256,
        temperature=temperature,
        do_sample=temperature > 0.0,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
        streamer=streamer,
    )

    # Run generation in a background thread so that tokens can be read from
    # the streamer while they are being produced
    Thread(target=ov_model.generate, kwargs=generate_kwargs).start()

    # Stream partial output, updating the last history entry in place
    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        history[-1][1] = partial_text
        yield history
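# Usage sketch: bot() can be exercised outside the UI with a single fake turn
# (illustrative sampling values, not tuned settings):
#
#   for chunk in bot([["Hello, who are you?", ""]], temperature=0.7,
#                    top_p=0.9, top_k=50, repetition_penalty=1.1,
#                    conversation_id=None):
#       print(chunk[-1][1])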
def request_cancel():
    """Cancel the in-flight OpenVINO inference request to stop generation."""
    ov_model.request.cancel()
# Gradio UI; debug=True prints errors to the console, share=True exposes the
# demo through a public tunnel link
demo = make_demo(run_fn=bot, stop_fn=request_cancel, title="OpenVINO Chatbot", language="en")
demo.launch(debug=True, share=True)