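"""
Minimal OpenVINO chatbot demo: loads an INT4-compressed LLM with
optimum-intel, streams generated tokens, and serves them through a Gradio UI.
"""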
import os
import torch
from transformers import AutoTokenizer, AutoConfig, TextIteratorStreamer
from optimum.intel.openvino import OVModelForCausalLM
import openvino as ov
import gradio as gr
from typing import List, Tuple
from threading import Thread
from gradio_helper import make_demo
from llm_config import SUPPORTED_LLM_MODELS

# Define the model configuration
model_language = "English"  # e.g., use the English model group
model_id = "qwen2.5-0.5b-instruct"  # e.g., pick one of the supported model IDs

# Load model configuration
model_configuration = SUPPORTED_LLM_MODELS[model_language][model_id]
pt_model_id = model_configuration["model_id"]
int4_model_dir = os.path.join(model_id, "INT4_compressed_weights")
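# The INT4 weights directory is assumed to have been created beforehand, e.g.
# with optimum-intel's weight-only quantization (a one-off export sketch):
#
#     from optimum.intel import OVWeightQuantizationConfig
#     OVModelForCausalLM.from_pretrained(
#         pt_model_id,
#         export=True,
#         quantization_config=OVWeightQuantizationConfig(bits=4),
#     ).save_pretrained(int4_model_dir)
#     AutoTokenizer.from_pretrained(pt_model_id).save_pretrained(int4_model_dir)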

# Select the inference device and load the tokenizer
device = "CPU"  # Or "GPU" if one is available
core = ov.Core()  # OpenVINO runtime core; core.available_devices lists options
tok = AutoTokenizer.from_pretrained(int4_model_dir, trust_remote_code=True)

# Load the OpenVINO model
ov_model = OVModelForCausalLM.from_pretrained(
    int4_model_dir,
    device=device,
    config=AutoConfig.from_pretrained(int4_model_dir, trust_remote_code=True),
    trust_remote_code=True,
)
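# Optionally, OpenVINO runtime properties can be passed at load time, e.g.
# ov_config={"PERFORMANCE_HINT": "LATENCY", "CACHE_DIR": ""} (a sketch;
# supported properties vary by device and OpenVINO version).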

def convert_history_to_token(history: List[Tuple[str, str]]):
    """
    Converts the conversation history to input token IDs.
    Simplified version: only the latest user message is encoded.
    """
    input_ids = tok.encode(history[-1][0])
    return torch.LongTensor([input_ids])
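
# A fuller variant (a sketch, assuming the tokenizer ships a chat template, as
# most instruct models do) would encode the whole conversation rather than only
# the last user turn:
def convert_history_to_token_with_template(history: List[Tuple[str, str]]):
    messages = []
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        if assistant_msg:  # the turn being generated has no assistant reply yet
            messages.append({"role": "assistant", "content": assistant_msg})
    return tok.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")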

def bot(history, temperature, top_p, top_k, repetition_penalty, conversation_id):
    """
    Generates the next part of the conversation.
    """
    input_ids = convert_history_to_token(history)
    streamer = TextIteratorStreamer(tok, timeout=3600.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        input_ids=input_ids,
        max_new_tokens=256,
        temperature=temperature,
        do_sample=temperature > 0.0,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
        streamer=streamer,
    )
    
    # Run generation in a background thread so the streamer below can yield
    # partial results while tokens are still being produced
    Thread(target=ov_model.generate, kwargs=generate_kwargs).start()
    
    # Stream and update history
    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        history[-1][1] = partial_text
        yield history

def request_cancel():
    # Cancel the in-flight inference request to stop generation early
    ov_model.request.cancel()

# Gradio UI
demo = make_demo(run_fn=bot, stop_fn=request_cancel, title="OpenVINO Chatbot", language="en")
demo.launch(debug=True, share=True)
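# Note: share=True asks Gradio to create a temporary public URL; drop it (and
# debug=True) for a local-only or production deployment.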