import os
import torch
from transformers import AutoTokenizer, AutoConfig, TextIteratorStreamer
from optimum.intel.openvino import OVModelForCausalLM
import openvino as ov
import gradio as gr
from typing import List, Tuple
from threading import Thread
from gradio_helper import make_demo
from llm_config import SUPPORTED_LLM_MODELS
# Define the model configuration
model_language = "English"  # For example, set the model language to English
model_id = "qwen2.5-0.5b-instruct"  # For example, select a model ID

# Load the model configuration
model_configuration = SUPPORTED_LLM_MODELS[model_language][model_id]
pt_model_id = model_configuration["model_id"]
int4_model_dir = os.path.join(model_id, "INT4_compressed_weights")
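# The INT4 directory is expected to hold a model exported with optimum-cli
# (a sketch; the Hub model name is inferred from pt_model_id and may differ):
#
#   optimum-cli export openvino --model Qwen/Qwen2.5-0.5B-Instruct \
#       --weight-format int4 qwen2.5-0.5b-instruct/INT4_compressed_weights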
# Load the tokenizer and the OpenVINO model from the INT4 weights directory
device = "CPU"  # Or "GPU" if available
core = ov.Core()
tok = AutoTokenizer.from_pretrained(int4_model_dir, trust_remote_code=True)

ov_model = OVModelForCausalLM.from_pretrained(
    int4_model_dir,
    device=device,
    config=AutoConfig.from_pretrained(int4_model_dir, trust_remote_code=True),
    trust_remote_code=True,
)
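# Optional sanity check (a minimal sketch, not part of the original flow):
# run one short generation to confirm the compiled model responds.
test_ids = tok("Hello!", return_tensors="pt").input_ids
test_out = ov_model.generate(test_ids, max_new_tokens=20)
print(tok.decode(test_out[0], skip_special_tokens=True))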
def convert_history_to_token(history: List[Tuple[str, str]]):
    """
    Convert the conversation history to input token IDs.

    Simplified example: only the last user message is tokenized; a production
    chatbot should encode the full history with the model's chat template.
    """
    input_ids = tok.encode(history[-1][0])
    return torch.LongTensor([input_ids])
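# A fuller variant (sketch): build the prompt from the whole history with
# apply_chat_template, assuming the tokenizer ships a chat template, as the
# qwen2.5-instruct tokenizers do. The function name is illustrative.
def convert_history_to_token_with_template(history: List[Tuple[str, str]]):
    messages = []
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        if assistant_msg:  # the last turn has no assistant reply yet
            messages.append({"role": "assistant", "content": assistant_msg})
    return tok.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")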
def bot(history, temperature, top_p, top_k, repetition_penalty, conversation_id):
    """
    Generate the next assistant reply and stream it into the chat history.
    """
    input_ids = convert_history_to_token(history)
    streamer = TextIteratorStreamer(tok, timeout=3600.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        input_ids=input_ids,
        max_new_tokens=256,
        temperature=temperature,
        do_sample=temperature > 0.0,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
        streamer=streamer,
    )

    # Run generation in a background thread so that tokens can be read from
    # the streamer while they are being produced
    Thread(target=ov_model.generate, kwargs=generate_kwargs).start()

    # Stream partial output, updating the last history entry in place
    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        history[-1][1] = partial_text
        yield history
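# Usage sketch: bot() can be exercised outside the UI with a single fake turn
# (illustrative sampling values, not tuned settings):
#
#   for chunk in bot([["Hello, who are you?", ""]], temperature=0.7,
#                    top_p=0.9, top_k=50, repetition_penalty=1.1,
#                    conversation_id=None):
#       print(chunk[-1][1])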
def request_cancel():
    """Cancel the in-flight OpenVINO inference request to stop generation."""
    ov_model.request.cancel()
# Gradio UI; debug=True prints errors to the console, share=True exposes the
# demo through a public tunnel link
demo = make_demo(run_fn=bot, stop_fn=request_cancel, title="OpenVINO Chatbot", language="en")
demo.launch(debug=True, share=True)