# llm-chatbot / app.py
import os
import torch
import gradio as gr
from pathlib import Path
from transformers import AutoConfig, AutoTokenizer, TextIteratorStreamer
from optimum.intel.openvino import OVModelForCausalLM
from typing import List, Tuple
from threading import Event, Thread
from gradio_helper import make_demo # Your helper function for Gradio demo
from llm_config import SUPPORTED_LLM_MODELS # Model configuration
from notebook_utils import device_widget # Device selection utility
import openvino as ov
import openvino.properties as props
import openvino.properties.hint as hints
import openvino.properties.streams as streams
import requests

# Convert the selected model to INT4 weights with optimum-cli (same logic as in the notebook)
def convert_to_int4(model_id, model_configuration, enable_awq=False):
    compression_configs = {
        "qwen2.5-0.5b-instruct": {"sym": True, "group_size": 128, "ratio": 1.0},
        "default": {"sym": False, "group_size": 128, "ratio": 0.8},
    }
    model_compression_params = compression_configs.get(model_id, compression_configs["default"])

    # Reuse the compressed weights if they have already been exported
    int4_model_dir = Path(model_id) / "INT4_compressed_weights"
    if (int4_model_dir / "openvino_model.xml").exists():
        return int4_model_dir

    remote_code = model_configuration.get("remote_code", False)
    export_command_base = f"optimum-cli export openvino --model {model_configuration['model_id']} --task text-generation-with-past --weight-format int4"
    int4_compression_args = f" --group-size {model_compression_params['group_size']} --ratio {model_compression_params['ratio']}"
    if model_compression_params["sym"]:
        int4_compression_args += " --sym"
    if enable_awq:
        int4_compression_args += " --awq --dataset wikitext2 --num-samples 128"
    export_command_base += int4_compression_args
    if remote_code:
        export_command_base += " --trust-remote-code"
    export_command = export_command_base + f" {str(int4_model_dir)}"

    # Execute the export command (shell command)
    os.system(export_command)
    return int4_model_dir
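
# For illustration: with the "qwen2.5-0.5b-instruct" settings above, the assembled command
# would look roughly like the following (the Hub id "Qwen/Qwen2.5-0.5B-Instruct" is an
# assumption about what SUPPORTED_LLM_MODELS maps this model_id to in llm_config):
#
#   optimum-cli export openvino --model Qwen/Qwen2.5-0.5B-Instruct \
#       --task text-generation-with-past --weight-format int4 \
#       --group-size 128 --ratio 1.0 --sym \
#       qwen2.5-0.5b-instruct/INT4_compressed_weights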

# Model and tokenizer loading
def load_model(model_dir, device):
    ov_config = {hints.performance_mode(): hints.PerformanceMode.LATENCY, streams.num(): "1", props.cache_dir(): ""}
    core = ov.Core()
    tok = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
    ov_model = OVModelForCausalLM.from_pretrained(
        model_dir,
        device=device,
        ov_config=ov_config,
        config=AutoConfig.from_pretrained(model_dir, trust_remote_code=True),
        trust_remote_code=True,
    )
    return ov_model, tok
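
# The bot() function below relies on two helpers, convert_history_to_token and
# text_processor, that live in the original OpenVINO notebook rather than in this file.
# A minimal sketch of both is given here, assuming the tokenizer exposes a chat template,
# that history is a list of [user, assistant] pairs, and that the module-level `tok`
# has been set by load_model_on_select(); the notebook's prompt format may differ per model.
def convert_history_to_token(history: List[Tuple[str, str]]):
    # Build a chat-format prompt from the conversation history and tokenize it
    messages = []
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    return tok.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")


def text_processor(partial_text: str, new_text: str) -> str:
    # Accumulate streamed tokens into the text shown in the chat window
    return partial_text + new_text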

# Gradio callback for bot interaction: stream the model's reply for the latest turn in `history`.
# Uses the module-level ov_model and tok set by load_model_on_select().
def bot(history, temperature, top_p, top_k, repetition_penalty, conversation_id=None):
    input_ids = convert_history_to_token(history)
    if input_ids.shape[1] > 2000:
        history = [history[-1]]  # Keep only the latest turn to limit the prompt size
        input_ids = convert_history_to_token(history)

    streamer = TextIteratorStreamer(tok, timeout=3600.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        input_ids=input_ids,
        max_new_tokens=256,
        temperature=temperature,
        do_sample=temperature > 0.0,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
        streamer=streamer,
    )

    stream_complete = Event()

    # Generate the response in a separate thread so partial text can be streamed
    def generate_and_signal_complete():
        ov_model.generate(**generate_kwargs)
        stream_complete.set()

    t1 = Thread(target=generate_and_signal_complete)
    t1.start()

    # Process partial text and yield the updated history as tokens arrive
    partial_text = ""
    for new_text in streamer:
        partial_text = text_processor(partial_text, new_text)
        history[-1][1] = partial_text
        yield history
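
# Example of the history structure bot() expects and yields (a hypothetical exchange):
#   history = [["What is OpenVINO?", ""]]            # last reply starts empty
#   for updated in bot(history, 0.7, 0.9, 50, 1.0):
#       print(updated[-1][1])                        # progressively longer reply text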

# Define the Gradio interface for user interaction
def create_gradio_interface():
    # Dropdowns for selecting the model language and model ID
    model_languages = list(SUPPORTED_LLM_MODELS.keys())  # Available model languages
    model_language = gr.Dropdown(choices=model_languages, value=model_languages[0], label="Model Language")

    # Once a model language is selected, show the model IDs available for it
    # (Dropdown.update and the event wiring below follow the Gradio 3.x API)
    def update_model_ids(model_language):
        model_ids = list(SUPPORTED_LLM_MODELS[model_language].keys())
        return gr.Dropdown.update(choices=model_ids, value=model_ids[0])

    default_model_ids = list(SUPPORTED_LLM_MODELS[model_languages[0]].keys())
    model_id = gr.Dropdown(choices=default_model_ids, value=default_model_ids[0], label="Model ID")
    model_language.change(update_model_ids, inputs=model_language, outputs=model_id)

    # Checkbox for enabling AWQ during INT4 weight compression
    enable_awq = gr.Checkbox(value=False, label="Enable AWQ for Compression")

    # Convert and load the selected model, storing it in the module-level globals used by bot()
    def load_model_on_select(model_language, model_id, enable_awq):
        global ov_model, tok
        model_configuration = SUPPORTED_LLM_MODELS[model_language][model_id]
        int4_model_dir = convert_to_int4(model_id, model_configuration, enable_awq)
        # Load the model and tokenizer
        device = device_widget("CPU")  # or any other device supported by OpenVINO
        ov_model, tok = load_model(int4_model_dir, device)
        return f"Loaded {model_id}"

    # Connect the model selection UI so the model is converted and loaded on demand
    load_button = gr.Button("Load Model")
    model_status = gr.Textbox(label="Model Status")
    load_button.click(load_model_on_select, inputs=[model_language, model_id, enable_awq], outputs=[model_status])

    # Chatbot component and generation parameters
    chatbot = gr.Chatbot()
    temperature = gr.Slider(minimum=0, maximum=1, step=0.1, label="Temperature", value=0.7)
    top_p = gr.Slider(minimum=0, maximum=1, step=0.1, label="Top-p", value=0.9)
    top_k = gr.Slider(minimum=0, maximum=50, step=1, label="Top-k", value=50)
    repetition_penalty = gr.Slider(minimum=0, maximum=2, step=0.1, label="Repetition Penalty", value=1.0)

    # Assemble the Gradio interface around bot()
    demo = gr.Interface(
        fn=bot,
        inputs=[chatbot, temperature, top_p, top_k, repetition_penalty],
        outputs=[chatbot],
        title="OpenVINO Chatbot",
        live=True,
    )
    return demo

# Run the Gradio app
if __name__ == "__main__":
    app = create_gradio_interface()
    app.launch(debug=True, share=True)  # share=True for public access
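
# Suggested dependencies for running this app (package list is an assumption; pin versions as needed):
#   pip install "optimum[openvino]" transformers torch gradio
# gradio_helper.py, llm_config.py and notebook_utils.py are expected alongside this file,
# as in the OpenVINO LLM chatbot notebook.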