import gc
import logging
import os

import numpy as np
import torch
from accelerate import dispatch_model, infer_auto_device_map
from accelerate.utils import get_balanced_memory
from h2o_wave import Q
from h2o_wave import data as chat_data
from h2o_wave import ui

from llm_studio.app_utils.utils import (
    get_experiments,
    get_ui_elements_for_cfg,
    set_env,
)
from llm_studio.python_configs.base import DefaultConfigProblemBase
from llm_studio.src.datasets.text_utils import get_texts, get_tokenizer
from llm_studio.src.utils.config_utils import (
    NON_GENERATION_PROBLEM_TYPES,
    load_config_yaml,
)
from llm_studio.src.utils.export_utils import get_prediction_dataframe
from llm_studio.src.utils.modeling_utils import load_checkpoint

logger = logging.getLogger(__name__)


async def chat_tab(q: Q, load_model=True):
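    """Renders the chat tab for a finished experiment.

    When load_model is True, the model and tokenizer are loaded from the
    experiment path onto the selected GPU; otherwise the already loaded model
    is reused, e.g. after clearing the chat history.
    """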
    if not await should_start_chat(q):
        return

    if load_model:
        q.page["experiment/display/chat"] = ui.form_card(
            box="first",
            items=[ui.progress(label="Loading the model...")],
        )

    q.client["experiment/display/chat/messages"] = []
    q.client.delete_cards.add("experiment/display/chat")

    q.page["experiment/display/chat/settings"] = ui.form_card(
        box="second",
        items=[
            ui.expander(
                name="chat_settings",
                label="Chat Settings",
                items=[ui.progress(label="Loading model configuration...")],
                expanded=True,
            )
        ],
    )
    q.client.delete_cards.add("experiment/display/chat/settings")

    await q.page.save()

    logger.info(torch.cuda.memory_allocated())
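
    # GPU ids in the UI are 1-based; load config, model and tokenizer onto the
    # selected device.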
    if load_model:
        with set_env(HF_TOKEN=q.client["default_huggingface_api_token"]):
            gpu_id = q.client["gpu_used_for_chat"] - 1
            cfg, model, tokenizer = load_cfg_model_tokenizer(
                q.client["experiment/display/experiment_path"], device=f"cuda:{gpu_id}"
            )
        q.client["experiment/display/chat/cfg"] = cfg
        q.client["experiment/display/chat/model"] = model
        q.client["experiment/display/chat/tokenizer"] = tokenizer
        initial_message = "Model successfully loaded, how can I help you?"
    else:
        cfg = q.client["experiment/display/chat/cfg"]
        assert q.client["experiment/display/chat/model"] is not None
        assert q.client["experiment/display/chat/tokenizer"] is not None
        initial_message = "Chat history cleared. How can I help you?"

    # Load validation dataframe and texts
    validation_dataframe = get_prediction_dataframe(cfg.output_directory)
    if cfg.dataset.parent_id_column != "None":
        # sample and parent ids can have any dtype, such as str, int, float, etc.
        # id column can be int, while parent_id column can be float
        # (as some values are NaN) so we cast id to the same dtype
        sample_ids = (
            validation_dataframe["id"]
            .astype(validation_dataframe[cfg.dataset.parent_id_column].dtype)
            .tolist()
        )
        parent_ids = validation_dataframe[cfg.dataset.parent_id_column].tolist()

        sample_ids_set = set(sample_ids)
        is_seed_prompt = [idx not in sample_ids_set for idx in parent_ids]
        validation_dataframe["is_seed_prompt"] = is_seed_prompt
        validation_dataframe = validation_dataframe.loc[
            validation_dataframe["is_seed_prompt"]
        ]
    validation_texts = get_texts(validation_dataframe, cfg)

    # Hide fields that should not be visible in the UI
    cfg.prediction._visibility["metric"] = -1
    cfg.prediction._visibility["batch_size_inference"] = -1
    cfg.prediction._visibility["min_length_inference"] = -1
    cfg.prediction._visibility["stop_tokens"] = -1

    logger.info(torch.cuda.memory_allocated())
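
    # Chatbot card with a few canned prompt suggestions plus a random sample
    # from the validation set.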
q.page["experiment/display/chat"] = ui.chatbot_card( | |
box="first", | |
data=chat_data(fields="content from_user", t="list"), # type: ignore | |
name="experiment/display/chat/chatbot", | |
events=["stop", "suggestion"], | |
suggestions=[ | |
ui.chat_suggestion( | |
"Write a poem about H2O LLM Studio", | |
label="Write a poem", | |
caption="about H2O LLM Studio", | |
icon="Edit", | |
), | |
ui.chat_suggestion( | |
"Plan a trip to Europe", | |
label="Plan a trip", | |
caption="to Europe", | |
icon="Airplane", | |
), | |
ui.chat_suggestion( | |
"Give me ideas for a new project", | |
label="Give me ideas", | |
caption="for a new project", | |
icon="Lightbulb", | |
), | |
ui.chat_suggestion( | |
np.random.choice(validation_texts), | |
label="Random sample from validation set", | |
icon="Chat", | |
), | |
], | |
) | |
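
    # Seed the chat with the initial bot message (from_user=False).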
q.page["experiment/display/chat"].data += [initial_message, False] | |

    option_items = get_ui_elements_for_cfg(
        cfg=q.client["experiment/display/chat/cfg"].prediction,
        q=q,
        pre="chat/cfg_predictions",
    )
    q.page["experiment/display/chat/settings"] = ui.form_card(
        box="second",
        items=[
            ui.buttons(
                [
                    ui.button(
                        name="experiment/display/chat/clear_history",
                        label="Clear History",
                        primary=True,
                    ),
                    ui.button(
                        name="experiment/display/chat/copy_chat",
                        label="Copy to clipboard",
                        primary=True,
                    ),
                ]
            ),
            ui.expander(
                name="chat_settings",
                label="Chat Settings",
                items=option_items,
                expanded=True,
            ),
        ],
    )


async def should_start_chat(q: Q):
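    """Checks whether the chat tab can be shown for the current experiment.

    Renders an explanatory card and returns False if the problem type does not
    support text generation or if the selected GPU is blocked by a running
    experiment; returns True otherwise.
    """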
    cfg: DefaultConfigProblemBase = load_config_yaml(
        os.path.join(q.client["experiment/display/experiment_path"], "cfg.yaml")
    )
    if cfg.problem_type in NON_GENERATION_PROBLEM_TYPES:
        q.page["experiment/display/chat"] = ui.form_card(
            box="first",
            items=[
                ui.text(
                    "Chatbot is not available for this problem type. "
                    "Please select a text generation problem."
                )
            ],
            title="",
        )
        q.client.delete_cards.add("experiment/display/chat")
        return False

    # gpu id in UI is offset by 1 to be in sync with experiment UI
    gpu_id = q.client["gpu_used_for_chat"] - 1
    if gpu_is_blocked(q, gpu_id):
        q.page["experiment/display/chat"] = ui.form_card(
            box="first",
            items=[
                ui.text(
                    f"""Chatbot is not available when GPU{q.client["gpu_used_for_chat"]}
                    is blocked by another experiment.
                    You can change "Gpu used for Chat" in the settings tab
                    to use another GPU for the chatbot. """
                )
            ],
            title="",
        )
        q.client.delete_cards.add("experiment/display/chat")
        return False
    return True


def gpu_is_blocked(q, gpu_id):
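    """Checks whether the given GPU is used by any running experiment."""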
    experiments = get_experiments(q=q)
    running_experiments = experiments[experiments.status.isin(["running"])]
    gpu_blocked = any(
        str(gpu_id) in gpu_list
        for gpu_list in running_experiments["gpu_list"]
        .apply(lambda x: x.split(","))
        .to_list()
    )
    return gpu_blocked


def load_cfg_model_tokenizer(
    experiment_path: str, merge: bool = False, device: str = "cuda:0"
):
    """Loads the model, tokenizer and configuration from the experiment path.

    Args:
        experiment_path: path to the experiment output directory.
        merge: whether to merge the LoRA weights into the base model.
        device: target device, e.g. "cuda:0", "cpu" or "cpu_shard" to shard
            the model across the available devices.
    """
    cfg = load_config_yaml(os.path.join(experiment_path, "cfg.yaml"))
    cfg.architecture.pretrained = False
    cfg.architecture.gradient_checkpointing = False
    cfg.environment._device = device.replace("_shard", "")
    cfg.environment._local_rank = 0
    cfg.prediction._visibility["num_history"] = 1

    tokenizer = get_tokenizer(cfg)

    gc.collect()
    torch.cuda.empty_cache()

    if (
        merge
        and cfg.training.lora
        and cfg.architecture.backbone_dtype in ("int4", "int8")
    ):
        # Force to float16 for merging LoRA weights.
        # TODO: Could be configurable in the future to allow bfloat16.
        logger.info("Loading backbone in float16 for merging LORA weights.")
        cfg.architecture.backbone_dtype = "float16"
        cfg.architecture.pretrained = True

    # if "cpu" in device:
    #     cfg.architecture.backbone_dtype = "float32"

    with torch.device(cfg.environment._device):
        model = cfg.architecture.model_class(cfg)
        cfg.architecture.pretrained_weights = os.path.join(
            experiment_path, "checkpoint.pth"
        )
        load_checkpoint(cfg, model, strict=False)
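
    # "cpu_shard" spreads the model across the available devices via accelerate.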
    if device == "cpu_shard":
        max_memory = get_balanced_memory(
            model,
        )
        device_map = infer_auto_device_map(model, max_memory=max_memory)
        model = dispatch_model(
            model,
            device_map=device_map,
        )

    if merge and cfg.training.lora:
        # Merges the LoRA layers into the base model.
        # This is needed if one wants to use the base model as a standalone model.
        logger.info("Merging LORA layers with base model.")
        model.backbone = model.backbone.merge_and_unload()

    model = model.eval()
    model.backbone.use_cache = True
    return cfg, model, tokenizer
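

# Example usage outside of the Wave UI (sketch; the experiment path below is a
# placeholder and must point to a finished experiment directory):
#
#     cfg, model, tokenizer = load_cfg_model_tokenizer(
#         "output/user/my-experiment", merge=True, device="cuda:0"
#     )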