import gc
import logging
import os

import numpy as np
import torch
from accelerate import dispatch_model, infer_auto_device_map
from accelerate.utils import get_balanced_memory
from h2o_wave import Q
from h2o_wave import data as chat_data
from h2o_wave import ui

from llm_studio.app_utils.utils import (
    get_experiments,
    get_ui_elements_for_cfg,
    set_env,
)
from llm_studio.python_configs.base import DefaultConfigProblemBase
from llm_studio.src.datasets.text_utils import get_texts, get_tokenizer
from llm_studio.src.utils.config_utils import (
    NON_GENERATION_PROBLEM_TYPES,
    load_config_yaml,
)
from llm_studio.src.utils.export_utils import get_prediction_dataframe
from llm_studio.src.utils.modeling_utils import load_checkpoint

logger = logging.getLogger(__name__)


async def chat_tab(q: Q, load_model=True):
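    """Renders the chat tab for a finished experiment.

    When load_model is True, the model and tokenizer are loaded from the
    experiment path onto the selected GPU; otherwise the already loaded model
    is reused, e.g. after clearing the chat history.
    """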
    if not await should_start_chat(q):
        return

    if load_model:
        q.page["experiment/display/chat"] = ui.form_card(
            box="first",
            items=[ui.progress(label="Loading the model...")],
        )

    q.client["experiment/display/chat/messages"] = []
    q.client.delete_cards.add("experiment/display/chat")

    q.page["experiment/display/chat/settings"] = ui.form_card(
        box="second",
        items=[
            ui.expander(
                name="chat_settings",
                label="Chat Settings",
                items=[ui.progress(label="Loading model configuration...")],
                expanded=True,
            )
        ],
    )
    q.client.delete_cards.add("experiment/display/chat/settings")

    await q.page.save()

    logger.info(torch.cuda.memory_allocated())
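
    # GPU ids in the UI are 1-based; load config, model and tokenizer onto the
    # selected device.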
    if load_model:
        with set_env(HF_TOKEN=q.client["default_huggingface_api_token"]):
            gpu_id = q.client["gpu_used_for_chat"] - 1
            cfg, model, tokenizer = load_cfg_model_tokenizer(
                q.client["experiment/display/experiment_path"], device=f"cuda:{gpu_id}"
            )
        q.client["experiment/display/chat/cfg"] = cfg
        q.client["experiment/display/chat/model"] = model
        q.client["experiment/display/chat/tokenizer"] = tokenizer
        initial_message = "Model successfully loaded, how can I help you?"
    else:
        cfg = q.client["experiment/display/chat/cfg"]
        assert q.client["experiment/display/chat/model"] is not None
        assert q.client["experiment/display/chat/tokenizer"] is not None
        initial_message = "Chat history cleared. How can I help you?"

    # Load validation dataframe and texts
    validation_dataframe = get_prediction_dataframe(cfg.output_directory)
    if cfg.dataset.parent_id_column != "None":
        # sample and parent ids can have any dtype, such as str, int, float, etc.
        # id column can be int, while parent_id column can be float
        # (as some values are NaN) so we cast id to the same dtype
        sample_ids = (
            validation_dataframe["id"]
            .astype(validation_dataframe[cfg.dataset.parent_id_column].dtype)
            .tolist()
        )
        parent_ids = validation_dataframe[cfg.dataset.parent_id_column].tolist()

        sample_ids_set = set(sample_ids)
        is_seed_prompt = [idx not in sample_ids_set for idx in parent_ids]
        validation_dataframe["is_seed_prompt"] = is_seed_prompt
        validation_dataframe = validation_dataframe.loc[
            validation_dataframe["is_seed_prompt"]
        ]
    validation_texts = get_texts(validation_dataframe, cfg)

    # Hide fields that should not be visible in the UI
    cfg.prediction._visibility["metric"] = -1
    cfg.prediction._visibility["batch_size_inference"] = -1
    cfg.prediction._visibility["min_length_inference"] = -1
    cfg.prediction._visibility["stop_tokens"] = -1

    logger.info(torch.cuda.memory_allocated())
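
    # Chatbot card with a few canned prompt suggestions plus a random sample
    # from the validation set.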
q.page["experiment/display/chat"] = ui.chatbot_card( | |
box="first", | |
data=chat_data(fields="content from_user", t="list"), # type: ignore | |
name="experiment/display/chat/chatbot", | |
events=["stop", "suggestion"], | |
suggestions=[ | |
ui.chat_suggestion( | |
"Write a poem about H2O LLM Studio", | |
label="Write a poem", | |
caption="about H2O LLM Studio", | |
icon="Edit", | |
), | |
ui.chat_suggestion( | |
"Plan a trip to Europe", | |
label="Plan a trip", | |
caption="to Europe", | |
icon="Airplane", | |
), | |
ui.chat_suggestion( | |
"Give me ideas for a new project", | |
label="Give me ideas", | |
caption="for a new project", | |
icon="Lightbulb", | |
), | |
ui.chat_suggestion( | |
np.random.choice(validation_texts), | |
label="Random sample from validation set", | |
icon="Chat", | |
), | |
], | |
) | |
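
    # Seed the chat with the initial bot message (from_user=False).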
q.page["experiment/display/chat"].data += [initial_message, False] | |

    option_items = get_ui_elements_for_cfg(
        cfg=q.client["experiment/display/chat/cfg"].prediction,
        q=q,
        pre="chat/cfg_predictions",
    )
    q.page["experiment/display/chat/settings"] = ui.form_card(
        box="second",
        items=[
            ui.buttons(
                [
                    ui.button(
                        name="experiment/display/chat/clear_history",
                        label="Clear History",
                        primary=True,
                    ),
                    ui.button(
                        name="experiment/display/chat/copy_chat",
                        label="Copy to clipboard",
                        primary=True,
                    ),
                ]
            ),
            ui.expander(
                name="chat_settings",
                label="Chat Settings",
                items=option_items,
                expanded=True,
            ),
        ],
    )


async def should_start_chat(q: Q):
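    """Checks whether the chat tab can be shown for the current experiment.

    Renders an explanatory card and returns False if the problem type does not
    support text generation or if the selected GPU is blocked by a running
    experiment; returns True otherwise.
    """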
    cfg: DefaultConfigProblemBase = load_config_yaml(
        os.path.join(q.client["experiment/display/experiment_path"], "cfg.yaml")
    )
    if cfg.problem_type in NON_GENERATION_PROBLEM_TYPES:
        q.page["experiment/display/chat"] = ui.form_card(
            box="first",
            items=[
                ui.text(
                    "Chatbot is not available for this problem type. "
                    "Please select a text generation problem."
                )
            ],
            title="",
        )
        q.client.delete_cards.add("experiment/display/chat")
        return False

    # gpu id in UI is offset by 1 to be in sync with experiment UI
    gpu_id = q.client["gpu_used_for_chat"] - 1
    if gpu_is_blocked(q, gpu_id):
        q.page["experiment/display/chat"] = ui.form_card(
            box="first",
            items=[
                ui.text(
                    f"""Chatbot is not available when GPU{q.client["gpu_used_for_chat"]}
                    is blocked by another experiment.
                    You can change "Gpu used for Chat" in the settings tab
                    to use another GPU for the chatbot. """
                )
            ],
            title="",
        )
        q.client.delete_cards.add("experiment/display/chat")
        return False
    return True


def gpu_is_blocked(q, gpu_id):
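    """Checks whether the given GPU is used by any running experiment."""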
    experiments = get_experiments(q=q)
    running_experiments = experiments[experiments.status.isin(["running"])]
    gpu_blocked = any(
        str(gpu_id) in gpu_list
        for gpu_list in running_experiments["gpu_list"]
        .apply(lambda x: x.split(","))
        .to_list()
    )
    return gpu_blocked


def load_cfg_model_tokenizer(
    experiment_path: str, merge: bool = False, device: str = "cuda:0"
):
    """Loads the model, tokenizer and configuration from the experiment path.

    Args:
        experiment_path: path to the experiment output directory.
        merge: whether to merge the LoRA weights into the base model.
        device: target device, e.g. "cuda:0", "cpu" or "cpu_shard" to shard
            the model across the available devices.
    """
    cfg = load_config_yaml(os.path.join(experiment_path, "cfg.yaml"))
    cfg.architecture.pretrained = False
    cfg.architecture.gradient_checkpointing = False
    cfg.environment._device = device.replace("_shard", "")
    cfg.environment._local_rank = 0
    cfg.prediction._visibility["num_history"] = 1

    tokenizer = get_tokenizer(cfg)

    gc.collect()
    torch.cuda.empty_cache()

    if (
        merge
        and cfg.training.lora
        and cfg.architecture.backbone_dtype in ("int4", "int8")
    ):
        # Force to float16 for merging LoRA weights.
        # TODO: Could be configurable in the future to allow bfloat16.
        logger.info("Loading backbone in float16 for merging LORA weights.")
        cfg.architecture.backbone_dtype = "float16"
        cfg.architecture.pretrained = True

    # if "cpu" in device:
    #     cfg.architecture.backbone_dtype = "float32"

    with torch.device(cfg.environment._device):
        model = cfg.architecture.model_class(cfg)
        cfg.architecture.pretrained_weights = os.path.join(
            experiment_path, "checkpoint.pth"
        )
        load_checkpoint(cfg, model, strict=False)
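
    # "cpu_shard" spreads the model across the available devices via accelerate.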
    if device == "cpu_shard":
        max_memory = get_balanced_memory(
            model,
        )
        device_map = infer_auto_device_map(model, max_memory=max_memory)
        model = dispatch_model(
            model,
            device_map=device_map,
        )

    if merge and cfg.training.lora:
        # Merges the LoRA layers into the base model.
        # This is needed if one wants to use the base model as a standalone model.
        logger.info("Merging LORA layers with base model.")
        model.backbone = model.backbone.merge_and_unload()

    model = model.eval()
    model.backbone.use_cache = True
    return cfg, model, tokenizer
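

# Example usage outside of the Wave UI (sketch; the experiment path below is a
# placeholder and must point to a finished experiment directory):
#
#     cfg, model, tokenizer = load_cfg_model_tokenizer(
#         "output/user/my-experiment", merge=True, device="cuda:0"
#     )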