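"""Chat tab of the experiment page in H2O LLM Studio.

Loads a trained experiment's configuration, model and tokenizer, and renders
an interactive H2O Wave chatbot card together with its chat settings.
"""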
import gc
import logging
import os

import numpy as np
import torch
from accelerate import dispatch_model, infer_auto_device_map
from accelerate.utils import get_balanced_memory
from h2o_wave import Q
from h2o_wave import data as chat_data
from h2o_wave import ui

from llm_studio.app_utils.utils import (
    get_experiments,
    get_ui_elements_for_cfg,
    set_env,
)
from llm_studio.python_configs.base import DefaultConfigProblemBase
from llm_studio.src.datasets.text_utils import get_texts, get_tokenizer
from llm_studio.src.utils.config_utils import (
    NON_GENERATION_PROBLEM_TYPES,
    load_config_yaml,
)
from llm_studio.src.utils.export_utils import get_prediction_dataframe
from llm_studio.src.utils.modeling_utils import load_checkpoint

logger = logging.getLogger(__name__)


async def chat_tab(q: Q, load_model=True):
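    """Renders the chat tab for a trained experiment.

    Loads (or reuses) the fine-tuned model and tokenizer, builds the Wave
    chatbot card with prompt suggestions, and exposes the prediction settings
    in an expandable "Chat Settings" panel.
    """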
    if not await should_start_chat(q):
        return

    if load_model:
        q.page["experiment/display/chat"] = ui.form_card(
            box="first",
            items=[ui.progress(label="Loading the model...")],
        )
    q.client["experiment/display/chat/messages"] = []
    q.client.delete_cards.add("experiment/display/chat")

    q.page["experiment/display/chat/settings"] = ui.form_card(
        box="second",
        items=[
            ui.expander(
                name="chat_settings",
                label="Chat Settings",
                items=[ui.progress(label="Loading model configuration...")],
                expanded=True,
            )
        ],
    )
    q.client.delete_cards.add("experiment/display/chat/settings")

    await q.page.save()

    logger.info(torch.cuda.memory_allocated())

    if load_model:
        with set_env(HF_TOKEN=q.client["default_huggingface_api_token"]):
            gpu_id = q.client["gpu_used_for_chat"] - 1
            cfg, model, tokenizer = load_cfg_model_tokenizer(
                q.client["experiment/display/experiment_path"], device=f"cuda:{gpu_id}"
            )
        q.client["experiment/display/chat/cfg"] = cfg
        q.client["experiment/display/chat/model"] = model
        q.client["experiment/display/chat/tokenizer"] = tokenizer
        initial_message = "Model successfully loaded, how can I help you?"
    else:
        cfg = q.client["experiment/display/chat/cfg"]
        assert q.client["experiment/display/chat/model"] is not None
        assert q.client["experiment/display/chat/tokenizer"] is not None
        initial_message = "Chat History cleaned. How can I help you?"
    # Load validation dataframe and texts
    validation_dataframe = get_prediction_dataframe(cfg.output_directory)
    if cfg.dataset.parent_id_column != "None":
        # sample and parent ids can have any dtype, such as str, int, float, etc.
        # id column can be int, while parent_id column can be float
        # (as some values are NaN) so we cast id to the same dtype
        sample_ids = (
            validation_dataframe["id"]
            .astype(validation_dataframe[cfg.dataset.parent_id_column].dtype)
            .tolist()
        )
        parent_ids = validation_dataframe[cfg.dataset.parent_id_column].tolist()

        sample_ids_set = set(sample_ids)
        # a row is a seed prompt if its parent id does not refer to another sample
        is_seed_prompt = [
            False if idx in sample_ids_set else True for idx in parent_ids
        ]
        validation_dataframe["is_seed_prompt"] = is_seed_prompt
        validation_dataframe = validation_dataframe.loc[
            validation_dataframe["is_seed_prompt"]
        ]

    validation_texts = get_texts(validation_dataframe, cfg)

    # Hide fields that should not be visible in the UI
    cfg.prediction._visibility["metric"] = -1
    cfg.prediction._visibility["batch_size_inference"] = -1
    cfg.prediction._visibility["min_length_inference"] = -1
    cfg.prediction._visibility["stop_tokens"] = -1

    logger.info(torch.cuda.memory_allocated())
    q.page["experiment/display/chat"] = ui.chatbot_card(
        box="first",
        data=chat_data(fields="content from_user", t="list"),  # type: ignore
        name="experiment/display/chat/chatbot",
        events=["stop", "suggestion"],
        suggestions=[
            ui.chat_suggestion(
                "Write a poem about H2O LLM Studio",
                label="Write a poem",
                caption="about H2O LLM Studio",
                icon="Edit",
            ),
            ui.chat_suggestion(
                "Plan a trip to Europe",
                label="Plan a trip",
                caption="to Europe",
                icon="Airplane",
            ),
            ui.chat_suggestion(
                "Give me ideas for a new project",
                label="Give me ideas",
                caption="for a new project",
                icon="Lightbulb",
            ),
            ui.chat_suggestion(
                np.random.choice(validation_texts),
                label="Random sample from validation set",
                icon="Chat",
            ),
        ],
    )
    # append the greeting as a bot message; rows are [content, from_user]
    q.page["experiment/display/chat"].data += [initial_message, False]
    option_items = get_ui_elements_for_cfg(
        cfg=q.client["experiment/display/chat/cfg"].prediction,
        q=q,
        pre="chat/cfg_predictions",
    )
    q.page["experiment/display/chat/settings"] = ui.form_card(
        box="second",
        items=[
            ui.buttons(
                [
                    ui.button(
                        name="experiment/display/chat/clear_history",
                        label="Clear History",
                        primary=True,
                    ),
                    ui.button(
                        name="experiment/display/chat/copy_chat",
                        label="Copy to clipboard",
                        primary=True,
                    ),
                ]
            ),
            ui.expander(
                name="chat_settings",
                label="Chat Settings",
                items=option_items,
                expanded=True,
            ),
        ],
    )


async def should_start_chat(q: Q):
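    """Checks whether the chat tab can be started for the current experiment.

    Renders an explanatory card and returns False if the problem type does not
    support text generation or if the selected GPU is blocked by a running
    experiment; otherwise returns True.
    """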
    cfg: DefaultConfigProblemBase = load_config_yaml(
        os.path.join(q.client["experiment/display/experiment_path"], "cfg.yaml")
    )

    if cfg.problem_type in NON_GENERATION_PROBLEM_TYPES:
        q.page["experiment/display/chat"] = ui.form_card(
            box="first",
            items=[
                ui.text(
                    "Chatbot is not available for this problem type. "
                    "Please select a text generation problem."
                )
            ],
            title="",
        )
        q.client.delete_cards.add("experiment/display/chat")
        return False

    # gpu id in UI is offset by 1 to be in sync with experiment UI
    gpu_id = q.client["gpu_used_for_chat"] - 1
    if gpu_is_blocked(q, gpu_id):
        q.page["experiment/display/chat"] = ui.form_card(
            box="first",
            items=[
                ui.text(
                    f"""Chatbot is not available when GPU{q.client["gpu_used_for_chat"]}
                    is blocked by another experiment.
                    You can change "Gpu used for Chat" in the settings tab
                    to use another GPU for the chatbot. """
                )
            ],
            title="",
        )
        q.client.delete_cards.add("experiment/display/chat")
        return False
    return True


def gpu_is_blocked(q, gpu_id):
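    """Returns True if the given GPU id is used by any running experiment.

    Experiment GPU assignments are stored as comma-separated strings
    (e.g. "0,1"), so the check is a membership test on the split list.
    """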
    experiments = get_experiments(q=q)
    running_experiments = experiments[experiments.status.isin(["running"])]
    gpu_blocked = any(
        [
            str(gpu_id) in gpu_list
            for gpu_list in running_experiments["gpu_list"]
            .apply(lambda x: x.split(","))
            .to_list()
        ]
    )
    return gpu_blocked


def load_cfg_model_tokenizer(
    experiment_path: str, merge: bool = False, device: str = "cuda:0"
):
    """Loads the model, tokenizer and configuration from the experiment path."""
    cfg = load_config_yaml(os.path.join(experiment_path, "cfg.yaml"))
    cfg.architecture.pretrained = False
    cfg.architecture.gradient_checkpointing = False
    cfg.environment._device = device.replace("_shard", "")
    cfg.environment._local_rank = 0
    cfg.prediction._visibility["num_history"] = 1

    tokenizer = get_tokenizer(cfg)

    gc.collect()
    torch.cuda.empty_cache()

    if (
        merge
        and cfg.training.lora
        and cfg.architecture.backbone_dtype in ("int4", "int8")
    ):
        # Force to float16 for merging LORA weights.
        # TODO: Could be configurable in the future to allow bfloat16.
        logger.info("Loading backbone in float16 for merging LORA weights.")
        cfg.architecture.backbone_dtype = "float16"
        cfg.architecture.pretrained = True

    # if "cpu" in device:
    #     cfg.architecture.backbone_dtype = "float32"

    with torch.device(cfg.environment._device):
        model = cfg.architecture.model_class(cfg)
        cfg.architecture.pretrained_weights = os.path.join(
            experiment_path, "checkpoint.pth"
        )
        load_checkpoint(cfg, model, strict=False)

    if device == "cpu_shard":
        max_memory = get_balanced_memory(
            model,
        )
        device_map = infer_auto_device_map(model, max_memory=max_memory)
        model = dispatch_model(
            model,
            device_map=device_map,
        )

    if merge and cfg.training.lora:
        # merges the LoRa layers into the base model.
        # This is needed if one wants to use the base model as a standalone model.
        logger.info("Merging LORA layers with base model.")
        model.backbone = model.backbone.merge_and_unload()

    model = model.eval()
    model.backbone.use_cache = True

    return cfg, model, tokenizer
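
# A minimal usage sketch (the experiment path below is hypothetical, and the
# generation call assumes the backbone is a Hugging Face causal LM exposing
# `generate`):
#
#   cfg, model, tokenizer = load_cfg_model_tokenizer(
#       "output/user/my-experiment", merge=True, device="cuda:0"
#   )
#   inputs = tokenizer("How are you?", return_tensors="pt").to(cfg.environment._device)
#   output_ids = model.backbone.generate(**inputs, max_new_tokens=32)
#   print(tokenizer.decode(output_ids[0], skip_special_tokens=True))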