# Databricks Dolly LLMs

import gradio as gr
from transformers import pipeline
import torch
from models.modeling_moss import MossForCausalLM
from models.tokenization_moss import MossTokenizer
from models.configuration_moss import MossConfig
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Hugging Face Hub identifier of the checkpoint loaded below.
model_path = "fnlp/moss-moon-003-sft-int4"

# Disabled loading alternatives, kept for reference only.
#
# (1) transformers pipeline:
#   nstruct_pipeline_3b = pipeline(model="fnlp/moss-moon-003-sft-int4", torch_dtype=torch.float,
#                                  trust_remote_code=True, device_map="auto")
#
# (2) accelerate empty-weights init + checkpoint dispatch:
#   config = MossConfig.from_pretrained(model_path)
#   tokenizer = MossTokenizer.from_pretrained(model_path)
#
#   with init_empty_weights():
#     raw_model = MossForCausalLM._from_config(config, torch_dtype=torch.float)
#   raw_model.tie_weights()
#   model = load_checkpoint_and_dispatch(
#     raw_model, checkpoint=model_path, device_map="balanced_low_0",
#     no_split_module_classes=["MossBlock"], dtype=torch.float,
#     offload_folder="offload_folder"
#   )

# Active path: load the tokenizer and the causal LM through the Auto* classes.
# trust_remote_code=True is passed to both loaders.
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
# Cast the weights to float32, then switch the module to evaluation mode.
model = model.float()
model = model.eval()


def generate(query, temperature, top_p, top_k, max_new_tokens):
  """Generate a completion for ``query`` with the module-level model.

  The prompt is tokenised with the module-level ``tokenizer``, passed to
  ``model.generate`` with the sampling settings as keyword arguments, and
  only the newly generated tokens are decoded back to text.

  Args:
    query: Prompt text typed by the user.
    temperature: Sampling temperature. Values <= 0 select greedy decoding,
      because sampling needs a strictly positive temperature.
    top_p: Nucleus-sampling probability mass (used only when sampling).
    top_k: Size of the top-k shortlist (used only when sampling).
    max_new_tokens: Upper bound on the number of generated tokens.

  Returns:
    A 2-tuple holding the same answer string twice. The UI in this file
    binds this callback to two Markdown output components, so one value is
    returned per component. Both entries are empty strings when the prompt
    is blank or ``max_new_tokens`` is not positive.
  """
  # Sliders may deliver floats; generation needs an integer token budget.
  max_new_tokens = int(max_new_tokens)
  if not query or not query.strip() or max_new_tokens <= 0:
    return "", ""

  # Tokenise and place the tensors on the same device as the model weights.
  inputs = tokenizer(query, return_tensors="pt").to(model.device)

  if temperature > 0:
    decoding = {
      "do_sample": True,
      "temperature": float(temperature),
      "top_p": float(top_p),
      "top_k": int(top_k),
    }
  else:
    # Temperature 0 is reachable from the slider; fall back to greedy decoding.
    decoding = {"do_sample": False}

  # Inference only: skip autograd bookkeeping.
  with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens, **decoding)

  # The returned sequence starts with the prompt tokens; drop them before decoding.
  prompt_length = inputs["input_ids"].shape[1]
  answer = tokenizer.decode(output_ids[0][prompt_length:], skip_special_tokens=True)
  return answer, answer


# Introductory text rendered at the top of the page.
_INTRO_MARKDOWN = """<h1><center>Databricks Dolly LLMs</center></h1>

    This demo compares the smaller two variants of the Databricks Dolly models, the [2.8B](https://huggingface.co/databricks/dolly-v2-3b), and the [6.9B](https://huggingface.co/databricks/dolly-v2-7b). They are all based on the EluetherAI's Pythia models fine-tuned with approx [15K instruction demonstrations](https://huggingface.co/datasets/HuggingFaceH4/databricks_dolly_15k)
"""

# Keyword arguments for each generation slider, in the order they are created:
# temperature, top-p, top-k, max new tokens.
_SLIDER_SPECS = (
  {
    "label": "Temperature",
    "value": 0.5,
    "minimum": 0.0,
    "maximum": 2.0,
    "step": 0.1,
    "interactive": True,
    "info": "Higher values produce more diverse outputs",
  },
  {
    "label": "Top-p (nucleus sampling)",
    "value": 0.95,
    "minimum": 0.0,
    "maximum": 1,
    "step": 0.05,
    "interactive": True,
    "info": "Higher values sample fewer low-probability tokens",
  },
  {
    "label": "Top-k",
    "value": 50,
    "minimum": 0.0,
    "maximum": 100,
    "step": 1,
    "interactive": True,
    "info": "Sample from a shortlist of top-k tokens",
  },
  {
    "label": "Maximum new tokens",
    "value": 256,
    "minimum": 0,
    "maximum": 2048,
    "step": 5,
    "interactive": True,
    "info": "The maximum number of new tokens to generate",
  },
)

# Headings of the answer panels, one column each.
# (A third "**Dolly 12B**" panel is disabled in this layout.)
_PANEL_TITLES = ("**Dolly 3B**", "**Dolly 7B**")

with gr.Blocks() as demo:
  gr.Markdown(_INTRO_MARKDOWN)

  # Input area: question box on top, then a row of sliders plus the button.
  with gr.Row(), gr.Column():
    with gr.Row():
      instruction = gr.Textbox(placeholder="Enter your question here", label="Question", elem_id="q-input")
    with gr.Row():
      sliders = []
      for slider_kwargs in _SLIDER_SPECS:
        # Each slider sits in its own column, wrapped in a row.
        with gr.Column(), gr.Row():
          sliders.append(gr.Slider(**slider_kwargs))
      temperature, top_p, top_k, max_new_tokens = sliders
      with gr.Row():
        submit = gr.Button("Generate Answers")

  # Output area: one boxed column per panel title, heading above an empty
  # Markdown component that receives the callback result.
  with gr.Row():
    panels = []
    for title in _PANEL_TITLES:
      with gr.Column(), gr.Box():
        gr.Markdown(title)
        panels.append(gr.Markdown())
    output_3b, output_7b = panels

  # Clicking the button and pressing Enter in the textbox both run `generate`
  # with the same inputs and outputs.
  for register in (submit.click, instruction.submit):
    register(
      generate,
      inputs=[instruction, temperature, top_p, top_k, max_new_tokens],
      outputs=[output_3b, output_7b],
    )

demo.launch()