import gradio as gr
from transformers import pipeline
import torch
from models.modeling_moss import MossForCausalLM
from models.tokenization_moss import MossTokenizer
from models.configuration_moss import MossConfig
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
from transformers import AutoTokenizer, AutoModelForCausalLM

# instruct_pipeline_3b = pipeline(model="fnlp/moss-moon-003-sft-int4", torch_dtype=torch.float,
#                                 trust_remote_code=True, device_map="auto")

model_path = "fnlp/moss-moon-003-sft-int4"

# Alternative loading path: materialize the model on the meta device, then
# dispatch the checkpoint across available devices with accelerate.
# config = MossConfig.from_pretrained(model_path)
# tokenizer = MossTokenizer.from_pretrained(model_path)
#
# with init_empty_weights():
#     raw_model = MossForCausalLM._from_config(config, torch_dtype=torch.float)
# raw_model.tie_weights()
# model = load_checkpoint_and_dispatch(
#     raw_model, checkpoint=model_path, device_map="balanced_low_0",
#     no_split_module_classes=["MossBlock"], dtype=torch.float,
#     offload_folder="offload_folder",
# )

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True).float()
model = model.eval()


def generate(query, temperature, top_p, top_k, max_new_tokens):
    """Generate a response for `query` and return one string per output panel."""
    inputs = tokenizer(query, return_tensors="pt")
    with torch.no_grad():
        output_ids = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            top_k=int(top_k),
            max_new_tokens=int(max_new_tokens),
        )
    # Strip the prompt tokens and decode only the newly generated text.
    response = tokenizer.decode(
        output_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
    )
    # Only a single model is loaded, so both output panels receive the same text.
    return response, response


with gr.Blocks() as demo:
    gr.Markdown(
        """

# Databricks Dolly LLMs

This demo compares the two smaller variants of the Databricks Dolly models: the [2.8B](https://huggingface.co/databricks/dolly-v2-3b) and the [6.9B](https://huggingface.co/databricks/dolly-v2-7b). Both are based on EleutherAI's Pythia models, fine-tuned on approximately [15K instruction demonstrations](https://huggingface.co/datasets/HuggingFaceH4/databricks_dolly_15k).
"""
    )
    with gr.Row():
        with gr.Column():
            with gr.Row():
                instruction = gr.Textbox(
                    placeholder="Enter your question here",
                    label="Question",
                    elem_id="q-input",
                )
            with gr.Row():
                with gr.Column():
                    with gr.Row():
                        temperature = gr.Slider(
                            label="Temperature",
                            value=0.5,
                            minimum=0.0,
                            maximum=2.0,
                            step=0.1,
                            interactive=True,
                            info="Higher values produce more diverse outputs",
                        )
                with gr.Column():
                    with gr.Row():
                        top_p = gr.Slider(
                            label="Top-p (nucleus sampling)",
                            value=0.95,
                            minimum=0.0,
                            maximum=1.0,
                            step=0.05,
                            interactive=True,
                            info="Higher values sample fewer low-probability tokens",
                        )
                with gr.Column():
                    with gr.Row():
                        top_k = gr.Slider(
                            label="Top-k",
                            value=50,
                            minimum=0,
                            maximum=100,
                            step=1,
                            interactive=True,
                            info="Sample from a shortlist of the top-k tokens",
                        )
                with gr.Column():
                    with gr.Row():
                        max_new_tokens = gr.Slider(
                            label="Maximum new tokens",
                            value=256,
                            minimum=0,
                            maximum=2048,
                            step=5,
                            interactive=True,
                            info="The maximum number of new tokens to generate",
                        )
    with gr.Row():
        submit = gr.Button("Generate Answers")
    with gr.Row():
        with gr.Column():
            with gr.Box():
                gr.Markdown("**Dolly 3B**")
                output_3b = gr.Markdown()
        with gr.Column():
            with gr.Box():
                gr.Markdown("**Dolly 7B**")
                output_7b = gr.Markdown()
        # with gr.Column():
        #     with gr.Box():
        #         gr.Markdown("**Dolly 12B**")
        #         output_12b = gr.Markdown()

    submit.click(
        generate,
        inputs=[instruction, temperature, top_p, top_k, max_new_tokens],
        outputs=[output_3b, output_7b],
    )
    instruction.submit(
        generate,
        inputs=[instruction, temperature, top_p, top_k, max_new_tokens],
        outputs=[output_3b, output_7b],
    )

demo.launch()
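
# Note: the description above promises a side-by-side Dolly comparison, while
# the code actually loads a single MOSS checkpoint and mirrors its answer into
# both panels. A minimal sketch of a true two-model setup is given below, kept
# commented out. It assumes the Dolly checkpoints' instruction-following
# pipeline (as shown on their model cards) and that the pipeline forwards
# generation kwargs; this is an illustration, not part of the original app.
#
# pipe_3b = pipeline(model="databricks/dolly-v2-3b", torch_dtype=torch.bfloat16,
#                    trust_remote_code=True, device_map="auto")
# pipe_7b = pipeline(model="databricks/dolly-v2-7b", torch_dtype=torch.bfloat16,
#                    trust_remote_code=True, device_map="auto")
#
# def generate(query, temperature, top_p, top_k, max_new_tokens):
#     kwargs = dict(do_sample=True, temperature=temperature, top_p=top_p,
#                   top_k=int(top_k), max_new_tokens=int(max_new_tokens))
#     # One response per model, matching outputs=[output_3b, output_7b].
#     return (pipe_3b(query, **kwargs)[0]["generated_text"],
#             pipe_7b(query, **kwargs)[0]["generated_text"])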