falcon-180b-demo

Running

File size: 3,980 Bytes

0b15f14
140793a
1997cd5
0b15f14
140793a
aac3374
 
73660ac
 
aac3374
 
 
 
 
7d6878a
140793a
 
e1eb2b8
140793a
 
 
e1eb2b8
140793a
 
61d12d7
140793a
 
 
2147ae4
 
61d12d7
210da3f
140793a
 
2147ae4
83a6345
140793a
 
 
 
 
 
 
c3acc2f
140793a
 
 
 
 
 
 
83a6345
140793a
c3acc2f
a68ea86
aac3374
 
b2cfaba
aac3374
 
 
dddc929
aac3374
61d12d7
 
 
 
 
 
210da3f
61d12d7
07466ed
0caf6b4
210da3f
b525961
 
 
 
 
6ae1c70
b525961
 
210da3f
07466ed
210da3f
 
140793a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f441bd4
140793a
 
 
 
 
 
 
f441bd4
 
140793a
 
 
 
 
 
 
 
 
 
 
 
0b15f14
140793a
0b15f14
140793a
53aa7e6
140793a
 
 
 
 
 
63b183c
210da3f

import os
import gradio as gr
from huggingface_hub import InferenceClient

# Access token for the inference API; None when the variable is not set.
HF_TOKEN = os.getenv("HF_TOKEN")

# Candidate models in order of preference: generate() walks this list and
# uses the first entry that is currently deployed.
model2api = [
    "tiiuae/falcon-180B-chat",
    "meta-llama/Llama-2-70b-chat-hf",
    "codellama/CodeLlama-34b-Instruct-hf",
    "victor/CodeLlama-34b-Instruct-hf",
    "timdettmers/guanaco-33b-merged",
]

# Sent to the backend as stop sequences and also trimmed from the end of the
# streamed text on the client side.
STOP_SEQUENCES = [
    "User:",
    "###",
    "<|endoftext|>",
    "</s>",
]

# Example prompts for the chat UI; each row holds a single message.
EXAMPLES = [
    ["Hey LLAMA! Any recommendations for my holidays in Abu Dhabi?"],
    ["What's the Everett interpretation of quantum mechanics?"],
    ["Give me a list of the top 10 dive sites you would recommend around the world."],
    ["Can you tell me more about deep-water soloing?"],
    ["Can you write a short tweet about the release of our latest AI model, LLAMA LLM?"],
]

def format_prompt(message, history, system_prompt, bot_name):
    """Render a conversation as a plain-text transcript ending on an open bot turn.

    Args:
        message: The new user message to append as the final ``User:`` line.
        history: Iterable of ``(user_prompt, bot_response)`` pairs from
            earlier turns, oldest first.
        system_prompt: Optional instruction; emitted as a leading
            ``System:`` line only when truthy.
        bot_name: Speaker label used for every bot line.

    Returns:
        The transcript string, terminated by ``"{bot_name}: "`` so the model
        continues with the bot's reply.
    """
    pieces = []
    if system_prompt:
        pieces.append(f"System: {system_prompt}\n")
    for user_prompt, bot_response in history:
        pieces.append(f"User: {user_prompt}\n")
        pieces.append(f"{bot_name}: {bot_response}\n")
    pieces.append(f"User: {message}\n{bot_name}: ")
    return "".join(pieces)

# Sampling seed shared across requests: each call to generate() uses the
# current value and then increments it.
seed = 42


def generate(
    prompt, history, system_prompt="", temperature=0.9, max_new_tokens=256, top_p=0.95, repetition_penalty=1.0,
):
    """Stream a chat reply from the first deployed model listed in ``model2api``.

    Args:
        prompt: The latest user message.
        history: Earlier turns as ``(user_prompt, bot_response)`` pairs, in
            the form consumed by ``format_prompt``.
        system_prompt: Optional system instruction placed before the turns.
        temperature: Sampling temperature; values below 0.01 are raised to 0.01.
        max_new_tokens: Upper bound on generated tokens; values below 1 are
            raised to 1.
        top_p: Nucleus-sampling threshold.
        repetition_penalty: Forwarded to the backend unchanged.

    Yields:
        The reply accumulated so far, with any trailing stop sequence removed,
        once per streamed token.

    Raises:
        gr.Error: If the streaming request fails; the original exception is
            chained as the cause.
    """
    global seed

    generate_kwargs = dict(
        temperature=max(float(temperature), 1e-2),
        # The UI slider allows 0; always request at least one token.
        max_new_tokens=max(int(max_new_tokens), 1),
        top_p=float(top_p),
        repetition_penalty=repetition_penalty,
        stop_sequences=STOP_SEQUENCES,
        do_sample=True,
        seed=seed,
    )
    seed += 1

    # Start with a client that is not bound to a model; it is used to query
    # the deployed models and stays in use if no candidate is available.
    client = InferenceClient()
    deployed = client.list_deployed_models('text-generation-inference')['text-generation']

    # Track the selection explicitly instead of relying on a leaked loop
    # variable, so "nothing deployed" is distinguishable from "last candidate".
    chosen_model = next((name for name in model2api if name in deployed), None)
    if chosen_model is not None:
        client = InferenceClient(chosen_model, token=HF_TOKEN)
        print(f"Choosen model: {chosen_model}")

    # Only the first (preferred) candidate is addressed as "Falcon".
    if model2api and chosen_model == model2api[0]:
        bot_name = "Falcon"
    else:
        bot_name = "Assistant"

    formatted_prompt = format_prompt(prompt, history, system_prompt, bot_name)
    output = ""

    try:
        stream = client.text_generation(
            formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False
        )

        for response in stream:
            output += response.token.text
            # Hide a stop marker that made it into the streamed text.
            for stop_str in STOP_SEQUENCES:
                if output.endswith(stop_str):
                    output = output[:-len(stop_str)]
            yield output
        return output
    except Exception as e:
        raise gr.Error(f"Error: {e}. Please retry!") from e
        
# Settings for the sampling sliders, listed in the order generate() receives
# them after the system prompt: temperature, max_new_tokens, top_p,
# repetition_penalty.
_SLIDER_SETTINGS = [
    dict(
        label="Temperature",
        info="Higher values produce more diverse outputs",
        minimum=0.0,
        maximum=1.0,
        step=0.05,
        value=0.9,
    ),
    dict(
        label="Max new tokens",
        info="The maximum numbers of new tokens",
        minimum=0,
        maximum=3000,
        step=64,
        value=256,
    ),
    dict(
        label="Top-p (nucleus sampling)",
        info="Higher values sample more low-probability tokens",
        minimum=0.01,
        maximum=0.99,
        step=0.05,
        value=0.90,
    ),
    dict(
        label="Repetition penalty",
        info="Penalize repeated tokens",
        minimum=1.0,
        maximum=2.0,
        step=0.05,
        value=1.2,
    ),
]

# Extra chat controls: a system-prompt textbox followed by one interactive
# slider per entry above.
additional_inputs = [
    gr.Textbox("", label="Optional system prompt"),
    *(gr.Slider(interactive=True, **settings) for settings in _SLIDER_SETTINGS),
]

# Build the page: a single chat interface driven by generate(), with the
# example prompts and the extra sampling controls attached.
with gr.Blocks() as demo:
    gr.ChatInterface(
        fn=generate,
        additional_inputs=additional_inputs,
        examples=EXAMPLES,
    )

# Turn on the request queue (api_open=False), then start the app with
# show_api=False.
demo.queue(api_open=False)
demo.launch(show_api=False)