import json
import os
import shutil
import requests
import gradio as gr
from huggingface_hub import Repository, InferenceClient
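# Hugging Face access token, read from the environment (set it as a Space secret);
# it authenticates the Inference API calls made by the client below.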
HF_TOKEN = os.environ.get("HF_TOKEN", None)
API_URL = "https://api-inference.huggingface.co/models/tiiuae/falcon-180B-chat"
BOT_NAME = "Falcon"
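# Sequences that mark the end of an assistant turn; generation stops on them and
# any trailing occurrence is trimmed from the returned text.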
STOP_SEQUENCES = ["\nUser:", "<|endoftext|>", " User:", "###"]
EXAMPLES = [
["Hey Falcon! Any recommendations for my holidays in Abu Dhabi?"],
["What's the Everett interpretation of quantum mechanics?"],
["Give me a list of the top 10 dive sites you would recommend around the world."],
["Can you tell me more about deep-water soloing?"],
["Can you write a short tweet about the release of our latest AI model, Falcon LLM?"]
]
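# Streaming client for the hosted Falcon-180B-chat text-generation endpoint.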
client = InferenceClient(
    API_URL,
    headers={"Authorization": f"Bearer {HF_TOKEN}"},
)
# def format_prompt(message, history, system_prompt):
#     prompt = ""
#     if system_prompt:
#         prompt += f"System: {system_prompt}\n"
#     for user_prompt, bot_response in history:
#         prompt += f"User: {user_prompt}\n"
#         prompt += f"Falcon: {bot_response}\n" # Response already contains "Falcon: "
#     prompt += f"""User: {message}
# Falcon:"""
#     return prompt
# seed = 42
# def generate(
#     prompt, history, system_prompt="", temperature=0.9, max_new_tokens=256, top_p=0.95, repetition_penalty=1.0,
# ):
#     temperature = float(temperature)
#     if temperature < 1e-2:
#         temperature = 1e-2
#     top_p = float(top_p)
#     global seed
#     generate_kwargs = dict(
#         temperature=temperature,
#         max_new_tokens=max_new_tokens,
#         top_p=top_p,
#         repetition_penalty=repetition_penalty,
#         stop_sequences=STOP_SEQUENCES,
#         do_sample=True,
#         seed=seed,
#     )
#     seed = seed + 1
#     formatted_prompt = format_prompt(prompt, history, system_prompt)
#     stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
#     output = ""
#     for response in stream:
#         output += response.token.text
#         for stop_str in STOP_SEQUENCES:
#             if output.endswith(stop_str):
#                 output = output[:-len(stop_str)]
#                 output = output.rstrip()
#                 yield output
#         yield output
#     return output
# additional_inputs=[
#     gr.Textbox("", label="Optional system prompt"),
#     gr.Slider(
#         label="Temperature",
#         value=0.9,
#         minimum=0.0,
#         maximum=1.0,
#         step=0.05,
#         interactive=True,
#         info="Higher values produce more diverse outputs",
#     ),
#     gr.Slider(
#         label="Max new tokens",
#         value=256,
#         minimum=0,
#         maximum=8192,
#         step=64,
#         interactive=True,
#         info="The maximum number of new tokens",
#     ),
#     gr.Slider(
#         label="Top-p (nucleus sampling)",
#         value=0.90,
#         minimum=0.0,
#         maximum=1.0,
#         step=0.05,
#         interactive=True,
#         info="Higher values sample more low-probability tokens",
#     ),
#     gr.Slider(
#         label="Repetition penalty",
#         value=1.2,
#         minimum=1.0,
#         maximum=2.0,
#         step=0.05,
#         interactive=True,
#         info="Penalize repeated tokens",
#     ),
# ]
# with gr.Blocks() as demo:
#     with gr.Row():
#         with gr.Column(scale=0.4):
#             gr.Image("better_banner.jpeg", elem_id="banner-image", show_label=False)
#         with gr.Column():
#             gr.Markdown(
# """# Falcon-180B Demo
# **Chat with [Falcon-180B-Chat](https://huggingface.co/tiiuae/falcon-180b-chat), brainstorm ideas, discuss your holiday plans, and more!**
# ✨ This demo is powered by [Falcon-180B](https://huggingface.co/tiiuae/falcon-180B) and finetuned on a mixture of [Ultrachat](https://huggingface.co/datasets/stingning/ultrachat), [Platypus](https://huggingface.co/datasets/garage-bAInd/Open-Platypus) and [Airoboros](https://huggingface.co/datasets/jondurbin/airoboros-2.1). [Falcon-180B](https://huggingface.co/tiiuae/falcon-180b) is a state-of-the-art large language model built by the [Technology Innovation Institute](https://www.tii.ae) in Abu Dhabi. It is trained on 3.5 trillion tokens (including [RefinedWeb](https://huggingface.co/datasets/tiiuae/falcon-refinedweb)) and available under the [Falcon-180B TII License](https://huggingface.co/spaces/tiiuae/falcon-180b-license/blob/main/LICENSE.txt). It currently holds the 🥇 1st place on the [🤗 Open LLM leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) for a pretrained model.
# 🧪 This is only a **first experimental preview**: we intend to provide increasingly capable versions of Falcon in the future, based on improved datasets and RLHF/RLAIF.
# 👀 **Learn more about Falcon LLM:** [falconllm.tii.ae](https://falconllm.tii.ae/)
# ➡️️ **Intended Use**: this demo is intended to showcase an early finetuning of [Falcon-180B](https://huggingface.co/tiiuae/falcon-180b), to illustrate the impact (and limitations) of finetuning on a dataset of conversations and instructions. We encourage the community to further build upon the base model, and to create even better instruct/chat versions!
# ⚠️ **Limitations**: the model can and will produce factually incorrect information, hallucinating facts and actions. As it has not undergone any advanced tuning/alignment, it can produce problematic outputs, especially if prompted to do so. Finally, this demo is limited to a session length of about 1,000 words.
# """
#             )
#     gr.ChatInterface(
#         generate,
#         examples=EXAMPLES,
#         additional_inputs=additional_inputs,
#     )
#demo.launch(show_api=True, share=True)
#demo.queue(concurrency_count=100, api_open=False).launch(show_api=True)
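# Single-turn helper used by the plain gr.Interface below: it builds a "System:/User:"
# prompt, streams the completion from the Inference API, trims stop sequences, and
# returns the final text. No chat history is kept.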
def query(system_prompt, user_prompt, temperature=0.9, max_new_tokens=256, top_p=0.95, repetition_penalty=1.0):
    # The Interface below passes every field as text, so cast the sampling parameters explicitly.
    temperature = max(float(temperature), 1e-2)
    max_new_tokens = int(max_new_tokens)
    top_p = float(top_p)
    repetition_penalty = float(repetition_penalty)
    print(temperature, max_new_tokens, top_p, repetition_penalty)
    seed = 42
    generate_kwargs = dict(
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        stop_sequences=STOP_SEQUENCES,
        do_sample=True,
        seed=seed,
    )
    # Single-turn prompt: just the system and user messages, no history.
    prompt = f"System: {system_prompt}\nUser: {user_prompt}\n"
    print(prompt)
    print('-----')
    #output = client.text_generation(prompt, **generate_kwargs, details=True, return_full_text=False)
    #print(output)
    # Stream tokens from the Inference API, trimming any stop sequence from the end of the output.
    stream = client.text_generation(prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
    output = ""
    for response in stream:
        output += response.token.text
        for stop_str in STOP_SEQUENCES:
            if output.endswith(stop_str):
                output = output[:-len(stop_str)]
                output = output.rstrip()
                #yield output
        #yield output
    print(output)
    return output
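# Minimal API-style UI: six free-text fields in the same order as query()'s parameters
# (system prompt, user prompt, temperature, max new tokens, top-p, repetition penalty).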
iface = gr.Interface(
    query,
    inputs=["text", "text", "text", "text", "text", "text"],
    outputs="text",
)
iface.queue()
iface.launch()
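# A minimal sketch of calling this app programmatically once it is running. It assumes
# the gradio_client package is installed and that the Space id is "tiiuae/falcon-180b-demo";
# both are assumptions, adjust to wherever this file is actually deployed.
# from gradio_client import Client
#
# demo_client = Client("tiiuae/falcon-180b-demo")
# result = demo_client.predict(
#     "You are a helpful travel assistant.",  # system prompt
#     "Any recommendations for Abu Dhabi?",   # user prompt
#     "0.9", "256", "0.95", "1.0",            # temperature, max new tokens, top-p, repetition penalty
#     api_name="/predict",                    # default endpoint name for a gr.Interface
# )
# print(result)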