from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import torch
import gradio as gr
# The Falcon checkpoints ship in bfloat16; defaulting to it halves memory use.
desired_dtype = torch.bfloat16
torch.set_default_dtype(desired_dtype)
# checkpoint = "vsrinivas/falconlite2"  # alternative checkpoint
checkpoint = "tiiuae/falcon-7b-instruct"
# device_map="auto" spreads layers across the available devices and
# offload_folder spills any layers that do not fit to disk.
model = AutoModelForCausalLM.from_pretrained(
    checkpoint, device_map="auto", offload_folder="offload", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
# The model and tokenizer are already loaded and placed above, so the pipeline
# only needs to wrap them; repeating device_map/dtype here would be redundant.
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
def format_chat_prompt(message, chat_history, instruction):
prompt = f"System:{instruction}"
for turn in chat_history:
user_message, bot_message = turn
prompt = f"{prompt}\nUser: {user_message}\nAssistant: {bot_message}"
prompt = f"{prompt}\nUser: {message}\nAssistant:"
return prompt
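# For example, format_chat_prompt("Hi", [["Hello", "Hi there!"]], "Be helpful.")
# returns:
#   System:Be helpful.
#   User: Hello
#   Assistant: Hi there!
#   User: Hi
#   Assistant: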
def generate_seqs(prompt, max_new_tokens=None, stop_sequence=None, temperature=None):
    # return_full_text=False returns only the completion, not the echoed prompt;
    # max_new_tokens alone bounds the output (max_length would conflict with it).
    output = pipeline(prompt,
                      max_new_tokens=max_new_tokens,
                      temperature=temperature,
                      do_sample=True,
                      top_k=10,
                      num_return_sequences=1,
                      return_full_text=False,
                      eos_token_id=tokenizer.eos_token_id)
    text = output[0]['generated_text']
    # The pipeline's own stop_sequence argument takes a single string only,
    # so trim the completion at the first of our stop sequences by hand.
    if stop_sequence:
        for stop in stop_sequence:
            cut = text.find(stop)
            if cut != -1:
                text = text[:cut]
    return text
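# A minimal single-turn usage sketch (argument values are illustrative):
# generate_seqs("User: Hi\nAssistant:", max_new_tokens=64,
#               stop_sequence=["\nUser:"], temperature=0.7)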
def respond(message, chat_history, instruction, temperature=0.7):
    prompt = format_chat_prompt(message, chat_history, instruction)
    chat_history = chat_history + [[message, ""]]
    # The stop sequences keep the model from also writing the user's next turn.
    # Falcon-7B's context window is 2048 tokens, so max_new_tokens stays well below it.
    response = generate_seqs(prompt=prompt,
                             max_new_tokens=1024,
                             stop_sequence=["\nUser:", "<|endoftext|>"],
                             temperature=temperature)
    # The pipeline returns the whole completion at once, so streaming is
    # simulated by yielding the reply back one character at a time.
    for idx, text_token in enumerate(response):
        # drop a single leading space so the reply does not start with whitespace
        if idx == 0 and text_token == " ":
            continue
        chat_history[-1][1] += text_token
        yield "", chat_history
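# Note: Gradio runs generator handlers as streaming callbacks, so each
# `yield "", chat_history` clears the prompt box and redraws the chatbot
# with the partial reply accumulated so far.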
with gr.Blocks() as demo:
gr.Markdown(
"""
    # General purpose chatbot - test & demo app by Srinivas.V.
    ## This Space runs on free hardware with limited compute and memory, so responses are slow and answer quality may degrade after a few dialogue turns. Type your prompt, click Submit, and wait for the response before entering the next one.
""")
    chatbot = gr.Chatbot(height=1000)  # tall window so the conversation fits
    msg = gr.Textbox(label="Prompt")
    with gr.Accordion(label="Advanced options", open=False):
        system = gr.Textbox(label="System message", lines=2,
                            value="A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.")
        temperature = gr.Slider(label="temperature", minimum=0.1, maximum=1, value=0.7, step=0.1)
btn = gr.Button("Submit")
clear = gr.ClearButton(components=[msg, chatbot, system, temperature], value="Clear console")
btn.click(respond, inputs=[msg, chatbot, system, temperature], outputs=[msg, chatbot])
msg.submit(respond, inputs=[msg, chatbot, system, temperature], outputs=[msg, chatbot])
gr.close_all()
demo.queue().launch()