Spaces:
Configuration error
Configuration error
File size: 6,522 Bytes
4fe3463 8d503ef 4fe3463 96c114b 4fe3463 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 |
# Llama 3.2 3B Instruct (IT) chat demo
import os
from threading import Thread
from typing import Iterator
import gradio as gr
#import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
# Token limits for generation: the default and the ceiling used by the
# "Max new tokens" slider in the UI section of this file.
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_MAX_NEW_TOKENS = 2048

# Longest prompt (in tokens) fed to the model; can be overridden through the
# MAX_INPUT_TOKEN_LENGTH environment variable.
MAX_INPUT_TOKEN_LENGTH = int(os.environ.get("MAX_INPUT_TOKEN_LENGTH", "4096"))

# Use the first CUDA device when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Markdown blurb describing the demo.
DESCRIPTION = """\
# Llama 3.2 3B Instruct
Llama 3.2 3B is Meta's latest iteration of open LLMs.
This is a demo of [`meta-llama/Llama-3.2-3B-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct), fine-tuned for instruction following.
For more details, please check [our post](https://huggingface.co/blog/llama32).
"""
# Hugging Face Hub repository the tokenizer and weights are pulled from.
# Point this at a different org/repo name if the checkpoint lives elsewhere.
model_id = "meta-llama/Llama-3.2-3B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load the weights in bfloat16 with device_map="auto", then switch the module
# to evaluation mode (nn.Module.eval() returns the module itself).
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
).eval()
# Main Gradio inference function
def generate(
    message: str,
    chat_history: list[dict],
    max_new_tokens: int = 1024,
    temperature: float = 0.6,
    top_p: float = 0.9,
    top_k: int = 50,
    repetition_penalty: float = 1.2,
) -> Iterator[tuple[str, list[dict]]]:
    """Stream the model's reply to ``message`` given the prior chat history.

    Args:
        message: The new user message.
        chat_history: Prior messages as dicts (the Chatbot in this file uses
            ``type="messages"``); each dict is copied without its
            ``'metadata'`` key before being passed to the chat template.
        max_new_tokens: Forwarded to ``model.generate``.
        temperature: Forwarded to ``model.generate``.
        top_p: Forwarded to ``model.generate``.
        top_k: Forwarded to ``model.generate``.
        repetition_penalty: Forwarded to ``model.generate``.

    Yields:
        ``("", conversation)`` after every streamed chunk, where
        ``conversation`` is the history plus the user message and the
        assistant reply accumulated so far. The event wiring in this file
        maps the pair onto ``[msg, chatbot]``.
    """
    # Work on copies so the caller's history dicts are never mutated.
    conversation = [{k: v for k, v in d.items() if k != 'metadata'} for d in chat_history]
    conversation.append({"role": "user", "content": message})

    input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
        # Keep only the most recent tokens and tell the user about it.
        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
        gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
    input_ids = input_ids.to(model.device)

    streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = {
        "input_ids": input_ids,
        "streamer": streamer,
        "max_new_tokens": max_new_tokens,
        "do_sample": True,
        "top_p": top_p,
        "top_k": top_k,
        "temperature": temperature,
        "num_beams": 1,
        "repetition_penalty": repetition_penalty,
    }
    # model.generate runs in a background thread and feeds the streamer,
    # which this generator consumes below.
    worker = Thread(target=model.generate, kwargs=generate_kwargs)
    worker.start()

    conversation.append({"role": "assistant", "content": ""})
    bot_response = ""
    for text in streamer:
        # Extend the running reply instead of re-joining every chunk each time.
        bot_response += text
        conversation[-1]['content'] = bot_response
        yield "", conversation

    # The streamer is exhausted, so generation is done; wait for the worker
    # thread to exit before this generator returns.
    worker.join()
# Implementing Gradio 5 features and building a ChatInterface UI yourself
# PLACEHOLDER is the HTML snippet passed as `placeholder` to the gr.Chatbot
# defined further down in this file.
PLACEHOLDER = """<div style="padding: 20px; text-align: center; display: flex; flex-direction: column; align-items: center;">
<img src="https://ysharma-dummy-chat-app.hf.space/file=/tmp/gradio/c21ff9c8e7ecb2f7d957a72f2ef03c610ac7bbc4/Meta_lockup_positive%20primary_RGB_small.jpg" style="width: 80%; max-width: 550px; height: auto; opacity: 0.55; margin-bottom: 10px;">
<h1 style="font-size: 28px; margin: 0;">Meta llama3.2</h1>
<p style="font-size: 18px; margin: 5px 0 0; opacity: 0.65;">
<a href="https://huggingface.co/blog/llama32" target="_blank" style="color: inherit; text-decoration: none;">Learn more about Llama 3.2</a>
</p>
</div>"""
def handle_retry(history, retry_data: gr.RetryData):
    """Re-run generation for the message at ``retry_data.index``.

    Everything before that index is kept as chat history, and the content of
    the message at the index is sent through ``generate`` again with fixed
    sampling settings. Yields whatever ``generate`` yields.
    """
    retry_at = retry_data.index
    earlier_turns = history[:retry_at]
    retried_turn = history[retry_at]
    yield from generate(
        retried_turn['content'],
        chat_history=earlier_turns,
        max_new_tokens=1024,
        temperature=0.6,
        top_p=0.9,
        top_k=50,
        repetition_penalty=1.2,
    )
def handle_like(data: gr.LikeData):
    """Print whether the response in ``data.value`` was upvoted or downvoted."""
    prefix = (
        "You upvoted this response: "
        if data.liked
        else "You downvoted this response: "
    )
    print(prefix, data.value)
def handle_undo(history, undo_data: gr.UndoData):
    """Return the history before ``undo_data.index`` and that message's content.

    The pair is returned in the order ``(truncated history, content)``, which
    is the order the undo event in this file wires to ``[chatbot, msg]``.
    """
    cut = undo_data.index
    return history[:cut], history[cut]['content']
def chat_examples_fill(data: gr.SelectData):
    """Start a fresh conversation from the selected suggestion's ``'text'``.

    Calls ``generate`` with an empty history and fixed sampling settings, and
    yields whatever ``generate`` yields.
    """
    picked_prompt = data.value['text']
    yield from generate(
        picked_prompt,
        chat_history=[],
        max_new_tokens=1024,
        temperature=0.6,
        top_p=0.9,
        top_k=50,
        repetition_penalty=1.2,
    )
# UI definition: a Chatbot, a message Textbox, an accordion of sampling
# sliders, the event wiring, and finally the app launch.
# NOTE(review): indentation is missing in this copy of the file, so the nesting
# of the statements below under the Blocks / Column / Accordion contexts cannot
# be read off here and must be restored before this runs.
with gr.Blocks(theme=gr.themes.Soft(), fill_height=True) as demo:
with gr.Column(elem_id="container", scale=1):
chatbot = gr.Chatbot(
label="Llama3.2 3B Instruct Chatbotw using Gradio 5",
show_label=False,
type="messages",
scale=1,
# NOTE(review): `suggestions` / `suggestion_select` look like Gradio 5 beta
# names; released 5.x appears to call these `examples` / `example_select` --
# confirm against the pinned Gradio version.
suggestions = [
{"text": "How many R are there in a Strawberry?"},
{"text": "What is the meaning of life for an AI?"},
{"text": "Are tomatoes vegetables?"},
{"text": "There's a llama in my garden 😱 What should I do?"},
{"text": "What is the best way to open a can of worms?"},
{"text": "The odd numbers in this group add up to an even number: 15, 32, 5, 13, 82, 7, 1. "},
{"text": 'How to setup a human base on Mars? Give short answer.'},
{"text": 'Explain theory of relativity to me like I’m 8 years old.'},
{"text": 'What is 9,000 * 9,000?'},
{"text": 'Write a pun-filled happy birthday message to my friend Alex.'},
{"text": 'Justify why a penguin might make a good king of the jungle.'}
],
placeholder = PLACEHOLDER,
)
# Message box; its submit event drives `generate` below.
msg = gr.Textbox(submit_btn=True, show_label=False)
# Sampling controls, passed to `generate` by the submit event only.
with gr.Accordion('Additional inputs', open=False):
max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS, )
temperature = gr.Slider(label="Temperature",minimum=0.1, maximum=4.0, step=0.1, value=0.6,)
top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9, )
top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50, )
repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2, )
# Event wiring. `generate`, `handle_retry` and `chat_examples_fill` output to
# [msg, chatbot]; `handle_undo` outputs to [chatbot, msg].
msg.submit(generate, [msg, chatbot, max_new_tokens, temperature, top_p, top_k, repetition_penalty], [msg, chatbot])
# Retry and suggestion handlers take no slider inputs; they call `generate`
# with hard-coded sampling values.
chatbot.retry(handle_retry, chatbot, [msg, chatbot])
chatbot.like(handle_like, None, None)
chatbot.undo(handle_undo, chatbot, [chatbot, msg])
chatbot.suggestion_select(chat_examples_fill, None, [msg, chatbot] )
demo.launch()
|