import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer, pipeline
from threading import Thread
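# Load the tokenizer and the 180M-parameter instruct model from the Hugging Face Hub.
# (Note: with no torch_dtype/device_map arguments, from_pretrained loads
# full-precision float32 weights on CPU.)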
model_id = "rasyosef/Llama-3.2-180M-Amharic-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
llama_am = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id
)
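# Minimal sanity-check sketch (not part of the app): the pipeline accepts a list of
# chat messages and applies the tokenizer's chat template internally. The exact
# output shape depends on the installed transformers version.
# out = llama_am([{"role": "user", "content": "แฐแแ"}], max_new_tokens=32)
# print(out[0]["generated_text"])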
# Accepts a user message and the chat history, and streams generated text from the Llama pipeline
def generate(message, chat_history, max_new_tokens=64):
    # Rebuild the conversation as a list of role/content messages
    history = []
    for sent, received in chat_history:
        history.append({"role": "user", "content": sent})
        history.append({"role": "assistant", "content": received})
    history.append({"role": "user", "content": message})

    # Refuse conversations whose templated token count exceeds the 512-token budget
    if len(tokenizer.apply_chat_template(history)) > 512:
        yield "chat history is too long"
    else:
        # Streamer yields decoded tokens as the model produces them
        streamer = TextIteratorStreamer(tokenizer=tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=300.0)

        # Run generation in a background thread so this one can stream results
        thread = Thread(
            target=llama_am,
            kwargs={
                "text_inputs": history,
                "max_new_tokens": max_new_tokens,
                "repetition_penalty": 1.15,
                "streamer": streamer
            }
        )
        thread.start()

        # Accumulate tokens and yield the growing response for live UI updates
        generated_text = ""
        for word in streamer:
            generated_text += word
            yield generated_text.strip()
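# Sketch of how the generator streams (hypothetical direct call; in the app,
# gr.ChatInterface supplies chat_history as [[user_msg, bot_msg], ...] pairs):
# for partial in generate("แฐแแ", [], max_new_tokens=32):
#     print(partial)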
# Chat interface with Gradio
with gr.Blocks() as demo:
    gr.Markdown("""
    # Llama 3.2 180M Amharic Chatbot Demo
    This chatbot was created using [Llama-3.2-180M-Amharic-Instruct](https://huggingface.co/rasyosef/Llama-3.2-180M-Amharic-Instruct), a finetuned version of my 180 million parameter [Llama 3.2 180M Amharic](https://huggingface.co/rasyosef/Llama-3.2-180M-Amharic) transformer model.
    """)

    tokens_slider = gr.Slider(8, 256, value=64, label="Maximum new tokens", info="A larger `max_new_tokens` parameter value gives you longer text responses but at the cost of a slower response time.")
    chatbot = gr.ChatInterface(
        chatbot=gr.Chatbot(height=400),
        fn=generate,
        additional_inputs=[tokens_slider],
        stop_btn=None,
        cache_examples=False,
        examples=[
["แฐแแแฃ แฅแแดแต แแ
?"],
["แจแขแตแฎแตแซ แแ แจแฐแ แตแ แแแตแ แแ?"],
["แจแขแตแฎแตแซ แจแแจแจแปแ แแแต แแ แแ แฉ?"],
["แจแ แแญแ แแฅแ แแแแ"],
["แฐแจแต แแแจแ\n\nแ
แฅแ แ แแ แณ"],
["แ แแต แ แตแแ แแแต แแแจแ"],
["แจแฐแฐแ แ แฝแแ แ แตแฐแซแจแต แแ แ แญแแต แแ? 'แ แแแณแ'แฃ 'แ แแณแ' แแญแ 'แแแแฐแ' แจแแ แแแฝ แตแฅแข 'แ แชแ แแแ แแ แญ'"],
["แจแแจแแณแญ แแ แจแฐแ แตแ แแแตแ แแ?"],
["แ แแ แจแ แแชแซ แแฌแแณแแต แแ แแ?"],
["แถแตแต แจแ แแชแซ แแแซแต แฅแแตแแ"],
["3 แจแ แแชแซ แแชแแฝแ แตแ แฅแแต"],
["5 แจแ แแชแซ แจแฐแแแฝแ แฅแแต"],
["แ แแตแต แจแ แแฎแ แแแฎแฝแ แฅแแตแแ"],
["แ แแแ แแญ แซแแตแ 7 แ แ
แแซแต แแแจแ"]
        ]
    )
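    # Note: Gradio passes the values of additional_inputs to `fn` after the message
    # and history, so the slider above feeds generate()'s max_new_tokens parameter.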
# Queue requests and launch the app; share=True creates a temporary public link
demo.queue().launch(debug=True, share=True)