import os
from threading import Thread
from typing import Iterator
import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
MAX_MAX_NEW_TOKENS = 8192
DEFAULT_MAX_NEW_TOKENS = 2048
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
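# The input cap can be tuned per deployment via the environment, without
# touching the code (hypothetical shell invocation):
#     MAX_INPUT_TOKEN_LENGTH=8192 python app.py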
DESCRIPTION = """\
# Turkish LLaMA 8B Chat

This Space demonstrates [Turkish-Llama-8b-DPO-v0.1](https://huggingface.co/ytu-ce-cosmos/Turkish-Llama-8b-DPO-v0.1) by the YTU COSMOS Research Group, an 8B-parameter model fine-tuned for Turkish language understanding and generation. Feel free to play with it, or duplicate it to run generations without a queue!

🔎 This model is the newest and most advanced iteration of CosmosLLaMa, developed by merging two distinctly trained CosmosLLaMa-Instruct DPO models.

🤖 The model is optimized for Turkish-language tasks and can handle various text-generation scenarios, including conversations, instructions, and general text completion.

💡 You can also try the model on the official demo page: [cosmos.yildiz.edu.tr/cosmosllama](https://cosmos.yildiz.edu.tr/cosmosllama)
"""
LICENSE = """
<p/>
---
This demo uses [Turkish-Llama-8b-DPO-v0.1](https://huggingface.co/ytu-ce-cosmos/Turkish-Llama-8b-DPO-v0.1) by the YTU COSMOS Research Group,
and is governed by the original Llama 3 license.
"""
if not torch.cuda.is_available():
    DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
if torch.cuda.is_available():
    model_id = "ytu-ce-cosmos/Turkish-Llama-8b-DPO-v0.1"
    # Load the model once at startup, in bfloat16, letting accelerate place it
    # on the available GPU(s).
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",
        torch_dtype=torch.bfloat16,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.use_default_system_prompt = False
    # Stop tokens for generation (see the note below).
    TERMINATORS = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]
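# Llama-3-style chat templates end each assistant turn with <|eot_id|> rather
# than the plain EOS token, so TERMINATORS carries both; it is passed to
# model.generate() as eos_token_id inside generate() below.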
@spaces.GPU
def generate(
    message: str,
    chat_history: list[dict],
    system_prompt: str = "",
    max_new_tokens: int = 2048,
    temperature: float = 0.6,
    top_p: float = 0.9,
    top_k: int = 50,
    repetition_penalty: float = 1.0,
) -> Iterator[str]:
    # Rebuild the full conversation in the chat-template message format.
    conversation = []
    if system_prompt:
        conversation.append({"role": "system", "content": system_prompt})
    conversation += chat_history
    conversation.append({"role": "user", "content": message})
    input_ids = tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        return_tensors="pt",
    )
    # If the conversation exceeds the cap, keep only the most recent tokens.
    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
        gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
    input_ids = input_ids.to(model.device)

    # Run generation in a background thread; the streamer yields decoded text
    # incrementally so the UI can display tokens as they are produced.
    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=top_p,
        top_k=top_k,
        temperature=temperature,
        num_beams=1,
        repetition_penalty=repetition_penalty,
        eos_token_id=TERMINATORS,
    )
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    outputs = []
    for text in streamer:
        outputs.append(text)
        yield "".join(outputs)
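# A minimal sketch of driving the streaming generator directly, outside the
# Gradio UI (hypothetical usage; assumes a GPU so model/tokenizer exist):
#
#     for partial in generate("Merhaba! Nasılsın?", chat_history=[]):
#         print(partial, end="\r")
#     print()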
chat_interface = gr.ChatInterface(
    fn=generate,
    additional_inputs=[
        gr.Textbox(
            label="System prompt",
            lines=6,
            value="Sen bir yapay zeka asistanısın. Kullanıcı sana bir görev verecek. Amacın görevi olabildiğince sadık bir şekilde tamamlamak. Görevi yerine getirirken adım adım düşün ve adımlarını gerekçelendir.",
        ),
        gr.Slider(
            label="Max new tokens",
            minimum=1,
            maximum=MAX_MAX_NEW_TOKENS,
            step=1,
            value=DEFAULT_MAX_NEW_TOKENS,
        ),
        gr.Slider(
            label="Temperature",
            minimum=0.1,
            maximum=4.0,
            step=0.1,
            value=0.6,
        ),
        gr.Slider(
            label="Top-p (nucleus sampling)",
            minimum=0.05,
            maximum=1.0,
            step=0.05,
            value=0.9,
        ),
        gr.Slider(
            label="Top-k",
            minimum=1,
            maximum=1000,
            step=1,
            value=50,
        ),
        gr.Slider(
            label="Repetition penalty",
            minimum=1.0,
            maximum=2.0,
            step=0.05,
            value=1.0,
        ),
    ],
    stop_btn=None,
    examples=[
        ["Merhaba! Nasılsın?"],
        ["Yapay zeka alanında açık kaynak kodun faydaları nelerdir?"],
    ],
    cache_examples=False,
    type="messages",
)
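# With type="messages", Gradio passes chat_history as OpenAI-style dicts, e.g.
#     [{"role": "user", "content": "Merhaba"},
#      {"role": "assistant", "content": "Merhaba! Nasıl yardımcı olabilirim?"}]
# which is why generate() can append it to `conversation` unchanged.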
with gr.Blocks(css_paths="style.css", fill_height=True) as demo:
    gr.Markdown(DESCRIPTION)
    gr.DuplicateButton(value="Duplicate Space for private use", elem_id="duplicate-button")
    chat_interface.render()
    gr.Markdown(LICENSE)

if __name__ == "__main__":
    demo.queue(max_size=20).launch()