import os
from threading import Thread
from typing import Iterator

import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

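# Generation limits; MAX_INPUT_TOKEN_LENGTH can be overridden via the
# MAX_INPUT_TOKEN_LENGTH environment variable.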
MAX_MAX_NEW_TOKENS = 8192
DEFAULT_MAX_NEW_TOKENS = 2048
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))

DESCRIPTION = """\
# Turkish LLaMA 8B Chat

This Space demonstrates [Turkish-Llama-8b-DPO-v0.1](https://huggingface.co/ytu-ce-cosmos/Turkish-Llama-8b-DPO-v0.1) by the YTU COSMOS Research Group, an 8B-parameter model fine-tuned for Turkish language understanding and generation. Feel free to play with it, or duplicate the Space to run generations without a queue!

🔎 This model is the newest and most advanced iteration of CosmosLLaMa, created by merging two separately trained CosmosLLaMa-Instruct DPO models.

🤖 The model is optimized for Turkish language tasks and can handle various text generation scenarios including conversations, instructions, and general text completion.

💡 You can also try the model on the official demo page: [cosmos.yildiz.edu.tr/cosmosllama](https://cosmos.yildiz.edu.tr/cosmosllama)
"""

LICENSE = """
<p/>

---
This demo uses [Turkish-Llama-8b-DPO-v0.1](https://huggingface.co/ytu-ce-cosmos/Turkish-Llama-8b-DPO-v0.1) by the YTU COSMOS Research Group,
and is governed by the original Llama 3 license.
"""

if not torch.cuda.is_available():
    DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"


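# The model and tokenizer are loaded only when a GPU is available; generate()
# below assumes these globals exist, so the demo is effectively GPU-only.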
if torch.cuda.is_available():
    model_id = "ytu-ce-cosmos/Turkish-Llama-8b-DPO-v0.1"
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",
        torch_dtype=torch.bfloat16,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.use_default_system_prompt = False
    
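    # Llama 3 chat models mark the end of a turn with <|eot_id|>; include it
    # alongside eos_token_id so generation stops after the assistant's reply.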
    TERMINATORS = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]


@spaces.GPU
def generate(
    message: str,
    chat_history: list[dict],
    system_prompt: str = "",
    max_new_tokens: int = 2048,
    temperature: float = 0.6,
    top_p: float = 0.9,
    top_k: int = 50,
    repetition_penalty: float = 1.0,
) -> Iterator[str]:
    conversation = []
    if system_prompt:
        conversation.append({"role": "system", "content": system_prompt})
    conversation += chat_history
    conversation.append({"role": "user", "content": message})

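    # Render the conversation through the model's chat template and tokenize.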
    input_ids = tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        return_tensors="pt"
    )
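    # If the prompt exceeds the context budget, trim from the left so the most
    # recent turns are kept.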
    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
        gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
    input_ids = input_ids.to(model.device)

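    # Stream tokens as they are produced: model.generate runs on a background
    # thread and pushes decoded text into the streamer, which we iterate below.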
    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=top_p,
        top_k=top_k,
        temperature=temperature,
        num_beams=1,
        repetition_penalty=repetition_penalty,
        eos_token_id=TERMINATORS,
    )
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

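    # Accumulate streamed chunks and yield the running text so the chat window
    # updates incrementally.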
    outputs = []
    for text in streamer:
        outputs.append(text)
        yield "".join(outputs)
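
    # Once the streamer is exhausted, generation has finished; joining the
    # worker thread here should return immediately and keeps cleanup explicit.
    t.join()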


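# The additional_inputs below are passed positionally to generate() after
# (message, chat_history), so their order must match the function signature.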
chat_interface = gr.ChatInterface(
    fn=generate,
    additional_inputs=[
        gr.Textbox(
            label="System prompt",
            lines=6,
            value="Sen bir yapay zeka asistanısın. Kullanıcı sana bir görev verecek. Amacın görevi olabildiğince sadık bir şekilde tamamlamak. Görevi yerine getirirken adım adım düşün ve adımlarını gerekçelendir.",
        ),
        gr.Slider(
            label="Max new tokens",
            minimum=1,
            maximum=MAX_MAX_NEW_TOKENS,
            step=1,
            value=DEFAULT_MAX_NEW_TOKENS,
        ),
        gr.Slider(
            label="Temperature",
            minimum=0.1,
            maximum=4.0,
            step=0.1,
            value=0.6,
        ),
        gr.Slider(
            label="Top-p (nucleus sampling)",
            minimum=0.05,
            maximum=1.0,
            step=0.05,
            value=0.9,
        ),
        gr.Slider(
            label="Top-k",
            minimum=1,
            maximum=1000,
            step=1,
            value=50,
        ),
        gr.Slider(
            label="Repetition penalty",
            minimum=1.0,
            maximum=2.0,
            step=0.05,
            value=1.0,
        ),
    ],
    stop_btn=None,
    examples=[
        ["Merhaba! Nasılsın?"],
        ["Yapay zeka alanında açık kaynak kodun faydaları nelerdir?"],
    ],
    cache_examples=False,
    type="messages",
)

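# Assemble the page: description, duplicate button, chat widget, then license.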
with gr.Blocks(css_paths="style.css", fill_height=True) as demo:
    gr.Markdown(DESCRIPTION)
    gr.DuplicateButton(value="Duplicate Space for private use", elem_id="duplicate-button")
    chat_interface.render()
    gr.Markdown(LICENSE)

if __name__ == "__main__":
    demo.queue(max_size=20).launch()