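# app.py: Gradio chat demo for speakleash/Bielik-7B-Instruct-v0.1 on a ZeroGPU Space.
# The model is loaded in 4-bit (bitsandbytes) with flash-attention, responses are streamed
# through gr.ChatInterface, and every exchange is logged to a Discord webhook.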
import os
import json
import subprocess
from threading import Thread
import requests
import torch
import spaces
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer
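# Install flash-attn at runtime; FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE skips compiling the CUDA
# extension during the pip build, which keeps Space startup fast.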
subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
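# Runtime configuration is read from environment variables (typically set as Space variables/secrets).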
MODEL_ID = os.environ.get("MODEL_ID")
CHAT_TEMPLATE = os.environ.get("CHAT_TEMPLATE")
MODEL_NAME = MODEL_ID.split("/")[-1]
CONTEXT_LENGTH = int(os.environ.get("CONTEXT_LENGTH"))
COLOR = os.environ.get("COLOR")
EMOJI = os.environ.get("EMOJI")
DESCRIPTION = os.environ.get("DESCRIPTION")
DISCORD_WEBHOOK = os.environ.get("DISCORD_WEBHOOK")
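# Log each prompt/completion pair to a Discord channel via an incoming webhook.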
def send_discord(i, o):
    url = DISCORD_WEBHOOK
    embed1 = {
        "description": i,
        "title": "Input"
    }
    embed2 = {
        "description": o,
        "title": "Output"
    }
    data = {
        "content": "https://huggingface.co/spaces/speakleash/Bielik-7B-Instruct-v0.1",
        "username": "Bielik Logger",
        "embeds": [
            embed1, embed2
        ],
    }
    headers = {
        "Content-Type": "application/json"
    }
    result = requests.post(url, json=data, headers=headers)
    if 200 <= result.status_code < 300:
        print(f"Webhook sent {result.status_code}")
    else:
        print(f"Not sent with {result.status_code}, response:\n{result.json()}")
@spaces.GPU()
def predict(message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
    print('LLL', message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p)
    # Format history with a given chat template
    if CHAT_TEMPLATE == "ChatML":
        stop_tokens = ["<|endoftext|>", "<|im_end|>"]
        instruction = '<|im_start|>system\n' + system_prompt + '\n<|im_end|>\n'
        for human, assistant in history:
            instruction += '<|im_start|>user\n' + human + '\n<|im_end|>\n<|im_start|>assistant\n' + assistant
        instruction += '\n<|im_start|>user\n' + message + '\n<|im_end|>\n<|im_start|>assistant\n'
    elif CHAT_TEMPLATE == "Mistral Instruct":
        stop_tokens = ["</s>", "[INST]", "[INST] ", "<s>", "[/INST]", "[/INST] "]
        instruction = '<s>[INST] ' + system_prompt
        for human, assistant in history:
            instruction += human + ' [/INST] ' + assistant + '</s>[INST]'
        instruction += ' ' + message + ' [/INST]'
    elif CHAT_TEMPLATE == "Bielik":
        stop_tokens = ["</s>"]
        prompt_builder = ["<s>[INST] "]
        if system_prompt:
            prompt_builder.append(f"<<SYS>>\n{system_prompt}\n<</SYS>>\n\n")
        for human, assistant in history:
            prompt_builder.append(f"{human} [/INST] {assistant}</s>[INST] ")
        prompt_builder.append(f"{message} [/INST]")
        instruction = ''.join(prompt_builder)
    else:
        raise Exception("Incorrect chat template, select 'ChatML', 'Mistral Instruct' or 'Bielik'")
    print(instruction)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    enc = tokenizer([instruction], return_tensors="pt", padding=True, truncation=True)
    input_ids, attention_mask = enc.input_ids, enc.attention_mask
    # Keep only the last CONTEXT_LENGTH tokens; truncate the attention mask together with the ids
    if input_ids.shape[1] > CONTEXT_LENGTH:
        input_ids = input_ids[:, -CONTEXT_LENGTH:]
        attention_mask = attention_mask[:, -CONTEXT_LENGTH:]
    generate_kwargs = dict(
        {"input_ids": input_ids.to(device), "attention_mask": attention_mask.to(device)},
        streamer=streamer,
        do_sample=True,
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
        top_p=top_p
    )
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()
    outputs = []
    for new_token in streamer:
        outputs.append(new_token)
        if new_token in stop_tokens:
            break
        yield "".join(outputs)
    send_discord(instruction, "".join(outputs))
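# For reference, with CHAT_TEMPLATE == "Bielik" and no history, the prompt built above is:
#   <s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n{message} [/INST]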
# Load model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    quantization_config=quantization_config,
    attn_implementation="flash_attention_2",
)
# Create Gradio interface
gr.ChatInterface(
    predict,
    title=EMOJI + " " + MODEL_NAME,
    description=DESCRIPTION,
    examples=[
        ["Kim jesteś?"],
        ["Ile to jest 9+2-1?"],
        ["Napisz mi coś miłego."]
    ],
    additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False),
    additional_inputs=[
        gr.Textbox("Jesteś pomocnym asystentem o imieniu Bielik.", label="System prompt"),
        gr.Slider(0, 1, 0.6, label="Temperature"),
        gr.Slider(128, 4096, 1024, label="Max new tokens"),
        gr.Slider(1, 80, 40, label="Top K sampling"),
        gr.Slider(0, 2, 1.1, label="Repetition penalty"),
        gr.Slider(0, 1, 0.95, label="Top P sampling"),
    ],
    theme=gr.themes.Soft(primary_hue=COLOR),
).queue().launch()
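# Minimal local-run sketch (the values below are illustrative assumptions, not the Space's actual settings):
#   MODEL_ID="speakleash/Bielik-7B-Instruct-v0.1" CHAT_TEMPLATE="Bielik" CONTEXT_LENGTH="4096" \
#   COLOR="blue" EMOJI="🦅" DESCRIPTION="Bielik chat demo" DISCORD_WEBHOOK="<your webhook url>" \
#   python app.py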