import os
import math
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
import torch
import gradio as gr
import sentencepiece  # ensures the SentencePiece dependency for the slow tokenizer is installed
title = "# Welcome to 🙋🏻‍♂️Tonic's🧠🤌🏻Neural Chat (From Intel)!"
description = """Try out [Intel/neural-chat-7b-v3-1](https://huggingface.co/Intel/neural-chat-7b-v3-1), the instruct version of [Intel/neural-chat-7b-v3](https://huggingface.co/Intel/neural-chat-7b-v3), a fine-tune built on the [mistralai/Mistral-7B](https://huggingface.co/mistralai/Mistral-7B-v0.1) recipe. You can call [Intel/neural-chat-7b-v3-1](https://huggingface.co/Intel/neural-chat-7b-v3-1) here via the Gradio API by scrolling down and clicking 'Use via API', or run it privately by [cloning this space on Hugging Face](https://huggingface.co/spaces/TeamTonic/NeuralChat?duplicate=true). [Join my active builders' server on Discord](https://discord.gg/VqTxc76K3u). Let's build together!"""
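
# The description above mentions calling this Space via the Gradio API. A minimal
# sketch with gradio_client (the endpoint name "/process" is an assumption based on
# the click handler below; check the Space's "Use via API" panel for the exact name):
#
#   from gradio_client import Client
#   client = Client("TeamTonic/NeuralChat")
#   reply = client.predict(
#       "Hello!",             # user_message
#       "",                   # system_message
#       780,                  # max_new_tokens
#       0.3,                  # temperature
#       0.9,                  # top_p
#       1.0,                  # repetition_penalty
#       False,                # do_sample
#       api_name="/process",
#   )
#   print(reply)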
# Cap CUDA allocator block splitting to reduce memory fragmentation on small GPUs
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:50'
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model_name = "Intel/neural-chat-7b-v3-1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="auto")
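# TextStreamer prints tokens to the server console as they are generated;
# the Gradio textbox still receives the complete decoded response at the end.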
streamer = TextStreamer(tokenizer)
class IntelChatBot:
def __init__(self, model, tokenizer, system_message="You are 🧠🤌🏻Neuro, an AI language model created by Tonic-AI. You are a cautious assistant. You carefully follow instructions. You are helpful and harmless and you follow ethical guidelines and promote positive behavior."):
self.model = model
self.tokenizer = tokenizer
self.system_message = system_message
def set_system_message(self, new_system_message):
self.system_message = new_system_message
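
    # format_prompt follows the neural-chat v3 prompt template from the model card:
    #   ### System:
    #   {system_message}
    #   ### User:
    #   {user_message}
    #   ### Assistant: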
def format_prompt(self, user_message):
        prompt = f"### System:\n{self.system_message}\n### User:\n{user_message}\n### Assistant:\n"
return prompt
def neuro(self, user_message, temperature, max_new_tokens, top_p, repetition_penalty, do_sample):
prompt = self.format_prompt(user_message)
inputs = self.tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
input_ids = inputs["input_ids"].to(self.model.device)
attention_mask = inputs["attention_mask"].to(self.model.device)
output_ids = self.model.generate(
input_ids,
attention_mask=attention_mask,
max_length=input_ids.shape[1] + max_new_tokens,
temperature=temperature,
top_p=top_p,
repetition_penalty=repetition_penalty,
streamer=streamer,
do_sample=do_sample
)
        # Decode only the newly generated tokens so the prompt is not echoed back to the UI
        response = self.tokenizer.decode(output_ids[0][input_ids.shape[1]:], skip_special_tokens=True)
return response
def gradio_predict(user_message, system_message, max_new_tokens, temperature, top_p, repetition_penalty, do_sample):
    Intel_bot.set_system_message(system_message)
    if not do_sample:
        # "Advanced" unchecked: ignore the sliders and fall back to fixed defaults.
        # Greedy decoding ignores temperature/top_p; a repetition penalty of 1.0 is neutral.
        max_new_tokens = 780
        temperature = 1.0
        top_p = 1.0
        repetition_penalty = 1.0
    response = Intel_bot.neuro(user_message, temperature, max_new_tokens, top_p, repetition_penalty, do_sample)
    return response
Intel_bot = IntelChatBot(model, tokenizer)
with gr.Blocks(theme = "ParityError/Anime") as demo:
gr.Markdown(title)
gr.Markdown(description)
with gr.Row():
system_message = gr.Textbox(label="Optional 🧠🤌🏻NeuralChat Assistant Message", lines=2)
user_message = gr.Textbox(label="Your Message", lines=3)
with gr.Row():
do_sample = gr.Checkbox(label="Advanced", value=False)
    with gr.Accordion("Advanced Settings", open=False):  # `open` expects a bool, not a callable
with gr.Row():
max_new_tokens = gr.Slider(label="Max new tokens", value=780, minimum=150, maximum=3200, step=1)
temperature = gr.Slider(label="Temperature", value=0.3, minimum=0.1, maximum=1.0, step=0.1)
top_p = gr.Slider(label="Top-p (nucleus sampling)", value=0.90, minimum=0.01, maximum=0.99, step=0.05)
            repetition_penalty = gr.Slider(label="Repetition penalty", value=1.0, minimum=1.0, maximum=2.0, step=0.05)
submit_button = gr.Button("Submit")
output_text = gr.Textbox(label="🧠🤌🏻NeuralChat Response")
def process(user_message, system_message, max_new_tokens, temperature, top_p, repetition_penalty, do_sample):
return gradio_predict(user_message, system_message, max_new_tokens, temperature, top_p, repetition_penalty, do_sample)
submit_button.click(
process,
inputs=[user_message, system_message, max_new_tokens, temperature, top_p, repetition_penalty, do_sample],
outputs=output_text
)
demo.launch()