import gradio as gr
import torch
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    TextIteratorStreamer,
)
import os
from threading import Thread
import spaces
import time
import subprocess
# The OpenELM config and modeling code ship as remote code on the Hub; they are
# not importable via transformers_modules, so fetch the config through AutoConfig.
config = AutoConfig.from_pretrained("apple/OpenELM-270M", trust_remote_code=True)
# Install the flash-attn library, skipping the CUDA build step.
# Merging os.environ keeps pip on PATH; passing only the flag would replace the whole environment.
subprocess.run(
    "pip install flash-attn --no-build-isolation",
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    shell=True,
)
# Read the Hugging Face access token
token = os.environ["HF_TOKEN"]
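# Note: HF_TOKEN must be set as a secret in the Space settings; os.environ.get("HF_TOKEN")
# is a gentler alternative that returns None instead of raising KeyError when it is missing.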
# Load the model and tokenizer together (earlier revisions of this Space used
# microsoft/Phi-3-mini-128k-instruct and beomi/Llama-3-KoEn-8B-Instruct-preview)
model = AutoModelForCausalLM.from_pretrained(
    # "microsoft/Phi-3-mini-128k-instruct",
    "apple/OpenELM-270M",
    token=token,
    trust_remote_code=True,
)
# Bug fix: rely on trust_remote_code instead of instantiating OpenELMConfig by hand
# tok = AutoTokenizer.from_pretrained("apple/OpenELM-3B-Instruct", token=token, trust_remote_code=True)
tok = AutoTokenizer.from_pretrained(
    "apple/OpenELM-270M",
    token=token,
    trust_remote_code=True,
)
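# Note: Apple's OpenELM model card pairs these checkpoints with the (gated)
# meta-llama/Llama-2-7b-hf tokenizer; if loading the tokenizer from the OpenELM
# repo fails, an alternative is:
# tok = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", token=token)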
# Set the end-of-sequence (EOS) token IDs that stop generation
terminators = [
    tok.eos_token_id,
]
# Move the model to the GPU if one is available, otherwise run on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using GPU: {torch.cuda.get_device_name(device)}")
else:
    device = torch.device("cpu")
    print("Using CPU")
model = model.to(device)
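# Note: on ZeroGPU Spaces, CUDA only becomes available inside functions decorated
# with @spaces.GPU, so this startup check may report CPU even when a GPU is granted.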
# Run the chat function on a Spaces GPU worker; the GPU is held for at most 60 seconds per call.
@spaces.GPU(duration=60)
def chat(message, history, temperature, do_sample, max_tokens):
    # Convert the chat history into the message format expected by the chat template
    chat = []
    for item in history:
        chat.append({"role": "user", "content": item[0]})
        if item[1] is not None:
            chat.append({"role": "assistant", "content": item[1]})
    chat.append({"role": "user", "content": message})
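    # e.g. [{"role": "user", "content": "Hi"},
    #       {"role": "assistant", "content": "Hello!"},
    #       {"role": "user", "content": "<current message>"}]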
    # Apply the chat template and tokenize the input
    messages = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    model_inputs = tok([messages], return_tensors="pt").to(device)
    # Stream the model output incrementally with TextIteratorStreamer
    streamer = TextIteratorStreamer(
        tok, timeout=20.0, skip_prompt=True, skip_special_tokens=True
    )
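    # skip_prompt drops the echoed input text; timeout makes iteration raise
    # if no new text arrives from the generation thread within 20 seconds.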
    # Set up the generation parameters
    generate_kwargs = dict(
        model_inputs,
        streamer=streamer,
        max_new_tokens=max_tokens,  # maximum number of new tokens to generate
        do_sample=do_sample,  # whether to sample (wired to the Sampling checkbox)
        temperature=temperature,  # higher values increase diversity
        eos_token_id=terminators,  # end-of-sequence token IDs
    )
    # Fall back to greedy decoding when temperature is 0
    if temperature == 0:
        generate_kwargs["do_sample"] = False
    # Start generation in a separate thread so this one can stream the output
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()
    # Yield the accumulated text as new chunks arrive
    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        yield partial_text
    # Emit the final text once more after the stream closes
    yield partial_text
# Build the conversational UI with Gradio's ChatInterface
demo = gr.ChatInterface(
    fn=chat,
    examples=[["Write me a poem about Machine Learning."]],
    additional_inputs_accordion=gr.Accordion(
        label="⚙️ Parameters", open=False, render=False
    ),
    additional_inputs=[
        gr.Slider(
            minimum=0, maximum=1, step=0.1, value=0.9, label="Temperature", render=False
        ),
        gr.Checkbox(label="Sampling", value=True, render=False),
        gr.Slider(
            minimum=128,
            maximum=4096,
            step=1,
            value=512,
            label="Max new tokens",
            render=False,
        ),
    ],
    stop_btn="Stop Generation",
    title="Chat With LLMs",
    description="Now Running [apple/OpenELM-270M](https://huggingface.co/apple/OpenELM-270M)",
)
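# The additional inputs are passed positionally to chat() after (message, history):
# temperature, do_sample, max_tokens.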
# Launch the Gradio interface
demo.launch()