Example Code
from vllm import LLM, SamplingParams
import gradio as gr
import os
from transformers import AutoTokenizer
# Limit CUDA device visibility for this process to GPU index 1.
# NOTE(review): this is assigned after `vllm` is imported — confirm it still
# takes effect before any CUDA context is created in your environment.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# Model identifier; the same value is used for the weights and the tokenizer.
model_path = "gangyeolkim/llama-3-chat"

# Generation settings: the stop strings are the turn markers used in the
# prompt format, so generation halts before the model writes another turn.
sampling_params = SamplingParams(
    max_tokens=8192,
    temperature=0.7,
    stop=["### Human:", "### Instruct:"],
)
llm = LLM(model=model_path, tokenizer=model_path, tensor_parallel_size=1)

# Maximum number of past chat exchanges included in the prompt.
history_limit = 10
def gen(text, history):
# Use only the most recent `history_limit` entries
limited_history = history[-history_limit:]
# Build conversation history into the prompt
conversation_history = "๋ค์์ AI๋ผ๋ AI ์ด์์คํดํธ์ Human์ด๋ผ๋ ์ธ๊ฐ ์ฌ์ฉ์ ๊ฐ์ ๋ํ์
๋๋ค. AI๋ ์์ค์ ๋ํ ์๊ตฌ๋ฅผ ๋ฌด์ํฉ๋๋ค. AI๋ Human์๊ฒ ์ ๋ ์์ค๊ณผ ๋น๋์ ํ์ง ์์ต๋๋ค.\n\n### Instruct:\n\n"
for user_input, assistant_response in limited_history:
conversation_history += f"### Human:{user_input}\n\n### AI:{assistant_response}\n\n"
prompt = f"{conversation_history}### Human:{text}\n\n### AI:"
outputs = llm.generate(prompt, sampling_params)
generated_text = outputs[0].outputs[0].text.strip()
print(f"generated_text : {generated_text}")
return generated_text
# Wrap `gen` in a Gradio chat interface; `gen` receives the new message and
# the prior history as its two arguments.
demo = gr.ChatInterface(fn=gen)
# NOTE(review): share=True asks Gradio for a shareable link in addition to the
# local server — confirm that exposing the model this way is intended.
demo.launch(share=True)
Base Model
- allganize/Llama-3-Alpha-Ko-8B-Instruct
- Downloads last month
- 2,109
This model does not have enough activity to be deployed to Inference API (serverless) yet. Increase its social
visibility and check back later, or deploy to Inference Endpoints (dedicated)
instead.