Example Code

from vllm import LLM, SamplingParams
import gradio as gr
import os
from transformers import AutoTokenizer

# Expose only CUDA device index 1 to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

model_path = "gangyeolkim/llama-3-chat"

# Generation halts as soon as the model starts a new "### Human:" or
# "### Instruct:" section, i.e. the same markers `gen` uses in its prompt.
sampling_params = SamplingParams(
    max_tokens=8192,
    temperature=0.7,
    stop=["### Human:", "### Instruct:"],
)

# Weights and tokenizer are both loaded from `model_path`.
llm = LLM(
    model=model_path,
    tokenizer=model_path,
    tensor_parallel_size=1,
)

# Maximum number of past history entries included in the chat prompt
history_limit = 10

def gen(text, history):
    # Use only the most recent `history_limit` entries
    limited_history = history[-history_limit:]

    # Build conversation history into the prompt
    conversation_history = "๋‹ค์Œ์€ AI๋ผ๋Š” AI ์–ด์‹œ์Šคํ„ดํŠธ์™€ Human์ด๋ผ๋Š” ์ธ๊ฐ„ ์‚ฌ์šฉ์ž ๊ฐ„์˜ ๋Œ€ํ™”์ž…๋‹ˆ๋‹ค. AI๋Š” ์š•์„ค์— ๋Œ€ํ•œ ์š”๊ตฌ๋ฅผ ๋ฌด์‹œํ•ฉ๋‹ˆ๋‹ค. AI๋Š” Human์—๊ฒŒ ์ ˆ๋Œ€ ์š•์„ค๊ณผ ๋น„๋‚œ์„ ํ•˜์ง€ ์•Š์Šต๋‹ˆ๋‹ค.\n\n### Instruct:\n\n"

    for user_input, assistant_response in limited_history:
        conversation_history += f"### Human:{user_input}\n\n### AI:{assistant_response}\n\n"
    
    prompt = f"{conversation_history}### Human:{text}\n\n### AI:"

    outputs = llm.generate(prompt, sampling_params)    
    generated_text = outputs[0].outputs[0].text.strip()

    print(f"generated_text : {generated_text}")
    
    return generated_text

# Wrap `gen` in Gradio's chat UI; Gradio calls it as gen(message, history).
demo = gr.ChatInterface(gen)

# share=True asks Gradio for a public share link in addition to the local
# server.
demo.launch(share=True)

Base Model

  • allganize/Llama-3-Alpha-Ko-8B-Instruct
Downloads last month
3,510
Safetensors
Model size
8.03B params
Tensor type
F32
·
Inference Examples
This model does not have enough activity to be deployed to Inference API (serverless) yet. Increase its social visibility and check back later, or deploy to Inference Endpoints (dedicated) instead.