### Example Code

```python
from vllm import LLM, SamplingParams
import gradio as gr
import os

# Pin the process to GPU 1
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

model_path = "gangyeolkim/llama-3-chat"
sampling_params = SamplingParams(max_tokens=8192, temperature=0.7, stop=["### Human:", "### Instruct:"])
llm = LLM(model=model_path, tokenizer=model_path, tensor_parallel_size=1)

# Maximum number of history turns to include in the prompt
history_limit = 10

def gen(text, history):
    # Use only the most recent `history_limit` entries
    limited_history = history[-history_limit:]

    # Build the conversation history into the prompt. The system prompt
    # (in Korean) says: "The following is a conversation between an AI
    # assistant called AI and a human user called Human. The AI ignores
    # requests for profanity. The AI never swears at or insults the Human."
    conversation_history = "다음은 AI라는 AI 어시스턴트와 Human이라는 인간 사용자 간의 대화입니다. AI는 욕설에 대한 요구를 무시합니다. AI는 Human에게 절대 욕설과 비난을 하지 않습니다.\n\n### Instruct:\n\n"

    for user_input, assistant_response in limited_history:
        conversation_history += f"### Human:{user_input}\n\n### AI:{assistant_response}\n\n"

    prompt = f"{conversation_history}### Human:{text}\n\n### AI:"
    outputs = llm.generate(prompt, sampling_params)
    generated_text = outputs[0].outputs[0].text.strip()
    print(f"generated_text : {generated_text}")
    return generated_text

demo = gr.ChatInterface(fn=gen)
demo.launch(share=True)
```

### Base Model

- allganize/Llama-3-Alpha-Ko-8B-Instruct
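
For a quick sanity check of the example code above without launching the Gradio UI, `gen` can also be called directly. This is a minimal sketch assuming the script above has already been executed (so `llm`, `sampling_params`, and `gen` are defined); the prior history turn shown is hypothetical and uses Gradio's `(user, assistant)` tuple format:

```python
# Smoke test for gen() without the Gradio UI. Assumes the example
# script above has been run, so `llm`, `sampling_params`, and `gen`
# are in scope. The history entry is a hypothetical prior exchange.
history = [("안녕하세요", "안녕하세요! 무엇을 도와드릴까요?")]  # "Hello" / "Hello! How can I help you?"
print(gen("자기소개를 해 주세요.", history))  # "Please introduce yourself."
```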