
### Example Code

```python
from vllm import LLM, SamplingParams
import gradio as gr
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "1"

model_path = "gangyeolkim/llama-3-chat"
sampling_params = SamplingParams(max_tokens=8192, temperature=0.7, stop=["### Human:", "### Instruct:"])
llm = LLM(model=model_path, tokenizer=model_path, tensor_parallel_size=1)

# Maximum number of chat history turns to include in the prompt
history_limit = 10

def gen(text, history):
    # Use only the most recent `history_limit` entries
    limited_history = history[-history_limit:]

    # System prompt (Korean): "The following is a conversation between an AI assistant
    # called AI and a human user called Human. The AI ignores requests for profanity.
    # The AI never swears at or insults the Human."
    conversation_history = "다음은 AI라는 AI 어시스턴트와 Human이라는 인간 사용자 간의 대화입니다. AI는 욕설에 대한 요구를 무시합니다. AI는 Human에게 절대 욕설과 비난을 하지 않습니다.\n\n### Instruct:\n\n"

    # Build the conversation history into the prompt
    for user_input, assistant_response in limited_history:
        conversation_history += f"### Human:{user_input}\n\n### AI:{assistant_response}\n\n"

    prompt = f"{conversation_history}### Human:{text}\n\n### AI:"

    outputs = llm.generate(prompt, sampling_params)
    generated_text = outputs[0].outputs[0].text.strip()

    print(f"generated_text : {generated_text}")

    return generated_text

demo = gr.ChatInterface(fn=gen)
demo.launch(share=True)
```
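
For a quick sanity check without launching the Gradio UI, the `gen` function above can also be called directly. This is a minimal sketch, assuming the script above has already been executed so that `llm` and `gen` are defined; the Korean sample inputs are placeholders only:

```python
# Hypothetical smoke test: call gen() directly with a manually managed history.
# Assumes `gen` from the example above is already defined in this session.
history = []

first_reply = gen("안녕하세요!", history)           # "Hello!" with an empty history
history.append(("안녕하세요!", first_reply))         # store the turn as a (user, assistant) pair

second_reply = gen("방금 뭐라고 했어?", history)      # "What did you just say?" reuses the history
print(second_reply)
```

Note that `demo.launch(share=True)` exposes the demo through a temporary public Gradio link; drop the argument to keep the demo local.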

### Base Model

- allganize/Llama-3-Alpha-Ko-8B-Instruct