### Example Code
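The script below wires a local vLLM engine running `gangyeolkim/llama-3-chat` into a Gradio `ChatInterface`: each request rebuilds the prompt from the most recent chat turns, generates a completion, and returns the text to the UI.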
```python
from vllm import LLM, SamplingParams
import gradio as gr
import os
from transformers import AutoTokenizer
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
model_path = "gangyeolkim/llama-3-chat"
sampling_params = SamplingParams(max_tokens=8192, temperature=0.7, stop=["### Human:", "### Instruct:"])
llm = LLM(model=model_path, tokenizer=model_path, tensor_parallel_size=1)
# Maximum number of history turns to include in the prompt
history_limit = 10
def gen(text, history):
    # Use only the most recent `history_limit` entries
    limited_history = history[-history_limit:]

    # Build the conversation history into the prompt
    conversation_history = (
        "The following is a conversation between an AI assistant named AI "
        "and a human user named Human. AI ignores requests for profanity. "
        "AI never swears at or insults Human.\n\n### Instruct:\n\n"
    )
    for user_input, assistant_response in limited_history:
        conversation_history += f"### Human:{user_input}\n\n### AI:{assistant_response}\n\n"

    prompt = f"{conversation_history}### Human:{text}\n\n### AI:"
    outputs = llm.generate(prompt, sampling_params)
    generated_text = outputs[0].outputs[0].text.strip()
    print(f"generated_text : {generated_text}")
    return generated_text

demo = gr.ChatInterface(fn=gen)
demo.launch(share=True)
```