File size: 1,382 Bytes
703b356
ab454cd
703b356
 
 
 
 
ab454cd
703b356
ab454cd
703b356
 
 
ab454cd
703b356
 
ab454cd
703b356
 
 
ab454cd
703b356
 
ab454cd
703b356
 
 
 
ab454cd
703b356
 
ab454cd
703b356
 
 
ab454cd
703b356
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
### Example Code

```python
from vllm import LLM, SamplingParams
import gradio as gr
import os
from transformers import AutoTokenizer

# Expose only the CUDA device with index 1 to this process.
# NOTE(review): this runs after `vllm` has been imported; presumably it still
# takes effect because the engine is only constructed below -- confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# Model identifier (looks like a Hugging Face Hub repo id); the same value is
# passed below for both the model and the tokenizer.
model_path = "gangyeolkim/llama-3-chat"
# Decoding settings shared by every request: at most 8192 generated tokens,
# temperature 0.7, and generation stops when the model emits one of the
# section markers ("### Human:" / "### Instruct:") used in the prompt layout.
sampling_params = SamplingParams(max_tokens=8192, temperature=0.7, stop=["### Human:", "### Instruct:"])
# vLLM engine built once at import time with tensor_parallel_size=1.
llm = LLM(model=model_path, tokenizer=model_path, tensor_parallel_size=1)

# Maximum number of past chat history entries to include in the prompt.
history_limit = 10

def gen(text, history):
    # Use only the most recent `history_limit` entries
    limited_history = history[-history_limit:]

    # Build conversation history into the prompt
    conversation_history = "๋‹ค์Œ์€ AI๋ผ๋Š” AI ์–ด์‹œ์Šคํ„ดํŠธ์™€ Human์ด๋ผ๋Š” ์ธ๊ฐ„ ์‚ฌ์šฉ์ž ๊ฐ„์˜ ๋Œ€ํ™”์ž…๋‹ˆ๋‹ค. AI๋Š” ์š•์„ค์— ๋Œ€ํ•œ ์š”๊ตฌ๋ฅผ ๋ฌด์‹œํ•ฉ๋‹ˆ๋‹ค. AI๋Š” Human์—๊ฒŒ ์ ˆ๋Œ€ ์š•์„ค๊ณผ ๋น„๋‚œ์„ ํ•˜์ง€ ์•Š์Šต๋‹ˆ๋‹ค.\n\n### Instruct:\n\n"

    for user_input, assistant_response in limited_history:
        conversation_history += f"### Human:{user_input}\n\n### AI:{assistant_response}\n\n"
    
    prompt = f"{conversation_history}### Human:{text}\n\n### AI:"

    outputs = llm.generate(prompt, sampling_params)    
    generated_text = outputs[0].outputs[0].text.strip()

    print(f"generated_text : {generated_text}")
    
    return generated_text

# Wrap `gen` in a Gradio chat UI; `gen` receives the new message and the chat
# history as its two arguments.
demo = gr.ChatInterface(fn=gen)
# NOTE(review): share=True asks Gradio for a public share link in addition to
# the local server, which would expose the model to anyone holding the link --
# confirm this is intended.
demo.launch(share=True)
```