llama-3-chat / README.md
gangyeolkim's picture
Update README.md
703b356 verified
|
raw
history blame
1.38 kB

Example Code

from vllm import LLM, SamplingParams
import gradio as gr
import os
from transformers import AutoTokenizer

# Restrict CUDA to the device with index 1 before the engine is created.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# Hugging Face repo id, used for both the model weights and the tokenizer.
model_path = "gangyeolkim/llama-3-chat"

# Generation settings. The stop strings are the section markers used in the
# prompt, so decoding ends before the model starts writing another section.
sampling_params = SamplingParams(
    max_tokens=8192,
    temperature=0.7,
    stop=["### Human:", "### Instruct:"],
)

# Single-GPU vLLM engine (no tensor parallelism).
llm = LLM(
    model=model_path,
    tokenizer=model_path,
    tensor_parallel_size=1,
)

# Maximum number of past chat history entries to include in the prompt.
history_limit = 10

# System preamble that opens every prompt (Korean). In English: "The following
# is a conversation between an AI assistant called AI and a human user called
# Human. AI ignores requests for profanity. AI never directs profanity or
# criticism at Human." The trailing "### Instruct:" marker starts the dialogue.
_SYSTEM_PROMPT = (
    "다음은 AI라는 AI 어시스턴트와 Human이라는 인간 사용자 간의 대화입니다. "
    "AI는 욕설에 대한 요구를 무시합니다. "
    "AI는 Human에게 절대 욕설과 비난을 하지 않습니다."
    "\n\n### Instruct:\n\n"
)


def _as_text(value):
    """Return *value* as prompt text, mapping a missing value to ""."""
    return "" if value is None else str(value)


def _history_turns(history):
    """Normalize chat history into a list of (user, assistant) text pairs.

    Accepts either pair-style entries (``[user, assistant]``) or
    message-style dict entries (``{"role": ..., "content": ...}``).
    """
    turns = []
    pending_user = None
    for entry in history or []:
        if isinstance(entry, dict):
            role = entry.get("role")
            content = _as_text(entry.get("content"))
            if role == "user":
                # Two user messages in a row: keep the first with no reply.
                if pending_user is not None:
                    turns.append((pending_user, ""))
                pending_user = content
            elif role == "assistant":
                turns.append((pending_user or "", content))
                pending_user = None
            # Entries with any other role are not part of the dialogue.
        else:
            user_input, assistant_response = entry
            turns.append((_as_text(user_input), _as_text(assistant_response)))
    if pending_user is not None:
        turns.append((pending_user, ""))
    return turns


def gen(text, history):
    """Generate the assistant's reply to *text* given the prior chat history.

    Args:
        text: The latest user message.
        history: Previous conversation as passed by the chat UI; either
            (user, assistant) pairs or role/content message dicts.

    Returns:
        The generated reply with surrounding whitespace stripped, or an
        empty string if the engine returned no completion.
    """
    turns = _history_turns(history)
    # A plain `turns[-0:]` would return everything, so a non-positive limit
    # is handled explicitly as "no history".
    recent_turns = turns[-history_limit:] if history_limit > 0 else []

    # Assemble the prompt: preamble, past turns, then the new user message
    # followed by an open "### AI:" marker for the model to complete.
    parts = [_SYSTEM_PROMPT]
    parts.extend(
        f"### Human:{user_input}\n\n### AI:{assistant_response}\n\n"
        for user_input, assistant_response in recent_turns
    )
    parts.append(f"### Human:{text}\n\n### AI:")
    prompt = "".join(parts)

    outputs = llm.generate(prompt, sampling_params)
    if outputs and outputs[0].outputs:
        generated_text = outputs[0].outputs[0].text.strip()
    else:
        generated_text = ""

    print(f"generated_text : {generated_text}")

    return generated_text

# Wrap `gen` in Gradio's chat UI; it is called with the new message and the
# chat history on every user turn.
demo = gr.ChatInterface(
    fn=gen,
)

# Start the web server; share=True also requests a public share link.
demo.launch(
    share=True,
)