import gradio as gr
import os
import requests
from llama_cpp import Llama

llm_name = "MuntasirHossain/Meta-Llama-3-8B-OpenOrca-GGUF"
llm_path = os.path.basename(llm_name)

gguf_model = "Q4_K_M.gguf" # "Q6_K.gguf" 

# download gguf model
def download_llms(llm_name):
    """Download the GGUF model file from the Hugging Face Hub if it is not already cached locally"""
    print("Downloading " + llm_name)
    download_url = f"https://huggingface.co/MuntasirHossain/Meta-Llama-3-8B-OpenOrca-GGUF/resolve/main/{gguf_model}"

    if not os.path.exists("model"):
        os.makedirs("model")
    
    llm_filename = os.path.basename(download_url)
    llm_temp_file_path = os.path.join("model", llm_filename)

    if os.path.exists(llm_temp_file_path):
        print("Model already available")
    else:
        response = requests.get(download_url, stream=True)
        if response.status_code == 200:
            with open(llm_temp_file_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
            
            print("Download completed")
        else:
            print(f"Model download unsuccessful {response.status_code}")

# define model pipeline with llama-cpp
def initialize_llm(llm_model): 
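    """Initialize a llama-cpp Llama pipeline from the local GGUF file, downloading it first if needed"""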
    model_path = ""
    if llm_model == llm_name:
        model_path = f"model/{gguf_model}"
        download_llms(llm_model)
    llm = Llama(
        model_path=model_path,
        n_ctx=1024, # input text context length, 0 = from model
        n_threads=2,
        verbose=False
        )
    return llm
    
llm = initialize_llm(llm_name)

# format prompt as per the ChatML template. The model was fine-tuned with this chat template 
def format_prompt(input_text, history):
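    """Format the chat history and the new user input into a ChatML prompt"""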
    system_prompt = """You are an expert and  helpful AI assistant. You are truthful and constructive in your response for real-world matters 
    but you are also creative for imaginative/fictional tasks."""
    prompt = ""
    if history:
        for previous_prompt, response in history:
            prompt += f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n{previous_prompt}<|im_end|>\n<|im_start|>assistant\n{response}<|im_end|>"
    prompt += f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n{input_text}<|im_end|>\n<|im_start|>assistant"
    return prompt

# generate llm response
def generate(prompt, history, max_new_tokens=512): # temperature=0.95, top_p=0.9
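    """Stream the model's response to the formatted prompt, yielding the accumulated text"""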
    if not history:
        history = []

    # temperature = float(temperature)
    # top_p = float(top_p)

    kwargs = dict(
        # temperature=temperature,
        max_tokens=max_new_tokens,
        # top_p=top_p,
        stop=["<|im_end|>"]
    )

    formatted_prompt = format_prompt(prompt, history)

    # generate a streaming response 
    response = llm(formatted_prompt, **kwargs, stream=True)
    output = ""
    for chunk in response:
        output += chunk['choices'][0]['text']
        yield output
    return output

    # # generate response without streaming
    # response = llm(formatted_prompt, **kwargs)
    # return response['choices'][0]['text']

chatbot = gr.Chatbot(height=500)
with gr.Blocks(theme=gr.themes.Default(primary_hue="sky")) as demo:
    gr.HTML("<center><h1>Fine-tuned Meta-Llama-3-8B Chatbot</h1><center>")
    gr.Markdown("This AI agent is using the MuntasirHossain/Meta-Llama-3-8B-OpenOrca-GGUF model for text-generation. <b>Note</b>: The app is running on a free basic CPU hosted on Hugging Facce Hub. Responses may be slow!")
    gr.ChatInterface(
        generate,
        chatbot=chatbot,  
        retry_btn=None,
        undo_btn=None,
        clear_btn="Clear",
        # description="This AI agent is using the MuntasirHossain/Meta-Llama-3-8B-OpenOrca-GGUF model for text-generation.",
        # additional_inputs=additional_inputs,
        examples=[["What is code vulnerability and how Generative AI can help to address code vulnerability?"], 
                  ["Imagine there is a planet named 'Orca' where life exists and the dominant species of the inhabitants are mysterious human-like intelligence. Write a short fictional story about the survival of this dominant species in the planet's extreme conditions. Use your imagination and creativity to set the plot of the story. Keep the story within 500 words."]]
    )
demo.queue().launch()