MuntasirHossain committed
Update app.py
app.py CHANGED
@@ -2,9 +2,7 @@ import gradio as gr
 import os
 import requests
 from llama_cpp import Llama
-
-import transformers
-import torch
+
 
 llm_name = "MuntasirHossain/Meta-Llama-3-8B-OpenOrca-GGUF"
 llm_path = os.path.basename(llm_name)
@@ -12,15 +10,9 @@ llm_path = os.path.basename(llm_name)
 # download gguf model
 def download_llms(llm_name):
     """Download GGUF model"""
-
     download_url = ""
     print("Downloading " + llm_name)
     download_url = "https://huggingface.co/MuntasirHossain/Meta-Llama-3-8B-OpenOrca-GGUF/resolve/main/Q4_K_M.gguf"
-
-    # elif selected_llm == 'microsoft/Phi-3-mini-4k-instruct':
-    #     download_url = "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q4.gguf"
-    # elif selected_llm == 'mistralai/Mistral-7B-Instruct-v0.2':
-    #     download_url = "https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q2_K.gguf"
 
     if not os.path.exists("model"):
         os.makedirs("model")
@@ -43,42 +35,33 @@ def download_llms(llm_name):
     print(f"Model download completed {response.status_code}")
 
 # define model pipeline with llama-cpp
-def initialize_llm(llm_model):
+def initialize_llm(llm_model):
     model_path = ""
     if llm_model == llm_name:
         model_path = "model/Q4_K_M.gguf"
         download_llms(llm_model)
     llm = Llama(
         model_path=model_path,
-        #
-        # max_tokens=256,
-        # top_p=1,
-        # top_k= top_k,
-        n_ctx=1024,
+        n_ctx=1024, # input text context length, 0 = from model
         verbose=False
     )
     return llm
 
 llm = initialize_llm(llm_name)
 
-# format prompt as per the
+# format prompt as per the ChatML template. The model was fine-tuned with this chat template
 def format_prompt(input_text, history):
-    system_prompt = "You are a helpful AI assistant. You are truthful in your response
+    system_prompt = """You are a helpful AI assistant. You are truthful in your response for real-world matters
+    but you are also creative for imaginative/fictional tasks."""
     prompt = ""
     if history:
         for previous_prompt, response in history:
             prompt += f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n{previous_prompt}<|im_end|>\n<|im_start|>assistant\n{response}<|im_end|>"
-            # <start_of_turn>user
-            # {previous_prompt}<end_of_turn>
-            # <start_of_turn>model
-            # {response}<end_of_turn>
     prompt += f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n{input_text}<|im_end|>\n<|im_start|>assistant"
-    # <start_of_turn>user
-    # {input_text}<end_of_turn>
-    # <start_of_turn>model"""
     return prompt
 
-def generate(prompt, history, max_new_tokens=256): # temperature=0.95, top_p=0.9
+# generate llm response
+def generate(prompt, history, max_new_tokens=256): # temperature=0.95, top_p=0.9
     if not history:
         history = []
 
@@ -89,39 +72,35 @@ def generate(prompt, history, max_new_tokens=256): # temperature=0.95, top_p=0.9
         # temperature=temperature,
         max_tokens=max_new_tokens,
         # top_p=top_p,
-        # repetition_penalty=repetition_penalty,
-        # do_sample=True,
         stop=["<|im_end|>"]
     )
 
     formatted_prompt = format_prompt(prompt, history)
 
+    # generate a streaming response
     response = llm(formatted_prompt, **kwargs, stream=True)
     output = ""
     for chunk in response:
         output += chunk['choices'][0]['text']
         yield output
     return output
-
-    #
-    # output += chunk.token.text
-    # yield output
-    # return output
-
+
+    # # generate response without streaming
     # response = llm(formatted_prompt, **kwargs)
     # return response['choices'][0]['text']
 
 chatbot = gr.Chatbot(height=500)
-with gr.Blocks(theme=gr.themes.
+with gr.Blocks(theme=gr.themes.Default(primary_hue="red", secondary_hue="pink")) as demo:
     gr.HTML("<center><h1>Fine-tuned Meta-Llama-3-8B</h1><center>")
+    gr.Markdown("<b>This AI agent is using the MuntasirHossain/Meta-Llama-3-8B-OpenOrca-GGUF model for text-generation.</b>")
     gr.ChatInterface(
        generate,
        chatbot=chatbot,
        retry_btn=None,
        undo_btn=None,
        clear_btn="Clear",
-        description="This AI agent is using the MuntasirHossain/Meta-Llama-3-8B-OpenOrca-GGUF model for text-generation",
+        # description="This AI agent is using the MuntasirHossain/Meta-Llama-3-8B-OpenOrca-GGUF model for text-generation.",
        # additional_inputs=additional_inputs,
-        examples=[["
+        examples=[["What is a large language model?"], ["What is the meaning of life?"], ["Write a short fictional story about a planet named 'Orca'."]]
     )
     demo.queue().launch()
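For context, a minimal usage sketch of the pattern this update settles on, assuming model/Q4_K_M.gguf has already been downloaded: the prompt string follows the same ChatML template that format_prompt builds, and the streaming loop mirrors generate. The user_prompt text below is illustrative only.

from llama_cpp import Llama

# Load the quantized model the same way initialize_llm does (assumes model/Q4_K_M.gguf exists)
llm = Llama(model_path="model/Q4_K_M.gguf", n_ctx=1024, verbose=False)

# Single-turn ChatML prompt, matching the template used in format_prompt
system_prompt = "You are a helpful AI assistant."
user_prompt = "What is a large language model?"
prompt = (
    f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
    f"<|im_start|>user\n{user_prompt}<|im_end|>\n"
    f"<|im_start|>assistant"
)

# Stream the completion and accumulate the text deltas, as generate() does
output = ""
for chunk in llm(prompt, max_tokens=256, stop=["<|im_end|>"], stream=True):
    output += chunk["choices"][0]["text"]
print(output)

Streaming with stream=True and yielding the accumulated text is what lets the Gradio ChatInterface render the reply incrementally instead of waiting for the full completion.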