import gradio as gr
import numpy as np
from transformers import pipeline
from custom_chat_interface import CustomChatInterface
from llama_cpp import Llama
"""
For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
"""

class MyModel:
    def __init__(self):
        self.client = None
        self.current_model = ""

    def respond(
        self,
        message,
        history: list[tuple[str, str]],
        model,
        system_message,
        max_tokens,
        temperature,
        top_p,
    ):
        # (Re)load the selected GGUF model only when the dropdown choice changes
        # or no model has been loaded yet.
        if self.client is None or model != self.current_model:
            model_id, filename = model.split(",")
            client = Llama.from_pretrained(
                repo_id=model_id.strip(),
                filename=f"*{filename.strip()}*.gguf",
                n_ctx=2048,  # context window size; increase if prompt plus history grows longer
            )
            self.client = client
            self.current_model = model

        # Rebuild the chat transcript: system prompt followed by prior user/assistant turns.
        messages = [{"role": "system", "content": system_message}]
        for user_msg, assistant_msg in history:
            if user_msg:
                messages.append({"role": "user", "content": user_msg})
            if assistant_msg:
                messages.append({"role": "assistant", "content": assistant_msg})
        messages.append({"role": "user", "content": message})

        # Stream the completion, yielding the accumulated response after each new chunk.
        response = ""
        for chunk in self.client.create_chat_completion(
            messages,
            temperature=temperature,
            top_p=top_p,
            stream=True,
            max_tokens=max_tokens,
        ):
            delta = chunk["choices"][0]["delta"]
            if "content" in delta:
                response += delta["content"]
                yield response
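
# Speech-to-text pipeline (Whisper base.en); passed to CustomChatInterface below as the
# `transcriber` callback so audio input can be transcribed into text for the chat.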
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")

def transcribe(audio):
    sr, y = audio
    # Convert to mono if stereo
    if y.ndim > 1:
        y = y.mean(axis=1)
    y = y.astype(np.float32)
    # Normalize to [-1, 1]; guard against division by zero on silent input.
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak
    text = transcriber({"sampling_rate": sr, "raw": y})["text"]
    return text
"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
my_model = MyModel()
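
# Each entry is "<Hub repo id>, <quantization tag>"; respond() splits on the comma and
# matches the tag against the repo's GGUF filenames (filename=f"*{tag}*.gguf").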
model_choices = [
    "lab2-as/lora_model_gguf, Q4",
    "lab2-as/lora_model_no_quant_gguf, Q4",
    "lab2-as/lora_model_math_optimized_gguf, Q4",
]

demo = CustomChatInterface(
    my_model.respond,
    transcriber=transcribe,
    additional_inputs=[
        gr.Dropdown(
            choices=model_choices,
            value=model_choices[0],
            label="Select Model",
        ),
        gr.Textbox(
            value="You are a chatbot with a proficiency in math. You are to answer the mathematical equations correctly and efficiently. You are to reason and explain your solutions thoroughly.",
            label="System message",
        ),
        gr.Slider(
            minimum=1,
            maximum=2048,
            value=512,
            step=1,
            label="Max new tokens",
        ),
        gr.Slider(
            minimum=0.1,
            maximum=4.0,
            value=0.4,
            step=0.1,
            label="Temperature",
        ),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.15,
            step=0.05,
            label="Top-p (Nucleus sampling)",
        ),
    ],
)

if __name__ == "__main__":
    demo.launch()