Spaces:
Running
Running
File size: 912 Bytes
5429f80 ec9d10c 5429f80 aa49098 5429f80 aa49098 5429f80 1db0c3f 5429f80 1db0c3f c62a826 5429f80 aa49098 5429f80 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 |
import gradio as gr
from llama_cpp import Llama
# Load the local GGUF model once at import time; this is the single shared
# inference engine used by ask() below. Loading can take several seconds.
llm = Llama(
    # Path to the quantized weights — presumably a Ukrainian Gemma-2B GGUF
    # sitting next to this script; TODO confirm the file ships with the Space.
    model_path="gemma-2b-uk.gguf",
    # Low thread counts to fit the free-tier CPU quota of a HF Space.
    n_threads=2,
    n_threads_batch=2,
)
def convert_history(message, history, max_turns=1):
    """Build a Gemma-style chat prompt from a Gradio chat history.

    Each past (user, assistant) exchange is rendered as
    ``<|user|>\\n...<eos>\\n<|assistant|>\\n...<eos>\\n`` and the new
    ``message`` is appended with an open ``<|assistant|>`` tag so the
    model continues from there.

    Args:
        message: The new user message to answer.
        history: Gradio history — a list of (user, assistant) pairs.
        max_turns: How many of the most recent past exchanges to keep
            (default 1, matching the original hard-coded ``history[-1:]``
            context window for the small model). ``0`` keeps none.

    Returns:
        The full prompt string to feed to the model.
    """
    # Note: history[-0:] would be the WHOLE list, so guard max_turns == 0.
    recent = history[-max_turns:] if max_turns > 0 else []
    parts = [
        f"<|user|>\n{user_text}<eos>\n<|assistant|>\n{assistant_text}<eos>\n"
        for user_text, assistant_text in recent
    ]
    parts.append(f"<|user|>\n{message}<eos>\n<|assistant|>\n")
    # join() instead of repeated += — linear instead of quadratic build.
    return "".join(parts)
def ask(message, history):
    """Stream a model reply for gr.ChatInterface.

    Converts the Gradio history into a prompt, runs the shared ``llm``
    in streaming mode, and yields the accumulated response after every
    generated chunk so the UI updates token by token.
    """
    prompt = convert_history(message, history)
    completion = llm(
        prompt,
        temperature=0.2,
        top_p=0.9,
        stream=True,
        repeat_penalty=1.05,
        max_tokens=128,
    )
    answer = ""
    for piece in completion:
        fragment = piece["choices"][0]["text"]
        # Echo each fragment to the console for live debugging.
        print(fragment)
        answer += fragment
        # Yield the running total, not the delta — ChatInterface expects
        # the full message so far on every yield.
        yield answer
# Wire the streaming handler into a ready-made chat UI.
demo = gr.ChatInterface(ask)

if __name__ == "__main__":
    # queue() is required for generator (streaming) handlers; launch()
    # starts the local web server.
    # Fix: dropped the stray trailing "|" that had crept onto this line —
    # it would raise a SyntaxError at import time.
    demo.queue().launch()