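"""Gradio Space serving SimpleBerry/LLaMA-O1-Supervised-1129 as a streaming
chat demo, running a GGUF quantization of the model through llama.cpp."""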
import os
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import spaces
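# Download the GGUF weights from the Hugging Face Hub and load them with
# llama.cpp; the repo and filename can be overridden via environment variables.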
model = Llama(
    model_path=hf_hub_download(
        repo_id=os.environ.get("REPO_ID", "Lyte/LLaMA-O1-Supervised-1129-Q4_K_M-GGUF"),
        filename=os.environ.get("MODEL_FILE", "llama-o1-supervised-1129-q4_k_m.gguf"),
    )
)
DESCRIPTION = '''
# SimpleBerry/LLaMA-O1-Supervised-1129 | Duplicate this Space and set it to private for faster, personal inference at no cost.
SimpleBerry/LLaMA-O1-Supervised-1129 is an experimental research model developed by SimpleBerry, focused on advancing AI reasoning capabilities.
## This Space uses the GGUF conversion from Lyte/LLaMA-O1-Supervised-1129-GGUF. Many thanks!
**To start a new chat**, click "clear" and begin a new dialogue.
'''
LICENSE = """
--- MIT License ---
"""
template = "<start_of_father_id>-1<end_of_father_id><start_of_local_id>0<end_of_local_id><start_of_thought><problem>{content}<end_of_thought><start_of_rating><positive_rating><end_of_rating>\n<start_of_father_id>0<end_of_father_id><start_of_local_id>1<end_of_local_id><start_of_thought><expansion>"
def llama_o1_template(data):
    """Wrap a raw user message in the model's thought-tree prompt template."""
    return template.format(content=data)
@spaces.GPU
def generate_text(message, history, max_tokens=512, temperature=0.9, top_p=0.95):
    """Stream the model's completion for the latest user message, token by token."""
    input_text = llama_o1_template(message).replace('<|end_of_text|>', '')
    tokens = model.tokenize(input_text.encode('utf-8'))
    buffer = b""
    for i, token in enumerate(model.generate(tokens, top_p=top_p, temp=temperature)):
        # Stop at end-of-sequence or once the token budget is spent; llama.cpp's
        # generate() is an open-ended iterator and never stops on its own.
        if token == model.token_eos() or i >= max_tokens:
            break
        buffer += model.detokenize([token])
        # Decode the accumulated bytes rather than each token separately, so a
        # multi-byte UTF-8 character split across tokens doesn't raise an error.
        yield buffer.decode('utf-8', errors='ignore')
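# Build the Gradio UI: description, streaming chat interface with the sampling
# controls wired in as additional inputs, and the license footer.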
with gr.Blocks() as demo:
    gr.Markdown(DESCRIPTION)
    chatbot = gr.ChatInterface(
        generate_text,
        title="SimpleBerry/LLaMA-O1-Supervised-1129 | GGUF Demo",
        description="Edit Settings below if needed.",
        examples=[
            ["How many r's are in the word strawberry?"],
            ['If Diana needs to bike 10 miles to reach home and she can bike at a speed of 3 mph for two hours before getting tired, and then at a speed of 1 mph until she reaches home, how long will it take her to get home?'],
            ['Find the least odd prime factor of $2019^8+1$.'],
        ],
        cache_examples=False,
        fill_height=True,
        # Pass the sliders as additional inputs so their values actually reach
        # generate_text; standalone sliders in an accordion would be ignored.
        additional_inputs=[
            gr.Slider(minimum=1024, maximum=8192, value=2048, step=1, label="Max Tokens"),
            gr.Slider(minimum=0.1, maximum=1.5, value=0.7, step=0.1, label="Temperature"),
            gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.01, label="Top-p (nucleus sampling)"),
        ],
        additional_inputs_accordion=gr.Accordion("Adjust Parameters", open=False),
    )
    gr.Markdown(LICENSE)
if __name__ == "__main__":
    demo.launch()