import json

import gradio as gr
import requests

public_ip = '71.202.66.108'
model = 'llama3.1:latest'  # You can replace the model name if needed
context = []  # Ollama conversation context, carried between turns
ollama_serve = f"http://{public_ip}:11434/api/generate"
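# Ollama's /api/generate endpoint streams newline-delimited JSON. Roughly
# (shape inferred from the fields this script reads, not a full schema):
#   {"response": "<token>", "done": false}
# with the final line carrying the reusable conversation state:
#   {"response": "", "done": true, "context": [<token ids>]}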
# Call the Ollama API and stream the generated tokens
def generate(prompt, context, top_k, top_p, temp):
    r = requests.post(ollama_serve,
                      json={
                          'model': model,
                          'prompt': prompt,
                          'context': context,
                          'options': {
                              'top_k': top_k,
                              'top_p': top_p,
                              'temperature': temp
                          }
                      },
                      stream=True)
    r.raise_for_status()
    response = ""
    for line in r.iter_lines():
        if not line:
            continue  # Skip keep-alive blank lines in the stream
        body = json.loads(line)
        if 'error' in body:
            yield f"Error: {body['error']}"
            return
        # Append the token to the growing response and yield the entire response so far
        response_part = body.get('response', '')
        if response_part:
            response += response_part
            yield response  # Yield the growing response incrementally
        if body.get('done', False):
            # Update the caller's list in place so the conversation context persists
            context[:] = body.get('context', [])
            return  # End the generator once done
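
# Quick sanity check for generate() outside Gradio (a sketch; assumes the
# Ollama server configured above is reachable and serving the model):
#   for partial in generate("Why is the sky blue?", context, 40, 0.9, 0.8):
#       print(partial)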
def chat(user_input, chat_history, top_k, top_p, temp):
    chat_history = chat_history or []
    global context
    # Add the user turn first, with an empty assistant reply to fill in
    chat_history.append((user_input, ""))
    # Stream each partial response as it is received
    for response in generate(user_input, context, top_k, top_p, temp):
        # Update the latest assistant reply (the second element of the tuple)
        chat_history[-1] = (user_input, response)
        yield chat_history, chat_history  # Yield the updated chat history
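# Because chat() is a generator, Gradio re-renders the Chatbot on every yield,
# which is what produces the token-by-token streaming effect in the UI.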
######################### Gradio Code ##########################
block = gr.Blocks()

with block:
    gr.Markdown("""<h1><center> Trashcan AI </center></h1>""")
    gr.Markdown("""<h3><center> Llama 3.1 hosted on a 2013 "Trashcan" Mac Pro with Ollama </center></h3>""")
    chatbot = gr.Chatbot()
    message = gr.Textbox(placeholder="Type here")
    state = gr.State()
    with gr.Row():
        top_k = gr.Slider(0, 100, step=1, label="top_k", value=40, info="Reduces the probability of generating nonsense. A higher value (e.g. 100) gives more diverse answers, while a lower value (e.g. 10) is more conservative. (Default: 40)")
        top_p = gr.Slider(0.0, 1.0, label="top_p", value=0.9, info="Works together with top_k. A higher value (e.g. 0.95) leads to more diverse text, while a lower value (e.g. 0.5) generates more focused and conservative text. (Default: 0.9)")
        temp = gr.Slider(0.0, 2.0, label="temperature", value=0.8, info="The temperature of the model. Increasing the temperature makes the model answer more creatively. (Default: 0.8)")
    submit = gr.Button("SEND")
    # .click() wires the button to the streaming chat() generator
    submit.click(chat, inputs=[message, state, top_k, top_p, temp], outputs=[chatbot, state])
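# To serve the UI beyond localhost, Gradio's launch() accepts server_name and
# server_port, e.g. block.launch(server_name="0.0.0.0", server_port=7860).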
if __name__ == "__main__":
    block.launch()