import gradio as gr
import requests, json

public_ip = '71.202.66.108'

model = 'llama3.1:latest'  # You can replace the model name if needed
context = []  # conversation context tokens returned by Ollama; reused across turns

ollama_serve = f"http://{public_ip}:11434/api/generate"
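# Ollama listens on port 11434 by default; /api/generate streams its reply
# back as newline-delimited JSON objects when requested with stream=True.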

# Call the Ollama /api/generate endpoint and stream the reply as it is generated
def generate(prompt, context, top_k, top_p, temp):
    r = requests.post(ollama_serve,
                      json={
                          'model': model,
                          'prompt': prompt,
                          'context': context,
                          'options': {
                              'top_k': top_k,
                              'top_p': top_p,
                              'temperature': temp
                          }
                      },
                      stream=True)
    r.raise_for_status()
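    # Ollama streams one JSON object per line, for example:
    #   {"model": "llama3.1:latest", "response": "Hel", "done": false}
    # The final object has "done": true and carries the updated "context" array.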

    response = ""  

    for line in r.iter_lines():
        if not line:
            continue  # skip blank keep-alive lines before parsing JSON
        body = json.loads(line)
        response_part = body.get('response', '')
        
        if 'error' in body:
            yield f"Error: {body['error']}"
            return

        # Append token to the growing response and yield the entire response so far
        if response_part:
            response += response_part
            yield response  # Yield the growing response incrementally

        if body.get('done', False):
            context[:] = body.get('context', [])  # update the shared list in place so the next turn keeps the conversation
            return  # End the generator once done

def chat(input, chat_history, top_k, top_p, temp):
    chat_history = chat_history or []
    global context
    
    # Initialize the user input as part of the chat history
    chat_history.append((input, ""))  # Add user input first
    response = ""  # Initialize empty response

    # Stream each part of the response as it's received
    response_stream = generate(input, context, top_k, top_p, temp)

    for response_part in response_stream:
        response = response_part  # Keep updating with the new part of the response
        # Update the latest assistant response (the second part of the tuple)
        chat_history[-1] = (input, response)
        yield chat_history, chat_history  # Yield the updated chat history
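    # Gradio treats chat() as a generator: each yield above re-renders the
    # Chatbot, producing the token-by-token streaming effect in the UI.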


######################### Gradio Code ##########################
block = gr.Blocks()

with block:

    gr.Markdown("""<h1><center> Trashcan AI </center></h1>""")
    gr.Markdown("""<h3><center> LLama3.1 hosted on a 2013 "Trashcan" Mac Pro with ollama </center></h3>""")

    chatbot = gr.Chatbot()
    message = gr.Textbox(placeholder="Type here")

    state = gr.State()
    with gr.Row():
        top_k = gr.Slider(0, 100, step=1, label="top_k", value=40, info="Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)")
        top_p = gr.Slider(0.0, 1.0, label="top_p", value=0.9, info="Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)")
        temp = gr.Slider(0.0, 2.0, label="temperature", value=0.8, info="The temperature of the model. Increasing the temperature will make the model answer more creatively. (Default: 0.8)")

    submit = gr.Button("SEND")
    
    # Use .click() to trigger the response streaming
    submit.click(chat, inputs=[message, state, top_k, top_p, temp], outputs=[chatbot, state])

if __name__ == "__main__":
    block.launch()
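    # If the UI needs to be reachable from other machines, launch() also accepts
    # server_name="0.0.0.0" and a server_port argument; the defaults are kept here.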