import copy

import gradio as gr
from huggingface_hub import hf_hub_download  # fetches the model file from the Hugging Face Hub
from llama_cpp import Llama


# Download the quantized Orca Mini 3B GGML model from the Hub and load it;
# n_ctx=2048 gives a larger context window than the default.
llm = Llama(
    model_path=hf_hub_download(
        repo_id="TheBloke/orca_mini_3B-GGML",
        filename="orca-mini-3b.ggmlv3.q4_1.bin",
    ),
    n_ctx=2048,
)

# ChatInterface passes the chat history into generate_text on every call,
# so no module-level history list is needed.
pre_prompt = "The user and the AI are having a conversation: "

def generate_text(input_text, history):
    print("history", history)
    print("input", input_text)
    # First turn: seed the prompt with the pre-prompt; later turns continue
    # from the previous model answer kept in the chat history.
    if not history:
        input_text_with_history = f"{pre_prompt}\nQ: {input_text}\n A:"
    else:
        input_text_with_history = f"{history[-1][1]}\nQ: {input_text}\n A:"
    print("new input", input_text_with_history)
    output = llm(input_text_with_history, max_tokens=1024, stop=["Q:", "\n"], stream=True)

    # Stream the answer token by token. ChatInterface replaces the displayed
    # message with each yielded value, so yield the accumulated text so far.
    partial_text = ""
    for out in output:
        stream = copy.deepcopy(out)
        partial_text += stream["choices"][0]["text"]
        print(stream["choices"][0]["text"])
        yield partial_text


demo = gr.ChatInterface(generate_text)
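# Gradio 3.x queue API: run one generation at a time and hold at most
# five waiting requests.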
demo.queue(concurrency_count=1, max_size=5)
demo.launch()
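
# Compatibility note (an assumption, not pinned by the file itself): ChatInterface
# plus queue(concurrency_count=..., max_size=...) matches the Gradio 3.x API, and
# GGML weights require a pre-GGUF release of llama-cpp-python.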