import gradio as gr
import time
import ctypes  # optional: allows calling the llama.cpp C API directly
import llama_cpp
from llama_cpp import Llama
from huggingface_hub import hf_hub_download  # download the model file from the Hugging Face Hub


# Download the GGML model from the Hugging Face Hub; n_ctx=2048 gives a larger context window
llm = Llama(
    model_path=hf_hub_download(
        repo_id="TheBloke/Vigogne-2-7B-Chat-GGML",
        filename="vigogne-2-7b-chat.ggmlv3.q4_1.bin",
    ),
    n_ctx=2048,
)

# gr.ChatInterface passes the running conversation history into the callback on each call,
# so no module-level history variable is needed.

def generate_text(input_text, history):
    # history is a list of [user_message, bot_response] pairs supplied by gr.ChatInterface
    print("history ", history)
    print("input ", input_text)
    if history == []:
        input_text_with_history = f"Q: {input_text} \n A:"
    else:
        # Prepend the previous bot response so the model keeps some conversational context
        input_text_with_history = history[-1][1] + "\n"
        input_text_with_history += f"Q: {input_text} \n A:"
    print("new input", input_text_with_history)
    output = llm(input_text_with_history, max_tokens=1024, stop=["Q:", "\n"], echo=True)
    response = output['choices'][0]['text'] + "\n"
    print("response", response)
    return response
    

demo = gr.ChatInterface(generate_text)
demo.queue(concurrency_count=1, max_size=5)  # process one request at a time, queue at most 5
demo.launch()