llama-cpp-python

Runtime error

File size: 1,601 Bytes

06cf9c4
fabc6ce
fa0a20e
4936389
06cf9c4
fa0a20e
06cf9c4
4936389
360ead8
b689e2e
360ead8
b689e2e
06cf9c4
b689e2e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
06cf9c4
d3a9044
06cf9c4
 
397a785
06cf9c4
 
 
 
 
 
 
b689e2e
397a785
d3a9044
06cf9c4

import gradio as gr
import time
import ctypes #to run on C api directly 
import llama_cpp
from llama_cpp import Llama
from huggingface_hub import hf_hub_download #load from huggingfaces 


# Download the GGML model file from the Hugging Face Hub and load it with
# llama.cpp. n_ctx=2048 selects a long context window.
llm = Llama(
    model_path=hf_hub_download(
        repo_id="TheBloke/Vigogne-2-7B-Instruct-GGML",
        filename="vigogne-2-7b-instruct.ggmlv3.q4_1.bin",
    ),
    n_ctx=2048,
)

# Module-level log of [user message, bot response] pairs.
chat_history = []

def generate_text(message, history):
    """Generate the model's reply to a single chat message.

    The message is wrapped in a ``Q: ... A:`` prompt and completed by the
    module-level ``llm``. Each exchange is also appended to the module-level
    ``chat_history`` list as ``[message, response]``.

    Args:
        message: The user's latest message.
        history: Earlier turns of the conversation as supplied by the caller.
            It is accepted for interface compatibility but not sent to the
            model; only ``message`` is used to build the prompt.

    Returns:
        The generated answer text with surrounding whitespace removed and
        without the prompt.
    """
    prompt = f"Q: {message} \n A:"

    # echo=False keeps the prompt out of the returned text, so the reply
    # does not repeat the user's question back to them.
    # Generation stops at the next "Q:" or at the first newline.
    # NOTE(review): max_tokens=521 looks like a typo for 512 -- confirm.
    output = llm(prompt, max_tokens=521, stop=["Q:", "\n"], echo=False)
    response = output['choices'][0]['text'].strip()

    # Record the exchange as a matching (question, answer) pair.
    chat_history.append([message, response])

    return response
     

# Text components for prompt input and model output. They are built with
# gr.Textbox because the legacy gr.inputs / gr.outputs namespaces are
# deprecated in Gradio 3 and no longer exist in Gradio 4.
input_text = gr.Textbox(lines=10, label="Enter your input text")
output_text = gr.Textbox(label="Output text")

# Short blurb describing what backs this demo.
description = " currently running ggml models with llama.cpp implementation in python [https://github.com/abetlen/llama-cpp-python]"

# Sample [question, expected answer] pairs.
examples = [
    ["What is the capital of France? ", "The capital of France is Paris."],
    ["Who wrote the novel 'Pride and Prejudice'?", "The novel 'Pride and Prejudice' was written by Jane Austen."],
    ["What is the square root of 64?", "The square root of 64 is 8."],
]

# Build the chat UI around generate_text. `demo` must hold the interface
# object itself (not the return value of launch()) so that queue() and
# launch() can be called on it.
demo = gr.ChatInterface(generate_text)

if __name__ == "__main__":
    # Enable request queuing, then start the server exactly once.
    demo.queue()
    demo.launch()