import gradio as gr
import copy
import time
import ctypes  # to call into the C API directly
import llama_cpp
from llama_cpp import Llama
from huggingface_hub import hf_hub_download  # download model files from the Hugging Face Hub
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain


llm = Llama(model_path=hf_hub_download(repo_id="TheBloke/Dolphin-Llama2-7B-GGML", filename="dolphin-llama2-7b.ggmlv3.q4_1.bin"), n_ctx=2048)  # download the model from the Hugging Face Hub; n_ctx=2048 for a larger context length

history = []

pre_prompt = " The user and the AI are having a conversation : <|endoftext|> \n "

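# Helpers for a PDF question-answering flow: extract text, split it into chunks,
# and index the chunks in a FAISS vector store. They are not called by the chat
# interface below.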
def get_pdf_text(pdfs):
    text = ""
    for pdf in pdfs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            text += page.extract_text()
    return text

def get_text_chunks(text):
    text_splitter = CharacterTextSplitter(
        separator="\n", chunk_size=1000, chunk_overlap=200, length_function=len)
    chunks = text_splitter.split_text(text)
    return chunks

def get_vectorstore(text_chunks):
    embeddings = OpenAIEmbeddings()
#     embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
    return vectorstore
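
# A minimal, commented-out sketch of how the PDF helpers and langchain imports
# above could be wired into a retrieval chain (assumes OPENAI_API_KEY is set,
# e.g. via load_dotenv(); "example.pdf" below is a placeholder path):
#
#   load_dotenv()
#   raw_text = get_pdf_text(["example.pdf"])
#   chunks = get_text_chunks(raw_text)
#   vectorstore = get_vectorstore(chunks)
#   memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
#   rag_chain = ConversationalRetrievalChain.from_llm(
#       llm=ChatOpenAI(), retriever=vectorstore.as_retriever(), memory=memory)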

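# Streaming chat callback for gr.ChatInterface: builds the prompt from the last
# turn and the new user message, then yields the growing completion as llama.cpp
# streams tokens.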
def generate_text(input_text, history):
    print("history ", history)
    print("input ", input_text)
    temp = ""
    if history == []:
        input_text_with_history = f"SYSTEM:{pre_prompt}" + "\n" + f"USER: {input_text} " + "\n" + " ASSISTANT:"
    else:
        input_text_with_history = f"{history[-1][1]}" + "\n"
        input_text_with_history += f"USER: {input_text}" + "\n" + " ASSISTANT:"
    print("new input", input_text_with_history)
    output = llm(
        input_text_with_history,
        max_tokens=1024,
        stop=["<|prompter|>", "<|endoftext|>", "<|endoftext|> \n", "ASSISTANT:", "USER:", "SYSTEM:"],
        stream=True,
    )
    for out in output:
        stream = copy.deepcopy(out)
        print(stream["choices"][0]["text"])
        temp += stream["choices"][0]["text"]
        yield temp  # yield the text accumulated so far so the UI streams it


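# Hook the streaming generator into a Gradio ChatInterface and serve it with a
# small request queue (one concurrent worker, up to five queued requests).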
demo = gr.ChatInterface(generate_text,
    title="LLM on CPU",
    description="Running LLM with https://github.com/abetlen/llama-cpp-python. btw the text streaming thing was the hardest thing to impliment",
    examples=["Hello", "Am I cool?", "Are tomatoes vegetables?"],
    cache_examples=True,
    retry_btn=None,
    undo_btn="Delete Previous",
    clear_btn="Clear",)
demo.queue(concurrency_count=1, max_size=5)
demo.launch()