import os import copy import llama_cpp import gradio as gr from llama_cpp import Llama import datetime from huggingface_hub import hf_hub_download #MODEL SETTINGS also for DISPLAY convHistory = '' modelfile = hf_hub_download( repo_id=os.environ.get("REPO_ID", "TheBloke/Starling-LM-7B-alpha-GGUF"), filename=os.environ.get("MODEL_FILE", "starling-lm-7b-alpha.Q4_K_M.gguf"), ) #"https://huggingface.co/TheBloke/Starling-LM-7B-alpha-GGUF/blob/main/starling-lm-7b-alpha.Q4_K_M.gguf" repetitionpenalty = 1.15 contextlength=8192 logfile = 'StarlingLM7B_logs.txt' print("loading model...") stt = datetime.datetime.now() # Set gpu_layers to the number of layers to offload to GPU. Set to 0 if no GPU acceleration is available on your system. llm = Llama( model_path=modelfile, # Download the model file first n_ctx=contextlength, # The max sequence length to use - note that longer sequence lengths require much more resources #n_threads=2, # The number of CPU threads to use, tailor to your system and the resulting performance ) dt = datetime.datetime.now() - stt print(f"Model loaded in {dt}") def writehistory(text): with open(logfile, 'a') as f: f.write(text) f.write('\n') f.close() """ gr.themes.Base() gr.themes.Default() gr.themes.Glass() gr.themes.Monochrome() gr.themes.Soft() """ def combine(a, b, c, d,e,f): global convHistory import datetime SYSTEM_PROMPT = f"""{a} """ # parameters here: https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.__call__ temperature = c max_new_tokens = d repeat_penalty = f top_p = e prompt = f"GPT4 User: {b}<|end_of_turn|>GPT4 Assistant:" start = datetime.datetime.now() generation = "" delta = "" prompt_tokens = f"Prompt Tokens: {len(llm.tokenize(bytes(prompt,encoding='utf-8')))}" generated_text = "" answer_tokens = '' total_tokens = '' for character in llm(prompt, max_tokens=max_new_tokens, stop=[""], temperature = temperature, repeat_penalty = repeat_penalty, top_p = top_p, # Example stop token - not necessarily correct for this specific model! Please check before using. echo=False, stream=True): generation += character["choices"][0]["text"] answer_tokens = f"Out Tkns: {len(llm.tokenize(bytes(generation,encoding='utf-8')))}" total_tokens = f"Total Tkns: {len(llm.tokenize(bytes(prompt,encoding='utf-8'))) + len(llm.tokenize(bytes(generation,encoding='utf-8')))}" delta = datetime.datetime.now() - start yield generation, delta, prompt_tokens, answer_tokens, total_tokens timestamp = datetime.datetime.now() logger = f"""time: {timestamp}\n Temp: {temperature} - MaxNewTokens: {max_new_tokens} - RepPenalty: 1.5 \nPROMPT: \n{prompt}\nClaude2Alpaca-7B: {generation}\nGenerated in {delta}\nPromptTokens: {prompt_tokens} Output Tokens: {answer_tokens} Total Tokens: {total_tokens}\n\n---\n\n""" writehistory(logger) convHistory = convHistory + prompt + "\n" + generation + "\n" print(convHistory) return generation, delta, prompt_tokens, answer_tokens, total_tokens #return generation, delta # MAIN GRADIO INTERFACE with gr.Blocks(theme='WeixuanYuan/Soft_dark') as demo: #theme=gr.themes.Glass() #theme='remilia/Ghostly' #TITLE SECTION with gr.Row(variant='compact'): with gr.Column(scale=12): gr.HTML("