import os
import urllib.request

import gradio as gr
from llama_cpp import Llama
from langchain.llms import llamacpp
from huggingface_hub import login, hf_hub_download
from dotenv import load_dotenv
MODEL_ID = "TheBloke/Llama-2-7b-Chat-GGUF"
MODEL_BASENAME = "llama-2-7b-chat.Q4_K_M.gguf"
# MODEL_ID = "TheBloke/Wizard-Vicuna-7B-Uncensored-GGUF"
# MODEL_BASENAME = "Wizard-Vicuna-7B-Uncensored.Q4_K_M.gguf"

CONTEXT_WINDOW_SIZE = 8000
MAX_NEW_TOKENS = 2000
N_BATCH = 128

# load_dotenv()  # uncomment to read hf_token from a local .env file

# Authenticate with the Hugging Face Hub so hf_hub_download can fetch
# gated or private repos; a bare os.getenv call would discard the token.
hf_token = os.getenv("hf_token")
if hf_token:
    login(token=hf_token)
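# A minimal .env layout assumed by the commented-out load_dotenv() call above
# (the variable name matches the os.getenv lookup; the value is a placeholder):
#
#   hf_token=hf_xxxxxxxxxxxxxxxx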
def load_quantized_model(model_id, model_basename):
    """Download a quantized GGUF checkpoint and wrap it in a LangChain LlamaCpp LLM."""
    try:
        model_path = hf_hub_download(
            repo_id=model_id,
            filename=model_basename,
            resume_download=True,
            cache_dir="./models",
        )
        kwargs = {
            'model_path': model_path,
            'n_ctx': CONTEXT_WINDOW_SIZE,  # context window length in tokens
            'max_tokens': MAX_NEW_TOKENS,
            'n_batch': N_BATCH,
        }
        return llamacpp.LlamaCpp(**kwargs)
    except TypeError:
        # Invalid constructor arguments; callers must handle a None model.
        return None
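# A minimal sketch of loading the same GGUF file directly with
# llama-cpp-python (the otherwise unused `Llama` import above); kept
# commented out so the Space still loads through the LangChain wrapper.
# The model_path value is a hypothetical local path.
# llm = Llama(
#     model_path="./models/llama-2-7b-chat.Q4_K_M.gguf",
#     n_ctx=CONTEXT_WINDOW_SIZE,
#     n_batch=N_BATCH,
# )
# result = llm("Q: Who is the CEO of Apple? A:", max_tokens=64, stop=["Q:"])
# print(result["choices"][0]["text"])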
def load_model(model_id, model_basename=None):
    # Guard against the default None before calling .lower()
    if model_basename is not None and ".gguf" in model_basename.lower():
        return load_quantized_model(model_id, model_basename)
    else:
        print("Currently only .gguf models are supported.")
        return None
def generate_text(prompt="Who is the CEO of Apple?"):
    # NOTE: the model is reloaded on every request; cache the LlamaCpp
    # instance at module level if latency matters.
    llm = load_model(MODEL_ID, MODEL_BASENAME)
    output = llm(
        prompt,
        max_tokens=256,
        temperature=0.1,
        top_p=0.5,
        echo=False,
        stop=["#"],
    )
    print(output)
    return output
    # LangChain's LlamaCpp returns a plain string. When calling a raw
    # llama_cpp.Llama instead, the result is a dict and needs unpacking:
    # output_text = output["choices"][0]["text"].strip()
    # # Remove prompt echo from the generated text
    # cleaned_output_text = output_text.replace(prompt, "")
    # return cleaned_output_text
description = "Zephyr-beta"

# gr.Interface below has a single text input, so each example row is a
# one-element list; the expected answers are kept alongside as comments.
examples = [
    ["What is the capital of France?"],              # Paris
    ["Who wrote the novel 'Pride and Prejudice'?"],  # Jane Austen
    ["What is the square root of 64?"],              # 8
]
gradio_interface = gr.Interface(
    fn=generate_text,
    inputs="text",
    outputs="text",
    examples=examples,
    title="Zephyr-B",
    description=description,
)
gradio_interface.launch(share=True)  # share=True has no effect on Hugging Face Spaces
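# Quick smoke test (a sketch; run in place of launch() to exercise the model
# without starting the UI):
# if __name__ == "__main__":
#     print(generate_text("What is the capital of France?"))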