Spaces:

whoami02
/

gradio_101

Sleeping

App Files Files Community

gradio_101 / app.py

whoami02

Update app.py

e3b67b1 over 1 year ago

raw

history blame

2.25 kB

	import os
	import urllib.request
	import gradio as gr
	from llama_cpp import Llama
	from langchain.llms import llamacpp
	from huggingface_hub import login, hf_hub_download
	from dotenv import load_dotenv

	# Hugging Face Hub repository and file name of the quantized model to serve.
	MODEL_ID = "TheBloke/Llama-2-7b-Chat-GGUF"
	MODEL_BASENAME = "llama-2-7b-chat.Q4_K_M.gguf"
	# Alternative model, kept for quick switching:
	# MODEL_ID = "TheBloke/Wizard-Vicuna-7B-Uncensored-GGUF"
	# MODEL_BASENAME = "Wizard-Vicuna-7B-Uncensored.Q4_K_M.gguf"

	# Loader settings handed to LlamaCpp by load_quantized_model().
	CONTEXT_WINDOW_SIZE = 8000
	MAX_NEW_TOKENS = 2000
	N_BATCH = 128

	# load_dotenv()
	# Keep the looked-up token instead of discarding it; the original bare
	# ``os.getenv('hf_token')`` expression had no effect. None when unset.
	HF_TOKEN = os.getenv('hf_token')
	def load_quantized_model(model_id, model_basename):
	    """Download a GGUF model file from the Hub and wrap it in LangChain's LlamaCpp.

	    Args:
	        model_id: Hugging Face Hub repository id that hosts the model file.
	        model_basename: File name of the model inside that repository.

	    Returns:
	        A ``llamacpp.LlamaCpp`` instance, or ``None`` if a ``TypeError`` is
	        raised while downloading or constructing the model.
	    """
	    try:
	        model_path = hf_hub_download(
	            repo_id=model_id,
	            filename=model_basename,
	            resume_download=True,
	            cache_dir="./models",
	        )
	        kwargs = {
	            'model_path': model_path,
	            # LlamaCpp's context-size field is ``n_ctx``; the previous key
	            # ``c_ctx`` is not a LlamaCpp field, so CONTEXT_WINDOW_SIZE was
	            # never applied as the context size.
	            'n_ctx': CONTEXT_WINDOW_SIZE,
	            'max_tokens': MAX_NEW_TOKENS,
	            'n_batch': N_BATCH,
	        }
	        return llamacpp.LlamaCpp(**kwargs)
	    except TypeError as exc:
	        # Still best-effort (callers receive None), but no longer silent.
	        print(f"Failed to load {model_basename} from {model_id}: {exc}")
	        return None

	def load_model(model_id, model_basename=None):
	    """Load the model identified by ``model_id`` / ``model_basename``.

	    Only GGUF files are supported; the check is a case-insensitive search for
	    ``".gguf"`` in ``model_basename``.

	    Args:
	        model_id: Hugging Face Hub repository id.
	        model_basename: Model file name. May be ``None`` (the default), which
	            is treated as an unsupported model rather than raising.

	    Returns:
	        Whatever ``load_quantized_model`` returns for a GGUF file, otherwise
	        ``None`` after printing a notice.
	    """
	    # Guard against the None default: calling ``.lower()`` on it used to
	    # raise AttributeError.
	    if model_basename is not None and ".gguf" in model_basename.lower():
	        return load_quantized_model(model_id, model_basename)

	    print("currently only .gguf models supported")
	    return None



	# Loaded models keyed by (model id, file name) so the multi-gigabyte weights
	# are loaded once per process instead of on every request.
	_LLM_CACHE = {}


	def generate_text(prompt="Who is the CEO of Apple?"):
	    """Run ``prompt`` through the configured model and return its output.

	    The model is loaded on first use and reused afterwards. A failed load is
	    not cached, so a later call tries again.

	    Args:
	        prompt: Text sent to the model.

	    Returns:
	        The value returned by calling the model; it is also printed.
	        NOTE(review): presumably a plain string, since the Gradio interface
	        declares ``outputs="text"`` - confirm against the LangChain version.

	    Raises:
	        RuntimeError: If the model could not be loaded.
	    """
	    cache_key = (MODEL_ID, MODEL_BASENAME)
	    llm = _LLM_CACHE.get(cache_key)
	    if llm is None:
	        llm = load_model(MODEL_ID, MODEL_BASENAME)
	        if llm is None:
	            # Previously this fell through to ``llm(...)`` and failed with
	            # "'NoneType' object is not callable".
	            raise RuntimeError(
	                f"Could not load model {MODEL_BASENAME!r} from {MODEL_ID!r}"
	            )
	        _LLM_CACHE[cache_key] = llm

	    output = llm(
	        prompt,
	        max_tokens=256,
	        temperature=0.1,
	        top_p=0.5,
	        echo=False,
	        stop=["#"],
	    )
	    print(output)
	    return output


	# NOTE(review): ``description`` is defined but never passed to gr.Interface.
	description = "Zephyr-beta"

	# Each row is [question, reference answer]. The interface below declares a
	# single "text" input, so only the question lines up with an input component.
	# NOTE(review): the answer column is presumably ignored by Gradio - confirm.
	examples = [
	    [question, answer]
	    for question, answer in (
	        ("What is the capital of France?", "The capital of France is Paris."),
	        (
	            "Who wrote the novel 'Pride and Prejudice'?",
	            "The novel 'Pride and Prejudice' was written by Jane Austen.",
	        ),
	        ("What is the square root of 64?", "The square root of 64 is 8."),
	    )
	]

	# One text box in, one text box out, backed by generate_text().
	gradio_interface = gr.Interface(
	    title="Zephyr-B",
	    fn=generate_text,
	    inputs="text",
	    outputs="text",
	    examples=examples,
	)

	# Start the web app as soon as the module runs.
	gradio_interface.launch(share=True)