whoami02 committed on
Commit 45243bb · 1 Parent(s): 2ea2830

Update app.py

Files changed (1)
  1. app.py +26 -37
app.py CHANGED
@@ -1,66 +1,55 @@
 import os
 import urllib.request
 import gradio as gr
-from llama_cpp import Llama
+# from llama_cpp import Llama
 from langchain.llms import llamacpp
 from huggingface_hub import login, hf_hub_download
-from dotenv import load_dotenv
+# from dotenv import load_dotenv, find_dotenv

 MODEL_ID = "TheBloke/Llama-2-7b-Chat-GGUF"
 MODEL_BASENAME = "llama-2-7b-chat.Q4_K_M.gguf"
+# MODEL_ID = "TheBloke/Wizard-Vicuna-7B-Uncensored-GGUF"
+# MODEL_BASENAME = "Wizard-Vicuna-7B-Uncensored.Q4_K_M.gguf"
 CONTEXT_WINDOW_SIZE = 8000
 MAX_NEW_TOKENS = 2000
 N_BATCH = 128

-
+os.getenv("hf_token")
 def load_quantized_model(model_id, model_basename):
     try:
         model_path = hf_hub_download(
-            repo_id=model_id,
-            filename=model_basename,
-            resume_download=True,
-            cache_dir="./models"
-        )
-        kwargs = {
-            'model_path': model_path,
-            'c_ctx': CONTEXT_WINDOW_SIZE,
-            'max_tokens': MAX_NEW_TOKENS,
-            'n_batch': N_BATCH
-        }
-        return llamacpp.LlamaCpp(**kwargs)
-    except TypeError:
-        return None
-
-def load_model(model_id, model_basename=None):
-    if ".gguf" in model_basename.lower():
-        llm = load_quantized_model(model_id, model_basename)
-        return llm
-    else:
         print("currently only .gguf models supported")


+# Dowloading GGML model from HuggingFace
+# ggml_model_path = "https://huggingface.co/CRD716/ggml-vicuna-1.1-quantized/resolve/main/ggml-vicuna-7b-1.1-q4_1.bin"
+# filename = "ggml-vicuna-7b-1.1-q4_1.bin"
+
+# download_file(ggml_model_path, filename)
+
+
+# llm = Llama(model_path=filename, n_ctx=512, n_batch=126)
+
+
 def generate_text(prompt="Who is the CEO of Apple?"):
     llm = load_model(MODEL_ID, MODEL_BASENAME)
     output = llm(
-        prompt,
-        max_tokens=256,
-        temperature=0.1,
-        top_p=0.5,
-        echo=False,
-        stop=["#"],
-    )
-    print(output)
-    return output
-    # output_text = output["choices"][0]["text"].strip()
-
-    # # Remove Prompt Echo from Generated Text
     # cleaned_output_text = output_text.replace(prompt, "")
     # return cleaned_output_text


-os.getenv("hf_token")
+
+
+
 description = "Zephyr-beta"

+
+
+
+
+
+
+
 examples = [
     ["What is the capital of France?", "The capital of France is Paris."],
     [
@@ -77,4 +66,4 @@ gradio_interface = gr.Interface(
     examples=examples,
     title="Zephyr-B",
 )
-gradio_interface.launch()
+gradio_interface.launch(share=True)
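
Note that after this commit load_quantized_model is left truncated: the hf_hub_download call loses its keyword arguments and the LlamaCpp construction and load_model definition are removed, so app.py will not run as committed. For reference, a working loader can be reassembled from the removed lines roughly as follows. This is a sketch, not part of the commit; it assumes a langchain version that still exposes langchain.llms.llamacpp, and it renames the removed 'c_ctx' key to n_ctx, which is the parameter LlamaCpp actually accepts.

# Sketch only: reassembled from the lines removed in this commit, not part of it.
from huggingface_hub import hf_hub_download
from langchain.llms import llamacpp

MODEL_ID = "TheBloke/Llama-2-7b-Chat-GGUF"
MODEL_BASENAME = "llama-2-7b-chat.Q4_K_M.gguf"
CONTEXT_WINDOW_SIZE = 8000
MAX_NEW_TOKENS = 2000
N_BATCH = 128

def load_quantized_model(model_id, model_basename):
    # Download the GGUF file into ./models and wrap it with LangChain's LlamaCpp.
    model_path = hf_hub_download(
        repo_id=model_id,
        filename=model_basename,
        resume_download=True,
        cache_dir="./models",
    )
    return llamacpp.LlamaCpp(
        model_path=model_path,
        n_ctx=CONTEXT_WINDOW_SIZE,  # the removed code passed 'c_ctx'; LlamaCpp expects n_ctx
        max_tokens=MAX_NEW_TOKENS,
        n_batch=N_BATCH,
    )

llm = load_quantized_model(MODEL_ID, MODEL_BASENAME)

The other functional change in the diff is gradio_interface.launch(share=True): with share=True, Gradio serves the app locally and additionally opens a temporary public share link.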