whoami02 committed
Commit e3b67b1 · 1 Parent(s): 45243bb

Update app.py

Files changed (1)
  1. app.py +36 -23
app.py CHANGED
@@ -1,10 +1,10 @@
 import os
 import urllib.request
 import gradio as gr
-# from llama_cpp import Llama
+from llama_cpp import Llama
 from langchain.llms import llamacpp
 from huggingface_hub import login, hf_hub_download
-# from dotenv import load_dotenv, find_dotenv
+from dotenv import load_dotenv

 MODEL_ID = "TheBloke/Llama-2-7b-Chat-GGUF"
 MODEL_BASENAME = "llama-2-7b-chat.Q4_K_M.gguf"
@@ -13,43 +13,56 @@ MODEL_BASENAME = "llama-2-7b-chat.Q4_K_M.gguf"
 CONTEXT_WINDOW_SIZE = 8000
 MAX_NEW_TOKENS = 2000
 N_BATCH = 128
-
-os.getenv("hf_token")
+# load_dotenv()
+os.getenv('hf_token')
 def load_quantized_model(model_id, model_basename):
     try:
         model_path = hf_hub_download(
+            repo_id=model_id,
+            filename=model_basename,
+            resume_download=True,
+            cache_dir="./models"
+        )
+        kwargs = {
+            'model_path': model_path,
+            'c_ctx': CONTEXT_WINDOW_SIZE,
+            'max_tokens': MAX_NEW_TOKENS,
+            'n_batch': N_BATCH
+        }
+        return llamacpp.LlamaCpp(**kwargs)
+    except TypeError:
+        return None
+
+def load_model(model_id, model_basename=None):
+    if ".gguf" in model_basename.lower():
+        llm = load_quantized_model(model_id, model_basename)
+        return llm
+    else:
         print("currently only .gguf models supported")


-# Dowloading GGML model from HuggingFace
-# ggml_model_path = "https://huggingface.co/CRD716/ggml-vicuna-1.1-quantized/resolve/main/ggml-vicuna-7b-1.1-q4_1.bin"
-# filename = "ggml-vicuna-7b-1.1-q4_1.bin"
-
-# download_file(ggml_model_path, filename)
-
-
-# llm = Llama(model_path=filename, n_ctx=512, n_batch=126)
-

 def generate_text(prompt="Who is the CEO of Apple?"):
     llm = load_model(MODEL_ID, MODEL_BASENAME)
     output = llm(
+        prompt,
+        max_tokens=256,
+        temperature=0.1,
+        top_p=0.5,
+        echo=False,
+        stop=["#"],
+    )
+    print(output)
+    return output
+    # output_text = output["choices"][0]["text"].strip()
+
+    # # Remove Prompt Echo from Generated Text
     # cleaned_output_text = output_text.replace(prompt, "")
     # return cleaned_output_text


-
-
-
 description = "Zephyr-beta"

-
-
-
-
-
-
-
 examples = [
     ["What is the capital of France?", "The capital of France is Paris."],
     [
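
For reference, a minimal standalone sketch of the download-then-load flow this commit wires up. The constants mirror app.py; the n_ctx keyword (rather than the commit's 'c_ctx'), the prompt, and the sampling-free call are assumptions for illustration, not part of the commit.

# Sketch only: assumes huggingface_hub, langchain, and llama-cpp-python are installed.
from huggingface_hub import hf_hub_download
from langchain.llms import llamacpp

MODEL_ID = "TheBloke/Llama-2-7b-Chat-GGUF"
MODEL_BASENAME = "llama-2-7b-chat.Q4_K_M.gguf"

# Download the quantized GGUF weights into a local cache directory.
model_path = hf_hub_download(
    repo_id=MODEL_ID,
    filename=MODEL_BASENAME,
    resume_download=True,
    cache_dir="./models",
)

# Wrap the weights with LangChain's llama.cpp binding.
llm = llamacpp.LlamaCpp(
    model_path=model_path,
    n_ctx=8000,        # assumption: n_ctx is the context-window keyword
    max_tokens=2000,
    n_batch=128,
)

print(llm("Who is the CEO of Apple?"))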