whoami02 committed
Commit e3b67b1 · 1 Parent(s): 45243bb

Update app.py

Files changed (1)
  1. app.py +36 -23
app.py CHANGED
@@ -1,10 +1,10 @@
 import os
 import urllib.request
 import gradio as gr
-# from llama_cpp import Llama
+from llama_cpp import Llama
 from langchain.llms import llamacpp
 from huggingface_hub import login, hf_hub_download
-# from dotenv import load_dotenv, find_dotenv
+from dotenv import load_dotenv

 MODEL_ID = "TheBloke/Llama-2-7b-Chat-GGUF"
 MODEL_BASENAME = "llama-2-7b-chat.Q4_K_M.gguf"
@@ -13,43 +13,56 @@ MODEL_BASENAME = "llama-2-7b-chat.Q4_K_M.gguf"
 CONTEXT_WINDOW_SIZE = 8000
 MAX_NEW_TOKENS = 2000
 N_BATCH = 128
-
-os.getenv("hf_token")
+# load_dotenv()
+os.getenv('hf_token')
 def load_quantized_model(model_id, model_basename):
     try:
         model_path = hf_hub_download(
+            repo_id=model_id,
+            filename=model_basename,
+            resume_download=True,
+            cache_dir="./models"
+        )
+        kwargs = {
+            'model_path': model_path,
+            'c_ctx': CONTEXT_WINDOW_SIZE,
+            'max_tokens': MAX_NEW_TOKENS,
+            'n_batch': N_BATCH
+        }
+        return llamacpp.LlamaCpp(**kwargs)
+    except TypeError:
+        return None
+
+def load_model(model_id, model_basename=None):
+    if ".gguf" in model_basename.lower():
+        llm = load_quantized_model(model_id, model_basename)
+        return llm
+    else:
         print("currently only .gguf models supported")


-# Dowloading GGML model from HuggingFace
-# ggml_model_path = "https://huggingface.co/CRD716/ggml-vicuna-1.1-quantized/resolve/main/ggml-vicuna-7b-1.1-q4_1.bin"
-# filename = "ggml-vicuna-7b-1.1-q4_1.bin"
-
-# download_file(ggml_model_path, filename)
-
-
-# llm = Llama(model_path=filename, n_ctx=512, n_batch=126)
-

 def generate_text(prompt="Who is the CEO of Apple?"):
     llm = load_model(MODEL_ID, MODEL_BASENAME)
     output = llm(
+        prompt,
+        max_tokens=256,
+        temperature=0.1,
+        top_p=0.5,
+        echo=False,
+        stop=["#"],
+    )
+    print(output)
+    return output
+    # output_text = output["choices"][0]["text"].strip()
+
+    # # Remove Prompt Echo from Generated Text
     # cleaned_output_text = output_text.replace(prompt, "")
     # return cleaned_output_text


-
-
-
 description = "Zephyr-beta"

-
-
-
-
-
-
-
 examples = [
     ["What is the capital of France?", "The capital of France is Paris."],
     [
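
For reference, a minimal standalone sketch of the download-then-load flow this commit wires up. The constants mirror app.py; the n_ctx keyword (rather than the commit's 'c_ctx'), the prompt, and the sampling-free call are assumptions for illustration, not part of the commit.

# Sketch only: assumes huggingface_hub, langchain, and llama-cpp-python are installed.
from huggingface_hub import hf_hub_download
from langchain.llms import llamacpp

MODEL_ID = "TheBloke/Llama-2-7b-Chat-GGUF"
MODEL_BASENAME = "llama-2-7b-chat.Q4_K_M.gguf"

# Download the quantized GGUF weights into a local cache directory.
model_path = hf_hub_download(
    repo_id=MODEL_ID,
    filename=MODEL_BASENAME,
    resume_download=True,
    cache_dir="./models",
)

# Wrap the weights with LangChain's llama.cpp binding.
llm = llamacpp.LlamaCpp(
    model_path=model_path,
    n_ctx=8000,        # assumption: n_ctx is the context-window keyword
    max_tokens=2000,
    n_batch=128,
)

print(llm("Who is the CEO of Apple?"))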