whoami02 committed on
Commit 45243bb · 1 Parent(s): 2ea2830

Update app.py

Files changed (1)
  1. app.py +26 -37
app.py CHANGED
@@ -1,66 +1,55 @@
 import os
 import urllib.request
 import gradio as gr
-from llama_cpp import Llama
+# from llama_cpp import Llama
 from langchain.llms import llamacpp
 from huggingface_hub import login, hf_hub_download
-from dotenv import load_dotenv
+# from dotenv import load_dotenv, find_dotenv

 MODEL_ID = "TheBloke/Llama-2-7b-Chat-GGUF"
 MODEL_BASENAME = "llama-2-7b-chat.Q4_K_M.gguf"
+# MODEL_ID = "TheBloke/Wizard-Vicuna-7B-Uncensored-GGUF"
+# MODEL_BASENAME = "Wizard-Vicuna-7B-Uncensored.Q4_K_M.gguf"
 CONTEXT_WINDOW_SIZE = 8000
 MAX_NEW_TOKENS = 2000
 N_BATCH = 128

-
+os.getenv("hf_token")
 def load_quantized_model(model_id, model_basename):
     try:
         model_path = hf_hub_download(
-            repo_id=model_id,
-            filename=model_basename,
-            resume_download=True,
-            cache_dir="./models"
-        )
-        kwargs = {
-            'model_path': model_path,
-            'c_ctx': CONTEXT_WINDOW_SIZE,
-            'max_tokens': MAX_NEW_TOKENS,
-            'n_batch': N_BATCH
-        }
-        return llamacpp.LlamaCpp(**kwargs)
-    except TypeError:
-        return None
-
-def load_model(model_id, model_basename=None):
-    if ".gguf" in model_basename.lower():
-        llm = load_quantized_model(model_id, model_basename)
-        return llm
-    else:
         print("currently only .gguf models supported")


+# Dowloading GGML model from HuggingFace
+# ggml_model_path = "https://huggingface.co/CRD716/ggml-vicuna-1.1-quantized/resolve/main/ggml-vicuna-7b-1.1-q4_1.bin"
+# filename = "ggml-vicuna-7b-1.1-q4_1.bin"
+
+# download_file(ggml_model_path, filename)
+
+
+# llm = Llama(model_path=filename, n_ctx=512, n_batch=126)
+
+
 def generate_text(prompt="Who is the CEO of Apple?"):
     llm = load_model(MODEL_ID, MODEL_BASENAME)
     output = llm(
-        prompt,
-        max_tokens=256,
-        temperature=0.1,
-        top_p=0.5,
-        echo=False,
-        stop=["#"],
-    )
-    print(output)
-    return output
-    # output_text = output["choices"][0]["text"].strip()
-
-    # # Remove Prompt Echo from Generated Text
     # cleaned_output_text = output_text.replace(prompt, "")
     # return cleaned_output_text


-os.getenv("hf_token")
+
+
+
 description = "Zephyr-beta"

+
+
+
+
+
+
+
 examples = [
     ["What is the capital of France?", "The capital of France is Paris."],
     [
@@ -77,4 +66,4 @@ gradio_interface = gr.Interface(
     examples=examples,
     title="Zephyr-B",
 )
-gradio_interface.launch()
+gradio_interface.launch(share=True)
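
Note that after this commit load_quantized_model is left truncated: the hf_hub_download call loses its keyword arguments and the LlamaCpp construction and load_model definition are removed, so app.py will not run as committed. For reference, a working loader can be reassembled from the removed lines roughly as follows. This is a sketch, not part of the commit; it assumes a langchain version that still exposes langchain.llms.llamacpp, and it renames the removed 'c_ctx' key to n_ctx, which is the parameter LlamaCpp actually accepts.

# Sketch only: reassembled from the lines removed in this commit, not part of it.
from huggingface_hub import hf_hub_download
from langchain.llms import llamacpp

MODEL_ID = "TheBloke/Llama-2-7b-Chat-GGUF"
MODEL_BASENAME = "llama-2-7b-chat.Q4_K_M.gguf"
CONTEXT_WINDOW_SIZE = 8000
MAX_NEW_TOKENS = 2000
N_BATCH = 128

def load_quantized_model(model_id, model_basename):
    # Download the GGUF file into ./models and wrap it with LangChain's LlamaCpp.
    model_path = hf_hub_download(
        repo_id=model_id,
        filename=model_basename,
        resume_download=True,
        cache_dir="./models",
    )
    return llamacpp.LlamaCpp(
        model_path=model_path,
        n_ctx=CONTEXT_WINDOW_SIZE,  # the removed code passed 'c_ctx'; LlamaCpp expects n_ctx
        max_tokens=MAX_NEW_TOKENS,
        n_batch=N_BATCH,
    )

llm = load_quantized_model(MODEL_ID, MODEL_BASENAME)

The other functional change in the diff is gradio_interface.launch(share=True): with share=True, Gradio serves the app locally and additionally opens a temporary public share link.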