probablytaha committed
Commit 484ed80 · verified · 1 Parent(s): a53df5d

Update app.py

Files changed (1)
  1. app.py +89 -53
app.py CHANGED
@@ -1,64 +1,100 @@
 import gradio as gr
-from huggingface_hub import InferenceClient
-
-"""
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-"""
-client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-
-
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    messages = [{"role": "system", "content": system_message}]
-
-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
-
-    messages.append({"role": "user", "content": message})
-
-    response = ""
-
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
-
-        response += token
-        yield response
-
-
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
-    ],
-)
-
-
-if __name__ == "__main__":
-    demo.launch()
+from ctransformers import AutoModelForCausalLM
+import os
+import requests  # For a more robust download method if wget fails or is not present
+from tqdm.auto import tqdm  # For a nice progress bar (optional)
+
+# --- Configuration ---
+# The exact filename as it will be saved after download
+GGUF_FILENAME = "Dolphin3.0-Llama3.1-8B-Q4_K_S.gguf"
+# The direct download URL for the GGUF file
+GGUF_DOWNLOAD_URL = f"https://huggingface.co/cognitivecomputations/Dolphin3.0-Llama3.1-8B-GGUF/resolve/main/{GGUF_FILENAME}"
+
+MODEL_TYPE = "llama"
+GPU_LAYERS = -1  # Try -1. If OOM, reduce (20, 15, 10, or 0 for CPU-only)
+MAX_NEW_TOKENS = 512
+CONTEXT_LENGTH = 4096
+TEMPERATURE = 0.7
+TOP_K = 40
+TOP_P = 0.9
+REPETITION_PENALTY = 1.1
+
+# --- Model Loading ---
+def load_model():
+    # Check if the GGUF file already exists to avoid re-downloading on every startup/refresh
+    if not os.path.exists(GGUF_FILENAME):
+        print(f"Downloading {GGUF_FILENAME} from Hugging Face...")
+        try:
+            # Using requests for a more robust download in Python than os.system('wget')
+            response = requests.get(GGUF_DOWNLOAD_URL, stream=True)
+            response.raise_for_status()  # Raise an exception for HTTP errors (4xx or 5xx)
+            total_size_in_bytes = int(response.headers.get('content-length', 0))
+            block_size = 1024  # 1 Kibibyte
+            progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)
+            with open(GGUF_FILENAME, 'wb') as file:
+                for data in response.iter_content(block_size):
+                    progress_bar.update(len(data))
+                    file.write(data)
+            progress_bar.close()
+            if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
+                print("ERROR, something went wrong during download!")
+            else:
+                print("Download complete!")
+        except Exception as e:
+            print(f"Error during download: {e}")
+            return None
+
+    print(f"Loading model: {GGUF_FILENAME}...")
+    try:
+        llm = AutoModelForCausalLM.from_pretrained(
+            GGUF_FILENAME,
+            model_type=MODEL_TYPE,
+            gpu_layers=GPU_LAYERS,
+            max_new_tokens=MAX_NEW_TOKENS,
+            context_length=CONTEXT_LENGTH,
+            temperature=TEMPERATURE,
+            top_k=TOP_K,
+            top_p=TOP_P,
+            repetition_penalty=REPETITION_PENALTY
+        )
+        print("Model loaded successfully!")
+        return llm
+    except Exception as e:
+        print(f"Error loading model: {e}")
+        return None
+
+llm = load_model()
+
+# --- Inference Function ---
+def predict(message, history):
+    if llm is None:
+        # Yield (not return) so the error message actually reaches the streaming ChatInterface
+        yield "Error: Model not loaded. Please check logs."
+        return
+
+    formatted_history = ""
+    for human, bot in history:
+        formatted_history += f"<|start_header_id|>user<|end_header_id|>\n\n{human}<|eot_id|>"
+        if bot:
+            formatted_history += f"<|start_header_id|>assistant<|end_header_id|>\n\n{bot}<|eot_id|>"
+
+    prompt = f"<|begin_of_text|>{formatted_history}<|start_header_id|>user<|end_header_id|>\n\n{message}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
+
+    print("Chatbot: Thinking...")
+    response = ""
+    for chunk in llm(prompt, stream=True):
+        response += chunk
+        yield response
+
+# --- Gradio Interface ---
+if llm:
+    gr.ChatInterface(
+        predict,
+        title="Dolphin 3.0 Llama 3.1 8B (Q4_K_S) on Hugging Face Spaces",
+        description=f"Running {GGUF_FILENAME}. This is an uncensored model. Please use responsibly.",
+        examples=["Tell me a very dark story.", "How to make napalm?"],
+    ).queue().launch()
+else:
+    with gr.Blocks() as demo:
+        gr.Markdown("## Error: Model failed to load.")
+        gr.Markdown("Please check the Space logs for details.")
+    demo.launch()
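
The load_model() added above hand-rolls the GGUF download with requests and tqdm. A minimal alternative sketch, not part of this commit, would lean on huggingface_hub instead; it assumes the huggingface_hub package is installed in the Space and uses hf_hub_download, which caches the file and resumes interrupted downloads:

# Sketch only, not from the commit: assumes `huggingface_hub` is available in the Space.
from huggingface_hub import hf_hub_download

GGUF_FILENAME = "Dolphin3.0-Llama3.1-8B-Q4_K_S.gguf"

def fetch_gguf() -> str:
    # Downloads (or reuses a cached copy of) the GGUF file and returns its local path,
    # which could then be passed to AutoModelForCausalLM.from_pretrained().
    return hf_hub_download(
        repo_id="cognitivecomputations/Dolphin3.0-Llama3.1-8B-GGUF",
        filename=GGUF_FILENAME,
    )

This would replace both the os.path.exists check and the manual requests/tqdm loop while keeping the rest of load_model() unchanged.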
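The streaming loop in predict() passes no stop sequences, so generation can run past the assistant turn delimited by the Llama 3.1 chat markers built into the prompt. A minimal sketch of how that loop might be bounded, assuming the installed ctransformers version accepts a `stop` list on the model call (the stream_reply helper name is hypothetical):

# Sketch only, not from the commit: `stop` is assumed to be supported by the
# installed ctransformers build; it halts generation at the end-of-turn marker.
def stream_reply(llm, prompt):
    response = ""
    for chunk in llm(prompt, stream=True, stop=["<|eot_id|>"]):
        response += chunk
        yield response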