jwu323 committed
Commit c344902 · verified · 1 Parent(s): 037da0c

Update app.py

Files changed (1): app.py (+137 −113)
app.py CHANGED
@@ -1,9 +1,15 @@
 import os
-from typing import Generator, Optional
 import gradio as gr
-from llama_cpp import Llama, LlamaGrammar
+from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 
+model = Llama(
+    model_path=hf_hub_download(
+        repo_id=os.environ.get("REPO_ID", "SimpleBerry/LLaMA-O1-Supervised-1129-Q2_K-GGUF"),
+        filename=os.environ.get("MODEL_FILE", "LLaMA-O1-Supervised-1129-q2_k.gguf"),
+    )
+)
+
 DESCRIPTION = '''
 # SimpleBerry/LLaMA-O1-Supervised-1129 | Duplicate the space and set it to private for faster & personal inference for free.
 SimpleBerry/LLaMA-O1-Supervised-1129: an experimental research model developed by the SimpleBerry.
@@ -20,118 +26,136 @@ LICENSE = """
 
 template = "<start_of_father_id>-1<end_of_father_id><start_of_local_id>0<end_of_local_id><start_of_thought><problem>{content}<end_of_thought><start_of_rating><positive_rating><end_of_rating>\n<start_of_father_id>0<end_of_father_id><start_of_local_id>1<end_of_local_id><start_of_thought><expansion>"
 
-class OptimizedLLMInterface:
-    _model_instance = None  # Singleton pattern
-
-    def __init__(
-        self,
-        model_repo_id: str = "SimpleBerry/LLaMA-O1-Supervised-1129-Q2_K-GGUF",
-        model_filename: str = "LLaMA-O1-Supervised-1129-q2_k.gguf",
-    ):
-        if OptimizedLLMInterface._model_instance is None:
-            model_path = hf_hub_download(repo_id=model_repo_id, filename=model_filename)
-            OptimizedLLMInterface._model_instance = Llama(
-                model_path=model_path,
-                n_ctx=256,  # Minimal context for speed
-                n_threads=4,  # Fixed thread count
-                n_batch=1,  # Single batch for low latency
-                verbose=False,  # Disable logging
-                seed=-1,  # Disable random seed
-                logits_all=False,  # Disable logits
-                embedding=False,  # Disable embeddings
-                tensor_split=None,  # No tensor splitting
-                rope_freq_base=10000,  # Default RoPE settings
-                rope_freq_scale=1.0,
-                main_gpu=0,
-            )
-        self.model = OptimizedLLMInterface._model_instance
-
-        # Pre-tokenize template parts
-        template_parts = template.split("{content}")
-        self._prefix_tokens = self.model.tokenize(template_parts[0].encode())
-        self._suffix_tokens = self.model.tokenize(template_parts[1].encode())
-
-    def generate_response(
-        self,
-        message: str,
-        history: Optional[list] = None,
-        max_tokens: int = 128,  # Reduced max tokens
-        temperature: float = 0.7,
-        top_p: float = 0.95,
-    ) -> Generator[str, None, None]:
-        try:
-            # Fast token preparation
-            message_tokens = self.model.tokenize(message.encode())
-            input_tokens = []
-            input_tokens.extend(self._prefix_tokens)
-            input_tokens.extend(message_tokens)
-            input_tokens.extend(self._suffix_tokens)
-
-            output = ""
-            batch = []
-            batch_size = 4  # Small batch size for faster responses
-
-            for token in self.model.generate(
-                input_tokens,
-                top_p=top_p,
-                temp=temperature,
-                top_k=1,  # Minimal top_k
-                repeat_penalty=1.0,  # No repeat penalty
-                mirostat_mode=0,  # Disable mirostat
-                min_p=0.05,  # Allow more diversity
-                typical_p=1.0,  # Disable typical sampling
-                presence_penalty=0,
-                frequency_penalty=0,
-            ):
-                batch.append(token)
-                if len(batch) >= batch_size:
-                    text = self.model.detokenize(batch).decode('utf-8', errors='ignore')
-                    output += text
-                    yield output
-                    batch = []
-
-            if batch:
-                text = self.model.detokenize(batch).decode('utf-8', errors='ignore')
-                output += text
-                yield output
-
-        except Exception as e:
-            yield f"Error: {str(e)}"
-
-def create_demo(llm_interface: OptimizedLLMInterface) -> gr.Blocks:
-    with gr.Blocks() as demo:
-        gr.Markdown(DESCRIPTION)
-
-        chatbot = gr.ChatInterface(
-            llm_interface.generate_response,
-            title="SimpleBerry/LLaMA-O1-Supervised-1129 | GGUF Demo",
-            description="Edit Settings below if needed.",
-            examples=[
-                ["How many r's are in the word strawberry?"],
-                ['If Diana needs to bike 10 miles to reach home and she can bike at a speed of 3 mph for two hours before getting tired, and then at a speed of 1 mph until she reaches home, how long will it take her to get home?'],
-                ['Find the least odd prime factor of $2019^8+1$.'],
-            ],
-            cache_examples=False,
-            fill_height=True
-        )
-
-        with gr.Accordion("Adjust Parameters", open=False):
-            gr.Slider(minimum=64, maximum=512, value=128, step=64, label="Max Tokens")
-            gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature")
-            gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.05, label="Top-p")
-
-        gr.Markdown(LICENSE)
-
-    return demo
+def llama_o1_template(data):
+    #query = data['query']
+    text = template.format(content=data)
+    return text
 
-def main():
-    llm = OptimizedLLMInterface()
-    demo = create_demo(llm)
-
-    demo.launch(
-        share=False,
-        quiet=True
+def generate_text(message, history, max_tokens=512, temperature=0.9, top_p=0.95):
+    temp = ""
+    input_texts = [llama_o1_template(message)]
+    input_texts = [input_text.replace('<|end_of_text|>','') for input_text in input_texts]
+    #print(f"input_texts[0]: {input_texts[0]}")
+    inputs = model.tokenize(input_texts[0].encode('utf-8'))
+    for token in model.generate(inputs, top_p=top_p, temp=temperature):
+        #print(f"token: {token}")
+        text = model.detokenize([token])
+        #print(f"text detok: {text}")
+        temp += text.decode('utf-8')
+        yield temp
+
+with gr.Blocks() as demo:
+    gr.Markdown(DESCRIPTION)
+
+    chatbot = gr.ChatInterface(
+        generate_text,
+        title="SimpleBerry/LLaMA-O1-Supervised-1129 | GGUF Demo",
+        description="Edit Settings below if needed.",
+        examples=[
+            ["How many r's are in the word strawberry?"],
+            ['If Diana needs to bike 10 miles to reach home and she can bike at a speed of 3 mph for two hours before getting tired, and then at a speed of 1 mph until she reaches home, how long will it take her to get home?'],
+            ['Find the least odd prime factor of $2019^8+1$.'],
+        ],
+        cache_examples=False,
+        fill_height=True
     )
 
+    with gr.Accordion("Adjust Parameters", open=False):
+        gr.Slider(minimum=128, maximum=8192, value=512, step=1, label="Max Tokens")
+        gr.Slider(minimum=0.1, maximum=1.5, value=0.7, step=0.1, label="Temperature")
+        gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.01, label="Top-p (nucleus sampling)")
+
+    gr.Markdown(LICENSE)
+
 if __name__ == "__main__":
-    main()
+    demo.launch()
+# # import spaces
+
+# import os
+# import gradio as gr
+# from transformers import AutoTokenizer, AutoModelForCausalLM
+# from huggingface_hub import hf_hub_download, snapshot_download
+# import accelerate
+
+# accelerator = accelerate.Accelerator()
+
+# # Load the model and tokenizer from Hugging Face
+# model_path = snapshot_download(
+#     repo_id=os.environ.get("REPO_ID", "SimpleBerry/LLaMA-O1-Supervised-1129")
+# )
+
+# tokenizer = AutoTokenizer.from_pretrained(model_path)
+# model = AutoModelForCausalLM.from_pretrained(model_path,device_map='auto')
+
+# DESCRIPTION = '''
+# # SimpleBerry/LLaMA-O1-Supervised-1129 | Duplicate the space and set it to private for faster & personal inference for free.
+# SimpleBerry/LLaMA-O1-Supervised-1129: an experimental research model developed by the SimpleBerry.
+# Focused on advancing AI reasoning capabilities.
+
+# ## This Space was designed by Lyte/LLaMA-O1-Supervised-1129-GGUF, Many Thanks!
+
+# **To start a new chat**, click "clear" and start a new dialogue.
+# '''
+
+# LICENSE = """
+# --- MIT License ---
+# """
+
+# template = "<start_of_father_id>-1<end_of_father_id><start_of_local_id>0<end_of_local_id><start_of_thought><problem>{content}<end_of_thought><start_of_rating><positive_rating><end_of_rating>\n<start_of_father_id>0<end_of_father_id><start_of_local_id>1<end_of_local_id><start_of_thought><expansion>"
+
+# def llama_o1_template(data):
+#     #query = data['query']
+#     text = template.format(content=data)
+#     return text
+
+# def format_response(response):
+#     response = response.replace('<start_of_father_id>','')
+#     response = response.replace('<end_of_father_id><start_of_local_id>','👉')
+#     response = response.replace('<end_of_local_id><start_of_thought>',', ')
+#     response = response.replace('<end_of_thought><start_of_rating>','')
+#     response = response.replace('<end_of_rating>','')
+#     response = response.replace('<positive_rating>','👍')
+#     response = response.replace('<negative_rating>','👎')
+
+# # @spaces.GPU
+# def generate_text(message, history, max_tokens=512, temperature=0.9, top_p=0.95):
+#     input_text = llama_o1_template(message)
+#     inputs = tokenizer(input_text, return_tensors="pt").to(accelerator.device)
+
+#     # Generate the text with the model
+#     output = model.generate(
+#         **inputs,
+#         max_length=max_tokens,
+#         temperature=temperature,
+#         top_p=top_p,
+#         do_sample=True,
+#     )
+
+#     response = tokenizer.decode(output[0], skip_special_tokens=False)
+#     yield response
+
+# with gr.Blocks() as demo:
+#     gr.Markdown(DESCRIPTION)
+
+#     chatbot = gr.ChatInterface(
+#         generate_text,
+#         title="SimpleBerry/LLaMA-O1-Supervised-1129 | GGUF Demo",
+#         description="Edit Settings below if needed.",
+#         examples=[
+#             ["How many r's are in the word strawberry?"],
+#             ['If Diana needs to bike 10 miles to reach home and she can bike at a speed of 3 mph for two hours before getting tired, and then at a speed of 1 mph until she reaches home, how long will it take her to get home?'],
+#             ['Find the least odd prime factor of $2019^8+1$.'],
+#         ],
+#         cache_examples=True,
+#         fill_height=True,
+#     )
+
+#     with gr.Accordion("Adjust Parameters", open=False):
+#         gr.Slider(minimum=1024, maximum=8192, value=2048, step=1, label="Max Tokens")
+#         gr.Slider(minimum=0.1, maximum=1.5, value=0.7, step=0.1, label="Temperature")
+#         gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.01, label="Top-p (nucleus sampling)")
+
+#     gr.Markdown(LICENSE)
+
+# if __name__ == "__main__":
+#     demo.launch()
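Note on the prompt format: a minimal, self-contained sketch of what the committed llama_o1_template produces, runnable without the model (the sample question is taken from the Space's own examples):

template = "<start_of_father_id>-1<end_of_father_id><start_of_local_id>0<end_of_local_id><start_of_thought><problem>{content}<end_of_thought><start_of_rating><positive_rating><end_of_rating>\n<start_of_father_id>0<end_of_father_id><start_of_local_id>1<end_of_local_id><start_of_thought><expansion>"

def llama_o1_template(data):
    # Fill the <problem> slot with the user message; the prompt ends on an
    # open <expansion> thought node, which is where generation continues.
    return template.format(content=data)

print(llama_o1_template("How many r's are in the word strawberry?"))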
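One behavioral difference from the removed OptimizedLLMInterface is worth flagging: the old loop detokenized tokens in small batches and decoded with errors='ignore', while the new generate_text calls text.decode('utf-8') on every token individually, which can raise UnicodeDecodeError whenever a multi-byte character (emoji, CJK) is split across token boundaries. Below is a hedged sketch of a streaming loop that stays robust using Python's incremental UTF-8 decoder; generate_text_safe is a hypothetical name, not part of this commit, and model / llama_o1_template are assumed to be the objects defined in the new app.py:

import codecs

def generate_text_safe(message, history, max_tokens=512, temperature=0.9, top_p=0.95):
    # Hypothetical variant of the committed generate_text: an incremental
    # decoder buffers partial UTF-8 sequences instead of decoding each
    # token's bytes in isolation.
    decoder = codecs.getincrementaldecoder('utf-8')(errors='ignore')
    output = ""
    inputs = model.tokenize(llama_o1_template(message).encode('utf-8'))
    for token in model.generate(inputs, top_p=top_p, temp=temperature):
        output += decoder.decode(model.detokenize([token]))  # holds back incomplete chars
        yield output
    output += decoder.decode(b'', final=True)  # flush any buffered bytes
    yield output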