Files changed (1)
  1. app.py +8 -136
app.py CHANGED
@@ -1,54 +1,8 @@
- import os
- from threading import Thread
- from typing import Iterator
-
- import gradio as gr
- import spaces
- import torch
- from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
-
- MAX_MAX_NEW_TOKENS = 1024
- DEFAULT_MAX_NEW_TOKENS = 256
- MAX_INPUT_TOKEN_LENGTH = 512
-
- DESCRIPTION = """\
- # OpenELM-3B-Instruct
-
- This Space demonstrates [OpenELM-3B-Instruct](https://huggingface.co/apple/OpenELM-3B-Instruct) by Apple. Please, check the original model card for details.
- You can see the other models of the OpenELM family [here](https://huggingface.co/apple/OpenELM)
- The following Colab notebooks are available:
- * [OpenELM-3B-Instruct (GPU)](https://gist.github.com/Norod/4f11bb36bea5c548d18f10f9d7ec09b0)
- * [OpenELM-270M (CPU)](https://gist.github.com/Norod/5a311a8e0a774b5c35919913545b7af4)
-
- You might also be interested in checking out Apple's [CoreNet Github page](https://github.com/apple/corenet?tab=readme-ov-file).
-
- If you duplicate this space, make sure you have access to [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf)
- because this model uses it as a tokenizer.
-
- # Note: Use this model for only for completing sentences and instruction following.
- ## While the user interface is a chatbot for convenience, this is an instruction tuned model not fine-tuned for chatbot tasks. As such, the model is not provided a chat history and will complete your text based on the last given prompt only.
- """
-
- LICENSE = """
- <p/>
-
- ---
- As a derivative work of [OpenELM-3B-Instruct](https://huggingface.co/apple/OpenELM-3B-Instruct) by Apple,
- this demo is governed by the original [license](https://huggingface.co/apple/OpenELM-3B-Instruct/blob/main/LICENSE).
- """
-
- if not torch.cuda.is_available():
-     DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
-
-
- if torch.cuda.is_available():
-     model_id = "apple/OpenELM-3B-Instruct"
-     model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", trust_remote_code=True, low_cpu_mem_usage=True)
-     tokenizer_id = "meta-llama/Llama-2-7b-hf"
-     tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
-     if tokenizer.pad_token == None:
-         tokenizer.pad_token = tokenizer.eos_token
-         tokenizer.pad_token_id = tokenizer.eos_token_id
+ # At the top level of your script, after initializing the tokenizer
+ tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
+ if tokenizer.pad_token == None:
+     tokenizer.pad_token = tokenizer.eos_token
+     tokenizer.pad_token_id = tokenizer.eos_token_id

  @spaces.GPU
  def generate(
@@ -60,91 +14,9 @@ def generate(
      top_k: int = 50,
      repetition_penalty: float = 1.4,
  ) -> Iterator[str]:
+     global tokenizer, model  # Add this line to access global variables

      input_ids = tokenizer([message], return_tensors="pt").input_ids
-     if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
-         input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
-         gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
-     input_ids = input_ids.to(model.device)
-
-     streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
-     generate_kwargs = dict(
-         {"input_ids": input_ids},
-         streamer=streamer,
-         max_new_tokens=max_new_tokens,
-         do_sample=True,
-         top_p=top_p,
-         top_k=top_k,
-         temperature=temperature,
-         num_beams=1,
-         pad_token_id = tokenizer.eos_token_id,
-         repetition_penalty=repetition_penalty,
-         no_repeat_ngram_size=5,
-         early_stopping=True,
-     )
-     t = Thread(target=model.generate, kwargs=generate_kwargs)
-     t.start()
-
-     outputs = []
-     for text in streamer:
-         outputs.append(text)
-         yield "".join(outputs)
-
-
- chat_interface = gr.ChatInterface(
-     fn=generate,
-     additional_inputs=[
-         gr.Slider(
-             label="Max new tokens",
-             minimum=1,
-             maximum=MAX_MAX_NEW_TOKENS,
-             step=1,
-             value=DEFAULT_MAX_NEW_TOKENS,
-         ),
-         gr.Slider(
-             label="Temperature",
-             minimum=0.1,
-             maximum=4.0,
-             step=0.1,
-             value=0.6,
-         ),
-         gr.Slider(
-             label="Top-p (nucleus sampling)",
-             minimum=0.05,
-             maximum=1.0,
-             step=0.05,
-             value=0.9,
-         ),
-         gr.Slider(
-             label="Top-k",
-             minimum=1,
-             maximum=1000,
-             step=1,
-             value=50,
-         ),
-         gr.Slider(
-             label="Repetition penalty",
-             minimum=1.0,
-             maximum=2.0,
-             step=0.05,
-             value=1.4,
-         ),
-     ],
-     stop_btn=None,
-     examples=[
-         ["A recipe for a chocolate cake:"],
-         ["Can you explain briefly to me what is the Python programming language?"],
-         ["Explain the plot of Cinderella in a sentence."],
-         ["Question: What is the capital of France?\nAnswer:"],
-         ["Question: I am very tired, what should I do?\nAnswer:"],
-     ],
- )
-
- with gr.Blocks(css="style.css") as demo:
-     gr.Markdown(DESCRIPTION)
-     gr.DuplicateButton(value="Duplicate Space for private use", elem_id="duplicate-button")
-     chat_interface.render()
-     gr.Markdown(LICENSE)
+     # ... rest of the function ...

- if __name__ == "__main__":
-     demo.queue(max_size=20).launch()
+ # The rest of your code remains the same
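
Note: the new app.py as committed is only 8 lines and still references AutoTokenizer, tokenizer_id, spaces, gr, Iterator, and model, all of which were defined only in the deleted lines. A minimal sketch of the top-level setup the new file appears to assume, reconstructed from the deleted code (values and names are copied from the removed lines; the pad-token check is written with "is None" here):

# Sketch only: top-level setup reconstructed from the lines this diff deletes.
import gradio as gr
import spaces
import torch
from typing import Iterator
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

MAX_INPUT_TOKEN_LENGTH = 512

if torch.cuda.is_available():
    model_id = "apple/OpenELM-3B-Instruct"
    model = AutoModelForCausalLM.from_pretrained(
        model_id, device_map="auto", trust_remote_code=True, low_cpu_mem_usage=True
    )
    # OpenELM reuses the Llama-2 tokenizer, so access to meta-llama/Llama-2-7b-hf is required
    tokenizer_id = "meta-llama/Llama-2-7b-hf"
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
    if tokenizer.pad_token is None:
        # Fall back to the EOS token so generation has a valid padding id
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id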
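The comment "# ... rest of the function ..." elides the streaming body that the deleted lines contained: trim the prompt to MAX_INPUT_TOKEN_LENGTH, run model.generate() on a background thread, and yield partial text from a TextIteratorStreamer. A sketch of that body, continuing from the setup above; the first five parameters of generate() are not visible in this diff and are inferred from the deleted body and the removed ChatInterface sliders, so treat the signature and its defaults as assumptions:

# Sketch only: generate() body reconstructed from the deleted lines.
from threading import Thread

@spaces.GPU
def generate(
    message: str,
    chat_history: list,          # passed by gr.ChatInterface but unused by this model
    max_new_tokens: int = 256,
    temperature: float = 0.6,
    top_p: float = 0.9,
    top_k: int = 50,
    repetition_penalty: float = 1.4,
) -> Iterator[str]:
    global tokenizer, model  # matches the line added in this commit

    input_ids = tokenizer([message], return_tensors="pt").input_ids
    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
        # Keep only the most recent tokens if the prompt is too long
        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
        gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
    input_ids = input_ids.to(model.device)

    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=top_p,
        top_k=top_k,
        temperature=temperature,
        num_beams=1,
        pad_token_id=tokenizer.eos_token_id,
        repetition_penalty=repetition_penalty,
        no_repeat_ngram_size=5,
    )
    # model.generate() blocks, so run it on a worker thread and stream from here
    Thread(target=model.generate, kwargs=generate_kwargs).start()

    outputs = []
    for text in streamer:
        outputs.append(text)
        yield "".join(outputs)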