Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -143,8 +143,15 @@ def generate(input_dict: dict, chat_history: list[dict],
         {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
         {"role": "user", "content": [{"type": "text", "text": prompt}]}
     ]
+    # Explicitly enable truncation to avoid token/feature mismatch.
     inputs = processor.apply_chat_template(
-        messages, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt"
+        messages,
+        tokenize=True,
+        add_generation_prompt=True,
+        return_dict=True,
+        return_tensors="pt",
+        truncation=True,
+        max_length=MAX_INPUT_TOKEN_LENGTH
     ).to("cuda")
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {
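This hunk bounds the tokenized prompt length by passing truncation and max_length through apply_chat_template. A minimal sketch of that call outside the Space, assuming a Qwen2.5-VL checkpoint and a placeholder MAX_INPUT_TOKEN_LENGTH (the Space defines its own constant and model):

# Sketch only: the checkpoint and MAX_INPUT_TOKEN_LENGTH value are
# illustrative assumptions, not values taken from the Space.
from transformers import AutoProcessor

MAX_INPUT_TOKEN_LENGTH = 4096  # assumed cap

processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
messages = [
    {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
    {"role": "user", "content": [{"type": "text", "text": "word " * 20000}]},  # deliberately oversized
]
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
    truncation=True,
    max_length=MAX_INPUT_TOKEN_LENGTH,
)
# With truncation=True the sequence is clipped to the cap instead of being
# passed through at full length.
print(inputs["input_ids"].shape[-1])  # <= MAX_INPUT_TOKEN_LENGTH

Note that transformers tokenizers only apply max_length when truncation is explicitly enabled, which is why the commit sets both together.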
@@ -184,7 +191,15 @@ def generate(input_dict: dict, chat_history: list[dict],
         ]
     }]
     prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    inputs = processor(text=[prompt_full], images=images, return_tensors="pt", padding=True).to("cuda")
+    # Enable truncation explicitly here as well.
+    inputs = processor(
+        text=[prompt_full],
+        images=images,
+        return_tensors="pt",
+        padding=True,
+        truncation=True,
+        max_length=MAX_INPUT_TOKEN_LENGTH
+    ).to("cuda")
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
     thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
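The second hunk applies the same cap on the image path, where the text prompt and the image tensors must be built together in one processor call. A hedged sketch of that path, with the image file, checkpoint, and constant again assumed:

# Sketch of the image+text call; example.jpg and MAX_INPUT_TOKEN_LENGTH are
# placeholders, while the processor call itself mirrors the diff.
from PIL import Image
from transformers import AutoProcessor

MAX_INPUT_TOKEN_LENGTH = 4096  # assumed cap
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")

images = [Image.open("example.jpg")]  # placeholder image
messages = [{
    "role": "user",
    "content": [
        {"type": "image"},
        {"type": "text", "text": "Describe this image."},
    ],
}]
prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
    text=[prompt_full],
    images=images,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=MAX_INPUT_TOKEN_LENGTH,
)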
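Both hunks then hand inputs to the same streaming pattern: model_m.generate runs on a worker thread while TextIteratorStreamer yields decoded text. A sketch assuming model_m, processor, and inputs exist as in app.py:

# Streaming pattern used after both calls. model_m, processor, and the
# prepared inputs dict are assumed from the surrounding app.py.
from threading import Thread
from transformers import TextIteratorStreamer

def stream_reply(inputs, max_new_tokens=512):
    # The streamer decodes tokens as generate() produces them on the worker thread.
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
    thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
    thread.start()
    buffer = ""
    for chunk in streamer:  # blocks until the next decoded piece is ready
        buffer += chunk
        yield buffer        # e.g. consumed by a Gradio streaming callback
    thread.join()

Because the prompt is now capped at MAX_INPUT_TOKEN_LENGTH before it reaches generate(), the streamed path no longer risks the token/feature mismatch the commit comment describes.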