Spaces: Sleeping
Update app.py
Browse files
app.py CHANGED
@@ -48,16 +48,12 @@ def convert_history(chat_history, max_input_length=1024):
     return history_text
 
 @spaces.GPU
+@torch.inference_mode()
 def instruct(instruction, max_token_output=1024):
     input_text = instruction
-    [removed line — content not captured in this extraction]
-    [removed line — content not captured in this extraction]
-    [removed line — content not captured in this extraction]
-    input_ids["attention_mask"] = input_ids["attention_mask"].cuda()
-    generation_kwargs = dict(input_ids, streamer=streamer, max_new_tokens=max_token_output, do_sample=False)
-    thread = Thread(target=model.generate, kwargs=generation_kwargs)
-    thread.start()
-    return streamer
+    input_ids = tokenizer(input_text, return_tensors='pt', truncation=True).to('cuda')
+    outputs = model.generate(**input_ids, max_length=max_token_output, do_sample=False)
+    return tokenizer.decode(outputs[0], skip_special_tokens=True)
 
 
 with gr.Blocks() as demo: