[email protected] committed
Commit 86216f9 · 1 Parent(s): c5af4d7

Stream output with TextIteratorStreamer

Files changed (1):
  1. app.py +24 -11
app.py CHANGED
@@ -9,8 +9,9 @@ model_id: "eltorio/Llama-3.2-3B-appreciation"
 Author: Ronan Le Meillat
 License: AGPL-3.0
 """
+from threading import Thread
 import gradio as gr
-from transformers import AutoProcessor, AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoProcessor, AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
 from peft import AutoPeftModelForCausalLM
 import torch
 import os
@@ -77,22 +78,34 @@ def infere(trimestre: str, moyenne_1: float,moyenne_2: float,moyenne_3: float, c
     """,
     duration=500)
     messages = get_conversation(trimestre, moyenne_1, moyenne_2, moyenne_3, comportement, participation, travail)
+
     # Tokenize the input
     inputs = tokenizer.apply_chat_template(
         messages,
         tokenize = True,
         add_generation_prompt = True,
         return_tensors = "pt",).to(device)
-    # Generate the output
-    outputs = model.generate(input_ids = inputs,
-                             max_new_tokens = 90,
-                             use_cache = True,
-                             temperature = 1.5,
-                             min_p = 0.1,
-                             pad_token_id=tokenizer.eos_token_id,)
-    # Decodes the returned tokens
-    decoded_sequences = tokenizer.batch_decode(outputs[:, inputs.shape[1]:],skip_special_tokens=True)[0]
-    return decoded_sequences
+
+    # Run generation in a separate thread so the UI is not blocked; the text is pulled from the
+    # streamer in the main thread. The timeout lets the streamer surface exceptions from the generation thread.
+    streamer = TextIteratorStreamer(tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True)
+    generate_kwargs = dict(
+        input_ids = inputs,
+        streamer=streamer,
+        max_new_tokens=90,
+        use_cache = True,
+        pad_token_id=tokenizer.eos_token_id,
+    )
+    generation_thread = Thread(target=model.generate, kwargs=generate_kwargs)
+    generation_thread.start()
+
+    # Pull the generated text from the streamer and update the model output incrementally.
+    model_output = ""
+    for new_text in streamer:
+        model_output += new_text
+        yield model_output
+    return model_output
+
 
 # Create a Gradio interface with the infere function and specified title and descriptions
 autoeval = gr.Interface(fn=infere, inputs=[
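
For readers who want the pattern outside the diff: below is a minimal, self-contained sketch of the TextIteratorStreamer approach this commit adopts. The "gpt2" checkpoint and the stream_completion helper are illustrative stand-ins, not part of this Space; the streamer arguments (timeout, skip_prompt, skip_special_tokens) mirror the committed code.

# Stand-alone sketch of the TextIteratorStreamer pattern; not the Space's app.py.
from threading import Thread

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained("gpt2")  # illustrative stand-in model
model = AutoModelForCausalLM.from_pretrained("gpt2").to(device)

def stream_completion(prompt: str):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    # skip_prompt=True yields only newly generated text; the timeout makes the
    # iterator raise instead of hanging forever if the generation thread dies.
    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
    # model.generate blocks until generation finishes, so it runs in a worker
    # thread while this thread drains the streamer's queue.
    thread = Thread(target=model.generate, kwargs=dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=40,
        pad_token_id=tokenizer.eos_token_id,
    ))
    thread.start()
    text = ""
    for chunk in streamer:  # blocks until the next decoded chunk arrives
        text += chunk
        yield text
    thread.join()

for partial in stream_completion("Streaming keeps the UI responsive because"):
    print(partial)

The design point is that generate() is synchronous and the streamer is essentially a queue that generate() feeds as tokens are produced, so the producer needs its own thread while the caller iterates as the consumer.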
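
Relatedly, the reason infere can simply yield partial strings is that recent Gradio versions stream whenever the function passed to gr.Interface is a generator, re-rendering the output on every yield. A minimal demo of that behavior (slow_echo is a hypothetical example, not from this Space):

import time
import gradio as gr

def slow_echo(message: str):
    out = ""
    for ch in message:
        out += ch
        time.sleep(0.05)
        yield out  # each yield replaces the displayed output

gr.Interface(fn=slow_echo, inputs="text", outputs="text").launch()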