Somekindofa committed on
Commit
0aba972
·
1 Parent(s): 03a78ae

Refactor generate function to handle input token length and implement threading for model generation

Browse files
Files changed (1) hide show
  1. app.py +30 -5
app.py CHANGED
@@ -6,6 +6,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
6
  import os
7
  import torch
8
  from typing import Optional, Iterator
 
9
 
10
 
11
  # Initialize logging and device information
@@ -79,11 +80,35 @@ def generate(
79
  conversation.append({"role": "user", "content": message})
80
 
81
  input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
82
- print("\nInput Ids:\t", input_ids, "Type:\t", type(input_ids))
83
- for i, token in enumerate(input_ids):
84
- print(f"ID {i}:", token[0])
85
- print(f"ID {token[0]} -> '{tokenizer.decode(token[0])}'")
86
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
 
88
  chat_interface = gr.ChatInterface(
89
  fn=generate,
 
6
  import os
7
  import torch
8
  from typing import Optional, Iterator
9
+ from threading import Thread
10
 
11
 
12
  # Initialize logging and device information
 
80
  conversation.append({"role": "user", "content": message})
81
 
82
  input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
83
+ print("Input Ids Shape: ", input_ids.shape)
84
+ if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
85
+ gr.Warning(f"Trimmed the input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
86
+ input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH]
87
+ input_ids = input_ids.to(model.device)
88
+
89
+ streamer = TextIteratorStreamer(tokenizer,
90
+ timeout=10.0,
91
+ skip_prompt=True,
92
+ skip_special_tokens=True)
93
+ generate_kwargs = dict(
94
+ {"input_ids": input_ids},
95
+ streamer=streamer,
96
+ max_new_tokens=max_new_tokens,
97
+ do_sample=True,
98
+ top_p=top_p,
99
+ top_k=top_k,
100
+ temperature=temperature,
101
+ num_beams=1,
102
+ repetition_penalty=repetition_penalty
103
+ )
104
+
105
+ t = Thread(target=model.generate, kwargs=generate_kwargs)
106
+ t.start()
107
+
108
+ outputs = []
109
+ for text in streamer:
110
+ outputs.append(text)
111
+ yield "".join(outputs)
112
 
113
  chat_interface = gr.ChatInterface(
114
  fn=generate,