from transformers import TextStreamer


def prepare_inference_inputs(tokenizer, template, instruction, input_text, device="cuda"):
    """
    Prepares the inputs for inference by formatting the prompt and tokenizing it.

    Args:
    - tokenizer: The tokenizer used for tokenization.
    - template: The template string for the prompt format.
    - instruction: The instruction to be included in the prompt.
    - input_text: The input to be included in the prompt.
    - device: The device to move the tensors to ('cuda' or 'cpu').

    Returns:
    - Tokenized inputs ready for inference.
    """
    # Leave the response slot empty so the model completes it. The EOS token
    # belongs at the end of *training* examples, not the inference prompt;
    # appending it here would signal that the sequence is already finished.
    prompt = template.format(instruction, input_text, "")
    return tokenizer([prompt], return_tensors="pt").to(device)
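
As a usage sketch, assume an Alpaca-style template with three {} slots and a fine-tuned causal language model; the template string and checkpoint name below are illustrative assumptions, not part of the helpers above:

from transformers import AutoModelForCausalLM, AutoTokenizer

# Hypothetical Alpaca-style template; any template with three {} slots works.
ALPACA_TEMPLATE = (
    "Below is an instruction that describes a task, paired with an input that "
    "provides further context. Write a response that appropriately completes "
    "the request.\n\n"
    "### Instruction:\n{}\n\n### Input:\n{}\n\n### Response:\n{}"
)

model_name = "your-username/your-finetuned-model"  # assumption: substitute your own checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to("cuda")

inputs = prepare_inference_inputs(
    tokenizer,
    ALPACA_TEMPLATE,
    instruction="Summarize the following text.",
    input_text="Transformers are a family of neural networks built around self-attention.",
)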
def generate_responses(model, inputs, tokenizer, max_new_tokens=64):
    """
    Generates responses from the model based on the provided inputs.

    Args:
    - model: The pre-trained model used for generation.
    - inputs: The tokenized inputs to generate responses from.
    - tokenizer: The tokenizer used to decode the output.
    - max_new_tokens: The maximum number of tokens to generate.

    Returns:
    - Decoded responses from the model.
    """
    # use_cache=True enables the key/value cache for faster autoregressive decoding.
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, use_cache=True)
    # skip_special_tokens=True drops tokens such as BOS/EOS from the decoded text.
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)
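
Continuing the sketch above, the tokenized inputs go straight into generate_responses. Note that batch_decode is applied to the full output sequence, so each decoded string contains the prompt followed by the generated continuation:

responses = generate_responses(model, inputs, tokenizer, max_new_tokens=64)
print(responses[0])  # prompt + generated answer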
def stream_responses(model, inputs, tokenizer, max_new_tokens=128):
    """
    Streams the model's response to stdout using a text streamer.

    Args:
    - model: The pre-trained model used for generation.
    - inputs: The tokenized inputs to generate responses from.
    - tokenizer: The tokenizer used to decode the output.
    - max_new_tokens: The maximum number of tokens to generate.

    Returns:
    - None; the output is printed token by token as it is generated.
    """
    # TextStreamer decodes and prints each token as soon as it is produced,
    # so the return value of model.generate is not needed here.
    text_streamer = TextStreamer(tokenizer)
    model.generate(**inputs, streamer=text_streamer, max_new_tokens=max_new_tokens)
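
Continuing the same sketch, streaming suits interactive settings where tokens should appear as soon as they are generated rather than after the full completion:

stream_responses(model, inputs, tokenizer, max_new_tokens=128)

By default TextStreamer also echoes the prompt; constructing it as TextStreamer(tokenizer, skip_prompt=True) inside stream_responses would print only the newly generated tokens.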