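"""Gradio chat app for the Electricarchmage/cookbookgpt GPT-2 model.

Loads the model and tokenizer from the Hugging Face Hub, rebuilds the chat
history into a single prompt for generation, and reports the generation time
in each reply for debugging.
"""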
import gradio as gr
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
import time
# Load model and tokenizer from Hugging Face Hub
model_name = "Electricarchmage/cookbookgpt"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
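
# Inference-only setup: eval() disables dropout so output variance comes only
# from the sampling settings. A GPU move (e.g. model.to("cuda")) is a possible
# speedup, but this script keeps the model on CPU so the calls below work as-is.
model.eval()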
# Set the pad_token to eos_token and padding_side to 'left'
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'left'
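# Left padding matters for decoder-only models such as GPT-2: generation
# continues from the last input token, so right padding would leave pad tokens
# between the prompt and the new tokens. (With the single-sequence prompt built
# below, no padding is actually applied.)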
# Define the respond function with logging for debugging
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    # Rebuild the conversation (system message, history, new message) as a
    # single prompt. GPT-2 has no chat template, so plain role labels are used;
    # the trailing "Assistant:" cue marks where the model should continue.
    prompt = f"System: {system_message}\n"
    for user_turn, assistant_turn in history:
        if user_turn:
            prompt += f"User: {user_turn}\n"
        if assistant_turn:
            prompt += f"Assistant: {assistant_turn}\n"
    prompt += f"User: {message}\nAssistant:"

    # Tokenize the prompt as a single sequence
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
    attention_mask = inputs.get("attention_mask", torch.ones_like(inputs["input_ids"]))

    start_time = time.time()  # Start the timer

    # Generate output tokens
    try:
        output = model.generate(
            inputs["input_ids"],
            attention_mask=attention_mask,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            num_return_sequences=1,
            do_sample=True,
            no_repeat_ngram_size=2,
            pad_token_id=tokenizer.eos_token_id,  # GPT-2 has no dedicated pad token
        )
    except Exception as e:
        return f"Error during generation: {str(e)}"

    generation_time = time.time() - start_time  # Time taken for generation
    # Decode the output tokens (prompt plus generated continuation) into text
    response = tokenizer.decode(output[0], skip_special_tokens=True)

    # Extract only the assistant's reply: the text after the final "Assistant:" label
    assistant_reply = response.split("Assistant:")[-1].strip()

    # Include the generation time in the response for debugging
    return f"Response: {assistant_reply}\nGeneration time: {generation_time:.2f} seconds"
# Define the Gradio interface
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)
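
# Note: the additional_inputs above are passed positionally to respond() after
# (message, history), i.e. system_message, max_tokens, temperature, top_p.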
# Launch the app
if __name__ == "__main__":
    demo.launch()
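    # demo.launch(share=True) would additionally create a temporary public URL.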