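"""Gradio demo that streams chat responses from the
genaforvena/huivam_finnegan_llama3.2-1b model using a custom TextStreamer."""
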
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer, GenerationConfig
import torch
import threading
from queue import Queue, Empty


# Custom streamer that buffers generated tokens and pushes the decoded text
# into a queue so the Gradio generator can read it from another thread.
class MyStreamer(TextStreamer):
    def __init__(self, tokenizer, skip_prompt=True, **decode_kwargs):
        # Pop skip_special_tokens (default True) before handing the remaining
        # kwargs to TextStreamer, so it is not passed twice when decoding below.
        self.skip_special_tokens = decode_kwargs.pop("skip_special_tokens", True)
        super().__init__(tokenizer, skip_prompt=skip_prompt, **decode_kwargs)
        self.text_queue = Queue()
        self.stop_signal = None
        self.token_cache = []  # Tokens generated so far

    def on_finalized_text(self, text, stream_end=False):
        """Put the decoded text in the queue."""
        self.text_queue.put(text)

    def put(self, value):
        """Buffer the new tokens and emit the decoded text so far."""
        if len(value.shape) > 1 and value.shape[0] > 1:
            raise ValueError("put() only supports a single sequence of tokens at a time.")
        elif len(value.shape) > 1:
            value = value[0]

        # Skip the prompt tokens passed in on the first call
        if self.skip_prompt and self.next_tokens_are_prompt:
            self.next_tokens_are_prompt = False
            return

        # Add the new tokens to the cache
        self.token_cache.extend(value.tolist())

        # Decode the entire cache so far
        text = self.tokenizer.decode(
            self.token_cache,
            skip_special_tokens=self.skip_special_tokens,
            **self.decode_kwargs,
        )

        # Check for the stop signal (e.g. an end-of-text marker), if one is set
        if self.stop_signal and text.endswith(self.stop_signal):
            text = text[: -len(self.stop_signal)]
            self.on_finalized_text(text, stream_end=True)
            self.token_cache = []  # Clear the cache
        else:
            self.on_finalized_text(text, stream_end=False)

    def end(self):
        """Flush any remaining buffered text when generation ends."""
        if self.token_cache:
            text = self.tokenizer.decode(
                self.token_cache,
                skip_special_tokens=self.skip_special_tokens,
                **self.decode_kwargs,
            )
            self.on_finalized_text(text, stream_end=True)
            self.token_cache = []  # Clear the cache
        else:
            self.on_finalized_text("", stream_end=True)


# Load the model and tokenizer
model_name = "genaforvena/huivam_finnegan_llama3.2-1b"
model = None
tokenizer = None
try:
    model = AutoModelForCausalLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    print("Model and tokenizer loaded successfully.")
except Exception as e:
    print(f"Error loading model/tokenizer: {e}")
    exit()

# Move the model to the appropriate device
device = "cuda" if torch.cuda.is_available() else "cpu"
if model:
    model.to(device)
    print(f"Model moved to {device}.")


# Generate a streaming response for a single prompt
def reply(prompt):
    messages = [{"role": "user", "content": prompt}]
    try:
        inputs = tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
        ).to(device)

        # Create a custom streamer
        streamer = MyStreamer(tokenizer, skip_prompt=True)

        generation_config = GenerationConfig(
            # Fall back to the EOS token when the tokenizer defines no pad token
            pad_token_id=(
                tokenizer.pad_token_id
                if tokenizer.pad_token_id is not None
                else tokenizer.eos_token_id
            ),
        )

        # Run generation in a background thread so we can stream from the queue
        def generate():
            model.generate(
                inputs,
                generation_config=generation_config,
                streamer=streamer,
                max_new_tokens=512,  # Adjust as needed
            )

        thread = threading.Thread(target=generate)
        thread.start()

        # Yield the decoded text so far as it arrives; Gradio replaces the
        # output with each yielded value, so the response appears to grow.
        while thread.is_alive():
            try:
                yield streamer.text_queue.get(timeout=0.1)
            except Empty:
                pass

        # Yield any remaining text after generation finishes
        while not streamer.text_queue.empty():
            yield streamer.text_queue.get()
    except Exception as e:
        print(f"Error during inference: {e}")
        yield f"Error processing your request: {e}"


# Gradio interface
demo = gr.Interface(
    fn=reply,
    inputs="text",
    outputs="text",
)

# Launch the Gradio app
demo.launch(share=True)
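# Note: share=True requests a temporary public gradio.live link in addition to
# the local server; set share=False to serve locally only.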