Spaces:

macadeliccc
/

liquid_ai_chatbot

Running on Zero

App Files Files Community

liquid_ai_chatbot / app.py

macadeliccc

changed space to use LiquidAI/LFM2-1.2

19a1870 3 days ago

raw

history blame contribute delete

2.67 kB

	import spaces
	import gradio as gr
	import torch
	from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
	from threading import Thread

	def _history_to_messages(history, message):
	    """Convert Gradio chat history plus the new message into chat-template messages.

	    Accepts pair-style entries (``[user, assistant]``) as well as dict-style
	    entries (``{"role": ..., "content": ...}``). Empty turns are skipped.
	    """
	    messages = []
	    for entry in history or []:
	        if isinstance(entry, dict):
	            # Dict-style entries carry their own role; copy only the two
	            # keys the chat template needs and drop any extra keys.
	            role = entry.get("role")
	            content = entry.get("content")
	            if role and content:
	                messages.append({"role": role, "content": content})
	        else:
	            # Pair-style entries: position 0 is the user, position 1 the assistant.
	            for i, msg in enumerate(entry):
	                if msg:
	                    role = "user" if i % 2 == 0 else "assistant"
	                    messages.append({"role": role, "content": msg})
	    messages.append({"role": "user", "content": message})
	    return messages


	@spaces.GPU
	def predict(message, history):
	    """Stream a reply from LiquidAI/LFM2-1.2B for one chat turn.

	    Args:
	        message: The latest user message.
	        history: Earlier turns of the conversation, either as
	            ``[user, assistant]`` pairs or as ``{"role", "content"}`` dicts.

	    Yields:
	        The reply text accumulated so far, growing with each generated chunk.
	    """
	    # Function-scope import: the file-level import list does not bring in this name.
	    from transformers import BitsAndBytesConfig

	    torch.set_default_device("cuda")

	    # Load model and tokenizer
	    # NOTE(review): both are re-loaded on every call. Hoisting them out of
	    # the function would avoid that, but confirm first that 4-bit loading
	    # works outside the @spaces.GPU context.
	    model_id = "LiquidAI/LFM2-1.2B"
	    tokenizer = AutoTokenizer.from_pretrained(model_id)
	    model = AutoModelForCausalLM.from_pretrained(
	        model_id,
	        device_map="auto",
	        torch_dtype=torch.bfloat16,
	        trust_remote_code=True,
	        # Keeping 4-bit quantization for efficiency; passed as a config
	        # object because the bare load_in_4bit= keyword is deprecated.
	        quantization_config=BitsAndBytesConfig(load_in_4bit=True),
	        # attn_implementation="flash_attention_2"  # Uncomment on compatible GPU
	    )

	    # Format conversation history for chat template
	    messages = _history_to_messages(history, message)

	    # Apply chat template
	    input_ids = tokenizer.apply_chat_template(
	        messages,
	        add_generation_prompt=True,
	        return_tensors="pt",
	        tokenize=True,
	    ).to('cuda')

	    # Setup streamer for real-time output
	    streamer = TextIteratorStreamer(
	        tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True
	    )

	    # Generation parameters
	    generate_kwargs = dict(
	        input_ids=input_ids,
	        streamer=streamer,
	        max_new_tokens=256,
	        do_sample=True,
	        temperature=0.3,
	        min_p=0.15,
	        repetition_penalty=1.05,
	        pad_token_id=tokenizer.eos_token_id,
	    )

	    # Start generation in a separate thread so text can be consumed from
	    # the streamer while generate() is still running.
	    t = Thread(target=model.generate, kwargs=generate_kwargs)
	    t.start()

	    # Stream tokens
	    partial_message = ""
	    for new_token in streamer:
	        partial_message += new_token
	        yield partial_message

	    # The streamer is exhausted; wait for the worker thread to finish
	    # before this function returns.
	    t.join()

	# Assemble the chat UI around `predict`, then start serving it.
	chat_description = """
	<center><h2>LiquidAI LFM2-1.2B Chat</h2></center>

	Chat with [LiquidAI/LFM2-1.2B](https://huggingface.co/LiquidAI/LFM2-1.2B), a compact and efficient language model.

	This model provides high-quality responses while maintaining a small footprint, making it ideal for fast inference.
	"""

	# Starter prompts offered to the user in the interface.
	example_prompts = [
	    'Can you solve the equation 2x + 3 = 11 for x?',
	    'What is C. elegans?',
	    'Explain quantum computing in simple terms',
	    'Write a Python function to find prime numbers',
	    'What are the key differences between RNA and DNA?',
	    'Can you write a haiku about artificial intelligence?',
	]

	demo = gr.ChatInterface(
	    fn=predict,
	    description=chat_description,
	    examples=example_prompts,
	    theme=gr.themes.Soft(primary_hue="blue"),
	)
	demo.launch()