Spaces: Running on Zero
import spaces
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from threading import Thread
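
# ZeroGPU: the @spaces.GPU decorator below attaches a GPU to this Space only
# while the decorated function is executing, which is what lets the Space
# "run on Zero" (on-demand GPU) hardware.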
@spaces.GPU
def predict(message, history):
    torch.set_default_device("cuda")

    # Load model and tokenizer
    model_id = "LiquidAI/LFM2-1.2B"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        load_in_4bit=True,  # Keeping 4-bit quantization for efficiency (requires bitsandbytes;
                            # newer transformers prefer quantization_config=BitsAndBytesConfig(load_in_4bit=True))
        # attn_implementation="flash_attention_2",  # Uncomment on a compatible GPU
    )
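
    # Note: the model and tokenizer above are re-initialized on every request
    # (the download is cached, but weights are reloaded each call). A common
    # ZeroGPU alternative is to load them once at module level and keep only
    # the inference work inside the @spaces.GPU-decorated function.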
    # Format conversation history for the chat template
    # (with tuple-style history, each item is a [user_message, assistant_message] pair)
    messages = [
        {"role": "user" if i % 2 == 0 else "assistant", "content": msg}
        for conv in history
        for i, msg in enumerate(conv)
        if msg
    ]
    messages.append({"role": "user", "content": message})

    # Apply chat template
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        tokenize=True
    ).to('cuda')
    # Setup streamer for real-time output
    streamer = TextIteratorStreamer(
        tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
    )

    # Generation parameters
    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.3,
        min_p=0.15,
        repetition_penalty=1.05,
        pad_token_id=tokenizer.eos_token_id,
    )
    # Start generation in a separate thread: model.generate() blocks until it
    # finishes, so running it in the background lets us stream tokens as they arrive
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    # Stream tokens, yielding the accumulated reply so far
    partial_message = ""
    for new_token in streamer:
        partial_message += new_token
        yield partial_message
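
# Because predict() is a generator that yields the growing reply string,
# gr.ChatInterface renders it as a streaming response: each yield replaces
# the partial message shown so far in the chat window.
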
# Setup Gradio interface
gr.ChatInterface(
    predict,
    description="""
<center><h2>LiquidAI LFM2-1.2B Chat</h2></center>
Chat with [LiquidAI/LFM2-1.2B](https://huggingface.co/LiquidAI/LFM2-1.2B), a compact and efficient language model.
This model provides high-quality responses while maintaining a small footprint, making it ideal for fast inference.
""",
    examples=[
        'Can you solve the equation 2x + 3 = 11 for x?',
        'What is C. elegans?',
        'Explain quantum computing in simple terms',
        'Write a Python function to find prime numbers',
        'What are the key differences between RNA and DNA?',
        'Can you write a haiku about artificial intelligence?'
    ],
    theme=gr.themes.Soft(primary_hue="blue"),
).launch()
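
# Dependency note (inferred from the imports above, not part of the original file):
# a requirements.txt for this Space would need roughly:
#   spaces
#   gradio
#   torch
#   transformers
#   accelerate      # needed for device_map="auto"
#   bitsandbytes    # needed for load_in_4bit=True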