# app.py
import streamlit as st
import torch
from unsloth import FastLanguageModel
from transformers import TextStreamer

# Cache the loaded model across reruns so repeated queries don't reload it
# (st.cache_resource requires Streamlit 1.18+).
@st.cache_resource
def load_unsloth_model(
    model_name="azizsi/model2",
    max_seq_length=4096,
    dtype=torch.float16,   # or None to let Unsloth pick the best dtype automatically
    load_in_4bit=False,    # set True to load a 4-bit quantized model on small GPUs
):
""" | |
Loads and prepares the model for inference using FastLanguageModel from Unsloth. | |
Returns (model, tokenizer). | |
""" | |
model, tokenizer = FastLanguageModel.from_pretrained( | |
model_name=model_name, | |
max_seq_length=max_seq_length, | |
dtype=dtype, | |
load_in_4bit=load_in_4bit | |
) | |
# Enable 2x faster inference (per Unsloth docs) | |
FastLanguageModel.for_inference(model) | |
return model, tokenizer | |
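
# Example (illustrative, not in the original app): on a memory-constrained GPU
# you could load a 4-bit quantized copy of the model instead:
#     model, tokenizer = load_unsloth_model(load_in_4bit=True)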

def main():
    st.title("Unsloth Model Demo")

    # Text input area for the user's prompt
    user_input = st.text_area("Enter your prompt:", "")

    # Generate button
    if st.button("Generate"):
        with st.spinner("Generating response..."):
            # Fetch the (cached) model & tokenizer
            model, tokenizer = load_unsloth_model()

            # TextStreamer prints tokens to the server console as they are
            # generated; the page below shows only the final decoded text
            streamer = TextStreamer(tokenizer, skip_prompt=True)

            # Tokenize the prompt and move it to the model's device
            inputs = tokenizer(user_input, return_tensors="pt").to(model.device)

            # Generate up to 128 new tokens (adjust as desired)
            outputs = model.generate(**inputs, streamer=streamer, max_new_tokens=128)

            # generate() returns the prompt plus the completion; slice off the
            # prompt tokens so only the model's answer is displayed
            new_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
            generated_text = tokenizer.decode(new_tokens, skip_special_tokens=True)

            st.markdown("**Response:**")
            st.write(generated_text)

if __name__ == "__main__":
    main()
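
Run the app locally with: streamlit run app.py

Optional: TextStreamer only prints tokens to the server console, so the page
shows nothing until generation finishes. Below is a minimal sketch of in-page
streaming, assuming transformers' TextIteratorStreamer and st.write_stream
(available in Streamlit 1.31+); treat it as an illustration of the idea, not
part of the original app:

from threading import Thread
from transformers import TextIteratorStreamer

def stream_response(model, tokenizer, prompt, max_new_tokens=128):
    """Yield decoded text chunks as the model generates them."""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    # generate() blocks until done, so run it in a background thread and
    # consume the streamed chunks from this generator
    Thread(
        target=model.generate,
        kwargs=dict(**inputs, streamer=streamer, max_new_tokens=max_new_tokens),
    ).start()
    yield from streamer

# Inside main(), the decode-and-display lines would then become:
#     st.write_stream(stream_response(model, tokenizer, user_input))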