import tempfile

import streamlit as st
import torch
from PIL import Image
from gtts import gTTS
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

# Streamlit config must be the first Streamlit call in the script
st.set_page_config(page_title="Magic Story Generator", layout="centered", page_icon="📖")

# Model loading cached for performance: runs once per process, not on every rerun
@st.cache_resource
def load_models():
    caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
    story_model = AutoModelForCausalLM.from_pretrained(
        "Qwen/Qwen3-1.7B",
        device_map="auto",
        torch_dtype=torch.float16,
        trust_remote_code=True,
    )
    story_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-1.7B", trust_remote_code=True)
    return caption_model, story_model, story_tokenizer

# Initialize models
caption_pipe, story_model, story_tokenizer = load_models()

# Main app interface
st.title("📖 Instant Story Generator")
uploaded_image = st.file_uploader("Upload an image:", type=["jpg", "jpeg", "png"])

if uploaded_image:
    img = Image.open(uploaded_image).convert("RGB")
    st.image(img, caption="Your Image", use_container_width=True)

    # Generate a caption describing the uploaded image
    caption = caption_pipe(img)[0]["generated_text"]

    # Generate story: the system message sets the task, the user message carries the caption
    messages = [
        {"role": "system", "content": "You write children's stories of 50 to 100 words."},
        {"role": "user", "content": f"Write a story based on: {caption}"},
    ]
    inputs = story_tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        enable_thinking=False,  # Qwen3-specific: skip the <think> block so only the story is generated
        return_tensors="pt",
    ).to(story_model.device)
    outputs = story_model.generate(
        inputs,
        max_new_tokens=300,
        do_sample=True,  # required for temperature/top_p to have any effect
        temperature=0.7,
        top_p=0.9,
    )

    # Display results: decode only the newly generated tokens, not the prompt
    story = story_tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)
    st.subheader("Generated Story")
    st.write(story)

    # Audio conversion: synthesize the story as MP3 and play it back
    audio = gTTS(text=story, lang="en")
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as fp:
        audio.save(fp.name)
    st.audio(fp.name, format="audio/mp3")
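
# Usage (assuming this file is saved as app.py and the packages below are installed):
#   pip install streamlit torch pillow gTTS transformers accelerate
#   streamlit run app.py
# Note: accelerate is needed for device_map="auto"; a GPU is recommended for float16 inference.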