# app.py
import streamlit as st
from PIL import Image
from io import BytesIO
from huggingface_hub import InferenceApi
from gtts import gTTS
import tempfile
# —––––––– Page config
# NOTE: st.set_page_config must be the first Streamlit call in the script,
# so this stays ahead of every other st.* statement.
st.set_page_config(page_title="Storyteller for Kids", layout="centered")
# App header shown at the top of the page.
st.title("🖼️ ➡️ 📖 Interactive Storyteller")
# —––––––– Inference clients (cached)
@st.cache_resource
def load_clients():
    """Build and cache the two Hugging Face Inference API clients.

    Returns:
        tuple: (caption client, story client). The first turns an uploaded
        image into a text caption (BLIP); the second turns a prompt into a
        short story (distilled Qwen LLM).

    The HF token is read from Streamlit secrets; ``st.cache_resource``
    ensures the clients are constructed once per session, not on every
    script rerun.
    """
    token = st.secrets["HF_TOKEN"]
    captioner = InferenceApi(
        repo_id="Salesforce/blip-image-captioning-base",
        task="image-to-text",
        token=token,
    )
    storyteller = InferenceApi(
        repo_id="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
        task="text-generation",
        token=token,
    )
    return captioner, storyteller


caption_client, story_client = load_clients()
# —––––––– Main UI
uploaded = st.file_uploader("Upload an image:", type=["jpg", "jpeg", "png"])
if not uploaded:
    st.info("Please upload a JPG/PNG image to begin.")
else:
    def _generated_text(out):
        """Unwrap an HF Inference API response into plain text.

        The API may return ``[{"generated_text": ...}]``, a bare dict,
        or something else entirely; always return a stripped string.
        """
        if isinstance(out, list) and out:
            return out[0].get("generated_text", "").strip()
        if isinstance(out, dict):
            return out.get("generated_text", "").strip()
        return str(out).strip()

    # 1) Display image
    img = Image.open(uploaded).convert("RGB")
    st.image(img, use_container_width=True)

    # 2) Generate caption from the raw PNG bytes
    with st.spinner("🔍 Generating caption..."):
        buf = BytesIO()
        img.save(buf, format="PNG")
        cap_out = caption_client(data=buf.getvalue())
    cap_text = _generated_text(cap_out)
    if not cap_text:
        st.error("😕 Couldn’t generate a caption. Try another image.")
        st.stop()
    st.markdown(f"**Caption:** {cap_text}")

    # 3) Build prompt
    prompt = (
        f"Here’s an image description: “{cap_text}”.\n\n"
        "Write an 80–100 word playful story for 3–10 year-old children that:\n"
        "1) Describes the scene and main subject.\n"
        "2) Explains what it’s doing and how it feels.\n"
        "3) Concludes with a fun, imaginative ending.\n\n"
        "Story:"
    )

    # 4) Generate story via HF Inference API
    with st.spinner("✍️ Generating story..."):
        # BUG FIX: InferenceApi.__call__ takes `params=`, not `parameters=`
        # (the original comment even said so, but the code still used the
        # wrong keyword, which raises a TypeError at call time).
        story_out = story_client(
            inputs=prompt,
            params={
                "max_new_tokens": 120,
                "do_sample": True,
                "temperature": 0.7,
                "top_p": 0.9,
                "top_k": 50,
                "repetition_penalty": 1.2,
                "no_repeat_ngram_size": 3,
            },
        )
    story = _generated_text(story_out)
    # text-generation endpoints commonly echo the prompt at the start of
    # generated_text; drop it so the child only hears/reads the story.
    if story.startswith(prompt):
        story = story[len(prompt):].strip()
    if not story:
        st.error("😕 Couldn’t generate a story. Please try again!")
        st.stop()
    st.markdown("**Story:**")
    st.write(story)

    # 5) Text-to-Speech via gTTS
    with st.spinner("🔊 Converting to speech..."):
        tts = gTTS(text=story, lang="en")
        # delete=False so the file outlives the handle for st.audio;
        # the `with` closes (and flushes) the handle before playback.
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp:
            tts.write_to_fp(tmp)
            audio_path = tmp.name
    st.audio(audio_path, format="audio/mp3")