# app.py

import streamlit as st
from PIL import Image
from io import BytesIO
from huggingface_hub import InferenceApi
from gtts import gTTS
import tempfile
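
# Assumed environment for this Space (a sketch, not pinned versions).
# A matching requirements.txt would need roughly:
#     streamlit
#     Pillow
#     huggingface_hub
#     gTTS
# The HF token is expected in the Space's secrets (or, when running locally,
# in .streamlit/secrets.toml) under the key:
#     HF_TOKEN = "hf_..."   # placeholder, use your own token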

# --------- Page config
st.set_page_config(page_title="Storyteller for Kids", layout="centered")
st.title("🖼️ ➡️ 📖 Interactive Storyteller")

# --------- Inference clients (cached)
@st.cache_resource
def load_clients():
    # read your HF token from Space secrets
    hf_token = st.secrets["HF_TOKEN"]

    # caption client: BLIP-base via HF Image-to-Text API
    caption_client = InferenceApi(
        repo_id="Salesforce/blip-image-captioning-base",
        task="image-to-text",
        token=hf_token
    )
    # story client: DeepSeek-R1-Distill via HF Text-Generation API
    story_client = InferenceApi(
        repo_id="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
        task="text-generation",
        token=hf_token
    )
    return caption_client, story_client

caption_client, story_client = load_clients()
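
# Note: InferenceApi is the legacy client; huggingface_hub also ships
# InferenceClient, which wraps the same hosted endpoints with typed helpers.
# A rough equivalent (an untested sketch; return shapes vary by library version):
#
#     from huggingface_hub import InferenceClient
#     captioner = InferenceClient(model="Salesforce/blip-image-captioning-base", token=hf_token)
#     cap_text = captioner.image_to_text(buf.getvalue())
#     storyteller = InferenceClient(model="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", token=hf_token)
#     story = storyteller.text_generation(prompt, max_new_tokens=120, do_sample=True, temperature=0.7)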

# --------- Main UI
uploaded = st.file_uploader("Upload an image:", type=["jpg", "jpeg", "png"])
if not uploaded:
    st.info("Please upload an image (JPG/PNG) to begin.")
else:
    # 1) Display the image
    img = Image.open(uploaded).convert("RGB")
    st.image(img, use_container_width=True)

    # 2) Caption via HF Inference API
    with st.spinner("🔍 Generating caption..."):
        buf = BytesIO()
        img.save(buf, format="PNG")
        caption_output = caption_client(data=buf.getvalue())
        # handle API return formats: the image-to-text endpoint usually
        # returns a list of dicts, but guard the dict and raw-text cases too
        if isinstance(caption_output, list) and caption_output:
            cap_text = caption_output[0].get("generated_text", "").strip()
        elif isinstance(caption_output, dict):
            cap_text = caption_output.get("generated_text", "").strip()
        else:
            cap_text = str(caption_output).strip()
    st.markdown(f"**Caption:** {cap_text}")

    # 3) Build prompt
    prompt = (
        f"Here’s an image description: “{cap_text}”.\n\n"
        "Write an 80–100 word playful story for 3–10 year-old children that:\n"
        "1) Describes the scene and main subject.\n"
        "2) Explains what it’s doing and how it feels.\n"
        "3) Concludes with a fun, imaginative ending.\n\n"
        "Story:"
    )

    # 4) Story via HF Inference API
    with st.spinner("✍️ Generating story..."):
        story_output = story_client(
            inputs=prompt,
            params={
                "max_new_tokens": 120,
                "do_sample": True,
                "temperature": 0.7,
                "top_p": 0.9,
                "top_k": 50,
                "repetition_penalty": 1.2,
                "no_repeat_ngram_size": 3,
                # don't echo the prompt back in generated_text
                "return_full_text": False
            }
        )
        # API returns a list of generations or a single dict
        if isinstance(story_output, list):
            story = story_output[0].get("generated_text", "").strip()
        else:
            story = story_output.get("generated_text", "").strip()
        # some backends ignore return_full_text and echo the prompt; strip it
        if story.startswith(prompt):
            story = story[len(prompt):].strip()
    st.markdown("**Story:**")
    st.write(story)

    # 5) Text-to-Speech via gTTS
    with st.spinner("🔊 Converting to speech..."):
        tts = gTTS(text=story, lang="en")
        # write the MP3 to a closed temp file so st.audio can re-open it by path
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp:
            tts.write_to_fp(tmp)
            audio_path = tmp.name
    st.audio(audio_path, format="audio/mp3")
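
# To run locally (assuming the dependencies listed above are installed and
# .streamlit/secrets.toml provides HF_TOKEN):
#     streamlit run app.py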