# app.py

import streamlit as st
import torch
from PIL import Image
from transformers import pipeline
from gtts import gTTS
import tempfile
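
# Assumed setup (not specified in this file): streamlit, transformers, torch,
# pillow and gtts installed, with the app launched via `streamlit run app.py`.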

# —––––––– Page config
st.set_page_config(page_title="Storyteller for Kids", layout="centered")
st.title("🖼️ ➡️ 📖 Interactive Storyteller")

# —––––––– Load and warm pipelines
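# st.cache_resource keeps the loaded pipelines in memory across Streamlit
# reruns, so the models are downloaded and initialised only once per process.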
@st.cache_resource
def load_pipelines():
    # Pick the device once: 0 = first CUDA GPU if available, -1 = CPU
    device = 0 if torch.cuda.is_available() else -1

    # BLIP-base for captions
    captioner = pipeline(
        "image-to-text",
        model="Salesforce/blip-image-captioning-base",
        device=device
    )
    # Flan-T5-Large for stories
    storyteller = pipeline(
        "text2text-generation",
        model="google/flan-t5-large",
        device=device
    )
    # Warm-up runs so user-facing calls are fast
    dummy = Image.new("RGB", (384, 384), color=(128, 128, 128))
    captioner(dummy)
    storyteller("Warm up", max_new_tokens=1)
    return captioner, storyteller

captioner, storyteller = load_pipelines()

# —––––––– Main UI
uploaded = st.file_uploader("Upload an image:", type=["jpg", "jpeg", "png"])
if uploaded:
    # 1) Preprocess image
    image = Image.open(uploaded).convert("RGB")
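    # BLIP-base works on 384×384 inputs, so resize the upload to match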
    image = image.resize((384, 384), Image.LANCZOS)
    st.image(image, caption="Your image", use_container_width=True)

    # 2) Caption
    with st.spinner("🔍 Generating caption…"):
        cap = captioner(image)[0]["generated_text"].strip()
    st.markdown(f"**Caption:** {cap}")

    # 3) Build a dynamic prompt
    prompt = (
        f"Here is an image description: “{cap}”.\n"
        "Write an 80–100 word playful story for 3–10 year-old children that:\n"
        "1) Describes the scene and subject from the description.\n"
        "2) Explains what the subject is doing and how it feels.\n"
        "3) Concludes with a fun, imaginative ending.\n\n"
        "Story:"
    )

    # 4) Generate the story
    with st.spinner("✍️ Writing the story…"):
        output = storyteller(
            prompt,
            max_new_tokens=120,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            top_k=50,
            repetition_penalty=1.2,
            no_repeat_ngram_size=3
        )
        story = output[0]["generated_text"].strip()

    st.markdown("**Story:**")
    st.write(story)

    # 5) Text-to-Speech
    with st.spinner("🔊 Converting to speech…"):
        tts = gTTS(text=story, lang="en")
        tmp = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
        tts.write_to_fp(tmp)
        tmp.close()  # close so the file is fully written before st.audio reads it
    st.audio(tmp.name, format="audio/mp3")
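
    # A possible in-memory alternative (sketch, not part of the original flow):
    # write the MP3 into an io.BytesIO buffer and pass the bytes directly to
    # st.audio, which avoids leaving temp files behind on each rerun:
    #
    #   import io
    #   buf = io.BytesIO()
    #   tts.write_to_fp(buf)
    #   st.audio(buf.getvalue(), format="audio/mp3")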