Spaces:

CR7CAD
/

Assignment1

Sleeping

App Files Files Community

CR7CAD committed on Mar 8

Commit

ad4186a

verified ·

1 Parent(s): ab8ead3

Update app.py

Browse files

Files changed (1) hide show

app.py +41 -121

app.py CHANGED Viewed

@@ -1,149 +1,69 @@
-# import part
 import streamlit as st
 from transformers import pipeline
 from PIL import Image
-# Set global caching options for Transformers
-from transformers import set_caching_enabled
-set_caching_enabled(True)
-# function part with caching for better performance
-@st.cache_resource
-def load_image_captioning_model():
-    return pipeline("image-to-text", model="sooh-j/blip-image-captioning-base")
-@st.cache_resource
-def load_text_generator():
-    return pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
-@st.cache_resource
-def load_tts_model():
-    return pipeline("text-to-speech", model="HelpingAI/HelpingAI-TTS-v1")
-# img2text - Using the original model with more constraints
 def img2text(image):
-    # Load the model (cached)
-    image_to_text = load_image_captioning_model()
-    # Strongly limit output length for speed
-    text = image_to_text(image, max_new_tokens=15)[0]["generated_text"]
     return text
-# text2story - Much more constrained for speed
 def text2story(text):
-    # Load the model (cached)
-    generator = load_text_generator()
-    # Very brief prompt to minimize work
-    prompt = f"Short story about {text}: Once upon a time, "
-    # Very constrained parameters for maximum speed
     story_result = generator(
         prompt,
-        max_new_tokens=60,  # Much shorter output
         num_return_sequences=1,
         temperature=0.7,
-        top_k=10,  # Lower value = faster
-        top_p=0.9,  # Lower value = faster
         do_sample=True
     )
-    # Extract and clean text
     story_text = story_result[0]['generated_text']
     story_text = story_text.replace(prompt, "Once upon a time, ")
-    # Find a natural ending point
-    last_period = story_text.rfind('.')
-    if last_period > 30:  # Ensure we have at least some content
-        story_text = story_text[:last_period + 1]
     return story_text
-# text2audio - Minimal text for faster processing
 def text2audio(story_text):
-    try:
-        # Load the model (cached)
-        synthesizer = load_tts_model()
-        # Aggressively limit text length to speed up TTS
-        max_chars = 200  # Much shorter than before
-        if len(story_text) > max_chars:
-            last_period = story_text[:max_chars].rfind('.')
-            if last_period > 0:
-                story_text = story_text[:last_period + 1]
-            else:
-                story_text = story_text[:max_chars]
-        # Generate speech
-        speech = synthesizer(story_text)
-        return speech
-    except Exception as e:
-        st.error(f"Error generating audio: {str(e)}")
-        return None
-# Streamlined main UI
-st.set_page_config(page_title="Image to Story", page_icon="📚")
-st.header("Image to Audio Story")
-# Add info about processing time
-st.info("Note: Processing may take some time as the models are loading. Please be patient.")
-# Cache the file uploader state
-if "uploaded_file" not in st.session_state:
-    st.session_state["uploaded_file"] = None
-uploaded_file = st.file_uploader("Select an Image...", key="file_uploader")
-# Process the image if uploaded
 if uploaded_file is not None:
-    st.session_state["uploaded_file"] = uploaded_file
-    # Display the uploaded image
-    st.image(uploaded_file, caption="Uploaded Image", use_column_width=True)
-    # Convert to PIL image
     image = Image.open(uploaded_file)
-    # Optional processing toggle to let user decide
-    if st.button("Generate Story and Audio"):
-        col1, col2 = st.columns(2)
-        # Stage 1: Image to Text with minimal output
-        with col1:
-            with st.spinner('Captioning image...'):
-                caption = img2text(image)
-            st.write(f"**Caption:** {caption}")
-        # Stage 2: Text to Story with minimal length
-        with col2:
-            with st.spinner('Creating story...'):
-                story = text2story(caption)
-            st.write(f"**Story:** {story}")
-        # Stage 3: Audio with minimal text
-        with st.spinner('Generating audio...'):
-            speech_output = text2audio(story)
-        # Display audio immediately
-        if speech_output is not None:
-            try:
-                if 'audio' in speech_output and 'sampling_rate' in speech_output:
-                    st.audio(speech_output['audio'], sample_rate=speech_output['sampling_rate'])
-                elif 'audio_array' in speech_output and 'sampling_rate' in speech_output:
-                    st.audio(speech_output['audio_array'], sample_rate=speech_output['sampling_rate'])
-                elif 'waveform' in speech_output and 'sample_rate' in speech_output:
-                    st.audio(speech_output['waveform'], sample_rate=speech_output['sample_rate'])
-                else:
-                    # Try any array-like data
-                    for key, value in speech_output.items():
-                        if hasattr(value, '__len__') and len(value) > 1000:
-                            sample_rate = speech_output.get('sampling_rate', speech_output.get('sample_rate', 24000))
-                            st.audio(value, sample_rate=sample_rate)
-                            break
-                    else:
-                        st.error("Could not find audio data in the output")
-            except Exception as e:
-                st.error(f"Error playing audio: {str(e)}")
         else:
-            st.error("Audio generation failed")

+# Only the two imports you requested
 import streamlit as st
 from transformers import pipeline
 from PIL import Image
+# Simple image-to-text function
 def img2text(image):
+    image_to_text = pipeline("image-to-text", model="sooh-j/blip-image-captioning-base")
+    text = image_to_text(image)[0]["generated_text"]
     return text
+# Simple text-to-story function
 def text2story(text):
+    generator = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+    prompt = f"Write a short children's story based on this: {text}. Once upon a time, "
     story_result = generator(
         prompt,
+        max_length=150,
         num_return_sequences=1,
         temperature=0.7,
         do_sample=True
     )
     story_text = story_result[0]['generated_text']
     story_text = story_text.replace(prompt, "Once upon a time, ")
     return story_text
+# Simple text-to-audio function
 def text2audio(story_text):
+    synthesizer = pipeline("text-to-speech", model="HelpingAI/HelpingAI-TTS-v1")
+    speech = synthesizer(story_text)
+    return speech
+# Basic Streamlit interface
+st.title("Image to Audio Story")
+uploaded_file = st.file_uploader("Upload an image")
 if uploaded_file is not None:
+    # Display image
+    st.image(uploaded_file, caption="Uploaded Image")
+    # Convert to PIL Image
     image = Image.open(uploaded_file)
+    # Image to Text
+    st.write("Generating caption...")
+    caption = img2text(image)
+    st.write(f"Caption: {caption}")
+    # Text to Story
+    st.write("Creating story...")
+    story = text2story(caption)
+    st.write(f"Story: {story}")
+    # Text to Audio
+    st.write("Generating audio...")
+    speech_output = text2audio(story)
+    # Play audio
+    try:
+        if 'audio' in speech_output and 'sampling_rate' in speech_output:
+            st.audio(speech_output['audio'], sample_rate=speech_output['sampling_rate'])
+        elif 'audio_array' in speech_output and 'sampling_rate' in speech_output:
+            st.audio(speech_output['audio_array'], sample_rate=speech_output['sampling_rate'])
         else:
+            st.write("Audio generated but could not be played.")
+    except Exception as e:
+        st.error(f"Error playing audio: {e}")