Spaces:

CR7CAD
/

Assignment1

Sleeping

App Files Files Community

CR7CAD committed on Mar 9

Commit

fc13d66

verified ·

1 Parent(s): 5518670

Update app.py

Browse files

Files changed (1) hide show

app.py +126 -53

app.py CHANGED Viewed

@@ -5,14 +5,51 @@ from PIL import Image
 import torch
 import os
 import tempfile
 # For TTS, try multiple options in order of preference
 try:
-    # Try gTTS first
     from gtts import gTTS
-    def text2audio(story_text):
-        # Create a temporary file
         temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3')
         temp_filename = temp_file.name
         temp_file.close()
@@ -29,44 +66,34 @@ try:
         os.unlink(temp_filename)
         return audio_bytes, 'audio/mp3'
-except ImportError:
-    st.warning("gTTS not available. Using alternative text-to-speech method.")
-    # Define alternative TTS using built-in transformers pipeline
-    def text2audio(story_text):
-        # Use a different TTS method
-        from transformers import pipeline
-        # Try a simple TTS model that should work with base transformers
-        synthesizer = pipeline("text-to-speech", model="facebook/mms-tts-eng")
-        # Generate speech
-        speech = synthesizer(story_text)
         # Return the audio data
         if 'audio' in speech:
             return speech['audio'], speech.get('sampling_rate', 16000)
         elif 'audio_array' in speech:
             return speech['audio_array'], speech.get('sampling_rate', 16000)
-        else:
-            # In case of failure, return an error message
-            raise Exception("Failed to generate audio with any available method")
-# Simple image-to-text function
 def img2text(image):
-    image_to_text = pipeline("image-to-text", model="sooh-j/blip-image-captioning-base")
-    text = image_to_text(image)[0]["generated_text"]
-    return text
 # Helper function to count words
 def count_words(text):
     return len(text.split())
 # Improved text-to-story function without "Once upon a time" constraint
 def text2story(text):
-    generator = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
     # Ask for a story without specifying how to start
     prompt = f"""Write a children's story based on this: {text}.
     The story should have a clear beginning, middle, and end.
@@ -74,7 +101,7 @@ def text2story(text):
     """
     # Generate a longer text to ensure we get a complete story
-    story_result = generator(
         prompt,
         max_length=500,
         num_return_sequences=1,
@@ -160,8 +187,38 @@ def text2story(text):
 # Basic Streamlit interface
 st.title("Image to Audio Story")
-uploaded_file = st.file_uploader("Upload an image")
 if uploaded_file is not None:
     # Display image
     st.image(uploaded_file, caption="Uploaded Image")
@@ -169,29 +226,45 @@ if uploaded_file is not None:
     # Convert to PIL Image
     image = Image.open(uploaded_file)
-    # Image to Text
-    with st.spinner("Generating caption..."):
-        caption = img2text(image)
-    st.write(f"Caption: {caption}")
-    # Text to Story
-    with st.spinner("Creating story..."):
-        story = text2story(caption)
-        # Display word count for transparency
-        word_count = len(story.split())
-        st.write(f"Story ({word_count} words):")
-        st.write(story)
-    # Text to Audio
-    with st.spinner("Generating audio..."):
         try:
-            audio_data, audio_format = text2audio(story)
-            # Play audio
-            if isinstance(audio_format, str) and audio_format.startswith('audio/'):
-                st.audio(audio_data, format=audio_format)
-            else:
-                st.audio(audio_data, sample_rate=audio_format)
         except Exception as e:
-            st.error(f"Error generating or playing audio: {e}")
-            st.info("There was an issue with the text-to-speech conversion.")

 import torch
 import os
 import tempfile
+import time
+# Use Streamlit's caching mechanisms to optimize model loading
+@st.cache_resource
+def load_image_to_text_pipeline():
+    """Load and cache the image-to-text model"""
+    return pipeline("image-to-text", model="sooh-j/blip-image-captioning-base")
+@st.cache_resource
+def load_text_generation_pipeline():
+    """Load and cache the text generation model"""
+    return pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+@st.cache_resource
+def load_tts_pipeline():
+    """Load and cache the text-to-speech pipeline as fallback"""
+    try:
+        return pipeline("text-to-speech", model="facebook/mms-tts-eng")
+    except:
+        # Return None if loading fails
+        return None
+# Initialize all models at app startup
+with st.spinner("Loading models (this may take a moment the first time)..."):
+    # Load all models at startup and cache them
+    img2text_model = load_image_to_text_pipeline()
+    story_generator_model = load_text_generation_pipeline()
+    tts_fallback_model = load_tts_pipeline()
 # For TTS, try multiple options in order of preference
 try:
+    # Try importing gTTS
     from gtts import gTTS
+    has_gtts = True
+except ImportError:
+    has_gtts = False
+    if tts_fallback_model is None:
+        st.warning("No text-to-speech capability available. Audio generation will be disabled.")
+# Cache the text-to-audio conversion
+@st.cache_data
+def text2audio(story_text):
+    """Convert text to audio with caching to avoid regenerating the same audio"""
+    if has_gtts:
+        # Use gTTS
         temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3')
         temp_filename = temp_file.name
         temp_file.close()
         os.unlink(temp_filename)
         return audio_bytes, 'audio/mp3'
+    elif tts_fallback_model is not None:
+        # Use transformers TTS
+        speech = tts_fallback_model(story_text)
         # Return the audio data
         if 'audio' in speech:
             return speech['audio'], speech.get('sampling_rate', 16000)
         elif 'audio_array' in speech:
             return speech['audio_array'], speech.get('sampling_rate', 16000)
+    # If we got here, no TTS method worked
+    raise Exception("No text-to-speech capability available")
+# Simple image-to-text function using cached model
+@st.cache_data
 def img2text(image):
+    """Convert image to text with caching"""
+    result = img2text_model(image)
+    return result[0]["generated_text"]
 # Helper function to count words
 def count_words(text):
     """Return the number of whitespace-separated words in text"""
     words = text.split()
     return len(words)
 # Improved text-to-story function without "Once upon a time" constraint
+@st.cache_data
 def text2story(text):
+    """Generate a story from text with caching"""
     # Ask for a story without specifying how to start
     prompt = f"""Write a children's story based on this: {text}.
     The story should have a clear beginning, middle, and end.
     """
     # Generate a longer text to ensure we get a complete story
+    story_result = story_generator_model(
         prompt,
         max_length=500,
         num_return_sequences=1,
 # Basic Streamlit interface
 st.title("Image to Audio Story")
+# Add processing status indicator
+status_container = st.empty()
+# Initialize session state for tracking progress
+if 'progress' not in st.session_state:
+    st.session_state.progress = {
+        'caption_generated': False,
+        'story_generated': False,
+        'audio_generated': False,
+        'caption': '',
+        'story': '',
+        'audio_data': None,
+        'audio_format': None
+    }
+# File uploader
+uploaded_file = st.file_uploader("Upload an image", on_change=lambda: reset_progress())
+# Function to reset progress when a new file is uploaded
+def reset_progress():
+    st.session_state.progress = {
+        'caption_generated': False,
+        'story_generated': False,
+        'audio_generated': False,
+        'caption': '',
+        'story': '',
+        'audio_data': None,
+        'audio_format': None
+    }
+# Process the image if uploaded; each stage stores its result in st.session_state.progress
 if uploaded_file is not None:
     # Show the uploaded image back to the user
     st.image(uploaded_file, caption="Uploaded Image")
     # Open the upload as a PIL Image for the captioning step
     image = Image.open(uploaded_file)
+    # Image to Text: skipped when the caption_generated flag is already set
+    if not st.session_state.progress['caption_generated']:
+        status_container.info("Generating caption...")
+        st.session_state.progress['caption'] = img2text(image)
+        st.session_state.progress['caption_generated'] = True
+    st.write(f"Caption: {st.session_state.progress['caption']}")
+    # Text to Story: skipped when the story_generated flag is already set
+    if not st.session_state.progress['story_generated']:
+        status_container.info("Creating story...")
+        st.session_state.progress['story'] = text2story(st.session_state.progress['caption'])
+        st.session_state.progress['story_generated'] = True
+    # Display word count for transparency
+    word_count = count_words(st.session_state.progress['story'])
+    st.write(f"Story ({word_count} words):")
+    st.write(st.session_state.progress['story'])
+    # Generate audio before the button is pressed, if not already done and a TTS backend exists; text2audio is called inline here
+    if not st.session_state.progress['audio_generated'] and (has_gtts or tts_fallback_model is not None):
+        status_container.info("Pre-generating audio in background...")
         try:
+            st.session_state.progress['audio_data'], st.session_state.progress['audio_format'] = text2audio(st.session_state.progress['story'])
+            st.session_state.progress['audio_generated'] = True  # set only after text2audio returned without raising
+            status_container.success("Ready to play audio!")
         except Exception as e:  # audio_generated stays False, so the button below reports the failure
+            status_container.error(f"Error pre-generating audio: {e}")
+    # Button to play audio; the player is rendered only on the run in which the button is pressed
+    if st.button("Play the audio"):
+        if st.session_state.progress['audio_generated']:
+            # A string starting with 'audio/' is passed as format=; any other value is passed as sample_rate=
+            if isinstance(st.session_state.progress['audio_format'], str) and st.session_state.progress['audio_format'].startswith('audio/'):
+                st.audio(st.session_state.progress['audio_data'], format=st.session_state.progress['audio_format'])
+            else:
+                st.audio(st.session_state.progress['audio_data'], sample_rate=st.session_state.progress['audio_format'])
+        else:
+            # Handle case where audio generation failed or is not available
+            st.error("Unable to play audio. Audio generation was not successful.")
+else:
+    status_container.info("Upload an image to begin")  # shown while no file is selected