Update app.py
app.py
CHANGED
@@ -1,4 +1,4 @@
-# import part
+# import part - only using the two requested imports
 import streamlit as st
 from transformers import pipeline
 
@@ -55,7 +55,7 @@ def text2story(text):
 
     return story_text
 
-# text2audio -
+# text2audio - Using HelpingAI-TTS-v1 model
 def text2audio(story_text):
     try:
         # Use the HelpingAI TTS model as requested
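For reference, the synthesizer that text2audio calls is presumably built with a transformers pipeline. A minimal sketch, assuming the model loads under the text-to-speech task; the exact hub id for HelpingAI-TTS-v1 is an assumption and may need an owner prefix:

from transformers import pipeline

# Hypothetical hub id taken from the comment in the diff; adjust to the real repo path.
synthesizer = pipeline("text-to-speech", model="HelpingAI-TTS-v1")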
@@ -71,32 +71,17 @@ def text2audio(story_text):
         story_text = story_text[:max_chars]
 
         # Generate speech
-        st.write("Generating audio...")
         speech = synthesizer(story_text)
+
+        # Get output information
         st.write(f"Speech output keys: {list(speech.keys())}")
 
-        # We'll pass the audio data directly to Streamlit instead of saving to a file
-        # This works because Streamlit's st.audio() can take raw audio data
         return speech
 
     except Exception as e:
         st.error(f"Error generating audio: {str(e)}")
-        import traceback
-        st.error(traceback.format_exc())
         return None
 
-# Function to save temporary image file
-def save_uploaded_image(uploaded_file):
-    if not os.path.exists("temp"):
-        os.makedirs("temp")
-
-    image_path = os.path.join("temp", uploaded_file.name)
-
-    with open(image_path, "wb") as f:
-        f.write(uploaded_file.getvalue())
-
-    return image_path
-
 # main part
 st.set_page_config(page_title="Your Image to Audio Story", page_icon="🦜")
 st.header("Turn Your Image to Audio Story")
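Logging list(speech.keys()) is useful here because text-to-speech pipelines do not share one output schema. A hedged sketch of the common shape, assuming the usual transformers text-to-speech contract of a waveform plus its rate:

import numpy as np

# Hypothetical output mirroring the typical format: key names vary by model,
# which is exactly why text2audio prints the keys before playback.
speech = {"audio": np.zeros(16000, dtype=np.float32), "sampling_rate": 16000}
print(list(speech.keys()))  # -> ['audio', 'sampling_rate']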
@@ -106,12 +91,12 @@ if uploaded_file is not None:
     # Display the uploaded image
     st.image(uploaded_file, caption="Uploaded Image", use_container_width=True)
 
-    #
-
+    # Create a temporary file in memory from the uploaded file
+    image_bytes = uploaded_file.getvalue()
 
     # Stage 1: Image to Text
     st.text('Processing img2text...')
-    caption = img2text(image_path)
+    caption = img2text(image_bytes)  # Pass bytes directly to pipeline
    st.write(caption)
 
     # Stage 2: Text to Story
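Note that passing raw bytes to an image-to-text pipeline is version-dependent; older transformers releases expect a path, URL, or PIL image. A portable fallback sketch (img2text_from_bytes and captioner are hypothetical names):

import io
from PIL import Image

def img2text_from_bytes(image_bytes, captioner):
    # Wrap the uploaded bytes in a PIL image, which every pipeline version accepts.
    # `captioner` is assumed to be pipeline("image-to-text", model=...) as in app.py.
    image = Image.open(io.BytesIO(image_bytes))
    return captioner(image)[0]["generated_text"]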
@@ -135,14 +120,21 @@ if uploaded_file is not None:
             elif 'waveform' in speech_output and 'sample_rate' in speech_output:
                 st.audio(speech_output['waveform'], sample_rate=speech_output['sample_rate'])
             else:
-
+                # Try the first array-like value as audio data
+                for key, value in speech_output.items():
+                    if hasattr(value, '__len__') and len(value) > 1000:
+                        if 'rate' in speech_output:
+                            st.audio(value, sample_rate=speech_output['rate'])
+                        elif 'sample_rate' in speech_output:
+                            st.audio(value, sample_rate=speech_output['sample_rate'])
+                        elif 'sampling_rate' in speech_output:
+                            st.audio(value, sample_rate=speech_output['sampling_rate'])
+                        else:
+                            st.audio(value, sample_rate=24000)  # Default sample rate
+                        break
+                else:
+                    st.error(f"Could not find compatible audio format in: {list(speech_output.keys())}")
         except Exception as e:
             st.error(f"Error playing audio: {str(e)}")
     else:
-        st.error("Audio generation failed. Please try again.")
-
-        # Clean up the temporary files
-        try:
-            os.remove(image_path)
-        except:
-            pass
+        st.error("Audio generation failed. Please try again.")
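The new else branch boils down to: find the first long array-like value, resolve a sample rate from the usual key spellings, and fall back to 24000 Hz. The same idea condensed into a helper; play_speech is a hypothetical name, not part of this commit:

import numpy as np
import streamlit as st

def play_speech(speech_output):
    # Resolve the sample rate from the common key spellings, defaulting to 24 kHz.
    rate = next((speech_output[k] for k in ("rate", "sample_rate", "sampling_rate")
                 if k in speech_output), 24000)
    # Treat the first sufficiently long array-like value as the waveform.
    for value in speech_output.values():
        if hasattr(value, "__len__") and len(value) > 1000:
            st.audio(np.asarray(value), sample_rate=rate)
            return True
    return False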