CR7CAD committed on
Commit
cd79461
·
verified ·
1 Parent(s): 7c5a1e4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -18
app.py CHANGED
@@ -2,11 +2,21 @@ import streamlit as st
2
  from transformers import pipeline
3
  from PIL import Image
4
  import os
 
 
 
5
 
6
  # function part
7
  # img2text
8
  def img2text(image_path):
9
  try:
 
 
 
 
 
 
 
10
  # Load the image-to-text model
11
  image_to_text_model = pipeline("image-to-text", model="naver-clova-ix/donut-base")
12
  # Open the image file
@@ -27,17 +37,21 @@ def text2story(text):
27
  story_text = f"Here's a story based on the text: {text}"
28
  return story_text
29
 
30
- # text2audio
31
  def text2audio(story_text):
32
  try:
33
- # Load the text-to-speech model (using a common TTS pipeline)
34
- # Note: You may need to install additional dependencies depending on the model used
35
- tts_model = pipeline("text-to-speech", model="espnet/kan-bayashi_ljspeech_vits")
 
 
 
 
36
 
37
- # Generate audio from the story text
38
- audio_data = tts_model(story_text)
39
 
40
- return audio_data
41
  except Exception as e:
42
  st.error(f"Error generating audio: {str(e)}")
43
  return None
@@ -53,7 +67,8 @@ uploaded_file = st.file_uploader("Select an Image...", type=['png', 'jpg', 'jpeg
53
  if uploaded_file is not None:
54
  # Save the uploaded file temporarily
55
  bytes_data = uploaded_file.getvalue()
56
- with open(uploaded_file.name, "wb") as file:
 
57
  file.write(bytes_data)
58
 
59
  # Display the uploaded image
@@ -62,7 +77,7 @@ if uploaded_file is not None:
62
 
63
  # Stage 1: Image to Text
64
  with st.spinner('Processing img2text...'):
65
- extracted_text = img2text(uploaded_file.name)
66
  st.subheader("Extracted Text:")
67
  st.write(extracted_text)
68
 
@@ -73,20 +88,27 @@ if uploaded_file is not None:
73
  st.write(story)
74
 
75
  # Stage 3: Story to Audio data
 
76
  with st.spinner('Generating audio data...'):
77
- audio_data = text2audio(story)
78
 
79
- # Remove the temporary file
80
- if os.path.exists(uploaded_file.name):
81
- os.remove(uploaded_file.name)
82
 
83
  # Play button
84
  if st.button("Play Audio"):
85
- if audio_data:
86
- st.audio(audio_data['audio'],
87
- format="audio/wav",
88
- start_time=0,
89
- sample_rate=audio_data['sampling_rate'])
 
 
 
 
 
 
90
  else:
91
  st.warning("Audio generation failed. Playing a placeholder audio.")
92
  try:
 
2
  from transformers import pipeline
3
  from PIL import Image
4
  import os
5
+ import torch
6
+ from gtts import gTTS
7
+ import tempfile
8
 
9
  # function part
10
  # img2text
11
  def img2text(image_path):
12
  try:
13
+ # Check if sentencepiece is installed
14
+ try:
15
+ import sentencepiece
16
+ except ImportError:
17
+ st.error("sentencepiece is not installed. Please install it with: pip install sentencepiece")
18
+ return "Error: sentencepiece not installed"
19
+
20
  # Load the image-to-text model
21
  image_to_text_model = pipeline("image-to-text", model="naver-clova-ix/donut-base")
22
  # Open the image file
 
37
  story_text = f"Here's a story based on the text: {text}"
38
  return story_text
39
 
40
# text2audio using Google Text-to-Speech instead of transformers
def text2audio(story_text):
    """Convert story_text to spoken audio using gTTS.

    Parameters
    ----------
    story_text : str
        The text to synthesize into speech.

    Returns
    -------
    str | None
        Path to a temporary audio file containing the generated speech,
        or None if synthesis failed (the error is surfaced via st.error).

    Note: the file is created with delete=False, so the caller is
    responsible for removing it when done.
    """
    try:
        # Create a temporary file to receive the audio.
        # gTTS always emits MP3 data regardless of the file name, so use
        # an .mp3 suffix (the previous '.wav' suffix mislabeled the content).
        temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3')
        temp_audio_path = temp_audio.name
        temp_audio.close()  # close the handle so gTTS can write to the path

        # Initialize gTTS and generate audio
        tts = gTTS(text=story_text, lang='en', slow=False)

        # Save to the temporary file
        tts.save(temp_audio_path)

        return temp_audio_path
    except Exception as e:
        st.error(f"Error generating audio: {str(e)}")
        return None
 
67
  if uploaded_file is not None:
68
  # Save the uploaded file temporarily
69
  bytes_data = uploaded_file.getvalue()
70
+ image_temp_path = os.path.join(tempfile.gettempdir(), uploaded_file.name)
71
+ with open(image_temp_path, "wb") as file:
72
  file.write(bytes_data)
73
 
74
  # Display the uploaded image
 
77
 
78
  # Stage 1: Image to Text
79
  with st.spinner('Processing img2text...'):
80
+ extracted_text = img2text(image_temp_path)
81
  st.subheader("Extracted Text:")
82
  st.write(extracted_text)
83
 
 
88
  st.write(story)
89
 
90
  # Stage 3: Story to Audio data
91
+ audio_file_path = None
92
  with st.spinner('Generating audio data...'):
93
+ audio_file_path = text2audio(story)
94
 
95
+ # Remove the temporary image file
96
+ if os.path.exists(image_temp_path):
97
+ os.remove(image_temp_path)
98
 
99
  # Play button
100
  if st.button("Play Audio"):
101
+ if audio_file_path and os.path.exists(audio_file_path):
102
+ # Play the generated audio
103
+ with open(audio_file_path, "rb") as audio_file:
104
+ audio_bytes = audio_file.read()
105
+ st.audio(audio_bytes, format="audio/wav")
106
+
107
+ # Clean up the audio file after playing
108
+ try:
109
+ os.remove(audio_file_path)
110
+ except:
111
+ pass
112
  else:
113
  st.warning("Audio generation failed. Playing a placeholder audio.")
114
  try: