Spaces:

CR7CAD
/

Assignment1

Sleeping

App Files Community

CR7CAD committed on Mar 8

Commit

efe4c0f

verified ·

1 Parent(s): 3b82d8f

Update app.py

Browse files

Files changed (1) hide show

app.py +22 -51

app.py CHANGED Viewed

@@ -1,8 +1,7 @@
-# Imports
 import streamlit as st
 from transformers import pipeline
 from PIL import Image
-import torch
 # Simple image-to-text function
 def img2text(image):
@@ -10,7 +9,7 @@ def img2text(image):
     text = image_to_text(image)[0]["generated_text"]
     return text
-# Improved text-to-story function with natural ending
 def text2story(text):
     generator = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
     prompt = f"Write a short children's story based on this: {text}. The story should have a clear beginning, middle, and end. Keep it under 150 words. Once upon a time, "
@@ -53,33 +52,10 @@ def text2story(text):
     # If no good ending is found, return as is
     return story_text
-# Updated text-to-audio function with a compatible model
 def text2audio(story_text):
-    # Use Microsoft's SpeechT5 model which is widely supported
-    synthesizer = pipeline("text-to-speech", model="microsoft/speecht5_tts")
-    # This model requires speaker embeddings
-    from transformers import SpeechT5HifiGan
-    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
-    # Get speaker embeddings for a female voice
-    from transformers import SpeechT5Processor
-    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
-    speaker_embeddings = processor.speaker_embeddings["female"]
-    # Limit text length to avoid issues
-    max_chars = 500
-    if len(story_text) > max_chars:
-        last_period = story_text[:max_chars].rfind('.')
-        if last_period > 0:
-            story_text = story_text[:last_period + 1]
-        else:
-            story_text = story_text[:max_chars]
-    # Generate speech with appropriate parameters
-    inputs = processor(text=story_text, return_tensors="pt")
-    speech = synthesizer(inputs["input_ids"][0], speaker_embeddings=speaker_embeddings, vocoder=vocoder)
     return speech
 # Basic Streamlit interface
@@ -94,31 +70,26 @@ if uploaded_file is not None:
     image = Image.open(uploaded_file)
     # Image to Text
-    with st.spinner("Generating caption..."):
-        caption = img2text(image)
     st.write(f"Caption: {caption}")
     # Text to Story
-    with st.spinner("Creating story..."):
-        story = text2story(caption)
     st.write(f"Story: {story}")
     # Text to Audio
-    with st.spinner("Generating audio..."):
-        try:
-            speech_output = text2audio(story)
-            # Play audio
-            if hasattr(speech_output, 'numpy') or hasattr(speech_output, 'audio'):
-                if hasattr(speech_output, 'numpy'):
-                    audio_data = speech_output.numpy()
-                else:
-                    audio_data = speech_output.audio
-                sample_rate = speech_output.sampling_rate if hasattr(speech_output, 'sampling_rate') else 16000
-                st.audio(audio_data, sample_rate=sample_rate)
-            else:
-                st.audio(speech_output['audio'], sample_rate=speech_output.get('sampling_rate', 16000))
-        except Exception as e:
-            st.error(f"Error generating or playing audio: {e}")
-            st.write("Try installing the latest transformers library with: pip install --upgrade transformers")

+# Imports: Streamlit for the UI, the transformers pipeline factory, and PIL for image loading
 import streamlit as st
 from transformers import pipeline
 from PIL import Image
 # Simple image-to-text function
 def img2text(image):
     text = image_to_text(image)[0]["generated_text"]
     return text
+# Simple text-to-story function
 def text2story(text):
     generator = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
     prompt = f"Write a short children's story based on this: {text}. The story should have a clear beginning, middle, and end. Keep it under 150 words. Once upon a time, "
     # If no good ending is found, return as is
     return story_text
+# Simple text-to-audio function
 def text2audio(story_text):
+    synthesizer = pipeline("text-to-speech", model="HelpingAI/HelpingAI-TTS-v1")
+    speech = synthesizer(story_text)
     return speech
 # Basic Streamlit interface
     image = Image.open(uploaded_file)
     # Image to Text
+    st.write("Generating caption...")
+    caption = img2text(image)
     st.write(f"Caption: {caption}")
     # Text to Story
+    st.write("Creating story...")
+    story = text2story(caption)
     st.write(f"Story: {story}")
     # Text to Audio
+    st.write("Generating audio...")
+    speech_output = text2audio(story)
+    # Play audio
+    try:
+        if 'audio' in speech_output and 'sampling_rate' in speech_output:
+            st.audio(speech_output['audio'], sample_rate=speech_output['sampling_rate'])
+        elif 'audio_array' in speech_output and 'sampling_rate' in speech_output:
+            st.audio(speech_output['audio_array'], sample_rate=speech_output['sampling_rate'])
+        else:
+            st.write("Audio generated but could not be played.")
+    except Exception as e:
+        st.error(f"Error playing audio: {e}")