CR7CAD committed on
Commit
cd79461
·
verified ·
1 Parent(s): 7c5a1e4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -18
app.py CHANGED
@@ -2,11 +2,21 @@ import streamlit as st
2
  from transformers import pipeline
3
  from PIL import Image
4
  import os
 
 
 
5
 
6
  # function part
7
  # img2text
8
  def img2text(image_path):
9
  try:
 
 
 
 
 
 
 
10
  # Load the image-to-text model
11
  image_to_text_model = pipeline("image-to-text", model="naver-clova-ix/donut-base")
12
  # Open the image file
@@ -27,17 +37,21 @@ def text2story(text):
27
  story_text = f"Here's a story based on the text: {text}"
28
  return story_text
29
 
30
- # text2audio
31
  def text2audio(story_text):
32
  try:
33
- # Load the text-to-speech model (using a common TTS pipeline)
34
- # Note: You may need to install additional dependencies depending on the model used
35
- tts_model = pipeline("text-to-speech", model="espnet/kan-bayashi_ljspeech_vits")
 
 
 
 
36
 
37
- # Generate audio from the story text
38
- audio_data = tts_model(story_text)
39
 
40
- return audio_data
41
  except Exception as e:
42
  st.error(f"Error generating audio: {str(e)}")
43
  return None
@@ -53,7 +67,8 @@ uploaded_file = st.file_uploader("Select an Image...", type=['png', 'jpg', 'jpeg
53
  if uploaded_file is not None:
54
  # Save the uploaded file temporarily
55
  bytes_data = uploaded_file.getvalue()
56
- with open(uploaded_file.name, "wb") as file:
 
57
  file.write(bytes_data)
58
 
59
  # Display the uploaded image
@@ -62,7 +77,7 @@ if uploaded_file is not None:
62
 
63
  # Stage 1: Image to Text
64
  with st.spinner('Processing img2text...'):
65
- extracted_text = img2text(uploaded_file.name)
66
  st.subheader("Extracted Text:")
67
  st.write(extracted_text)
68
 
@@ -73,20 +88,27 @@ if uploaded_file is not None:
73
  st.write(story)
74
 
75
  # Stage 3: Story to Audio data
 
76
  with st.spinner('Generating audio data...'):
77
- audio_data = text2audio(story)
78
 
79
- # Remove the temporary file
80
- if os.path.exists(uploaded_file.name):
81
- os.remove(uploaded_file.name)
82
 
83
  # Play button
84
  if st.button("Play Audio"):
85
- if audio_data:
86
- st.audio(audio_data['audio'],
87
- format="audio/wav",
88
- start_time=0,
89
- sample_rate=audio_data['sampling_rate'])
 
 
 
 
 
 
90
  else:
91
  st.warning("Audio generation failed. Playing a placeholder audio.")
92
  try:
 
2
  from transformers import pipeline
3
  from PIL import Image
4
  import os
5
+ import torch
6
+ from gtts import gTTS
7
+ import tempfile
8
 
9
  # function part
10
  # img2text
11
  def img2text(image_path):
12
  try:
13
+ # Check if sentencepiece is installed
14
+ try:
15
+ import sentencepiece
16
+ except ImportError:
17
+ st.error("sentencepiece is not installed. Please install it with: pip install sentencepiece")
18
+ return "Error: sentencepiece not installed"
19
+
20
  # Load the image-to-text model
21
  image_to_text_model = pipeline("image-to-text", model="naver-clova-ix/donut-base")
22
  # Open the image file
 
37
  story_text = f"Here's a story based on the text: {text}"
38
  return story_text
39
 
40
# text2audio using Google Text-to-Speech instead of transformers
def text2audio(story_text):
    """Convert story_text to spoken audio using gTTS.

    Parameters
    ----------
    story_text : str
        The text to synthesize into speech.

    Returns
    -------
    str | None
        Path to a temporary audio file containing the generated speech,
        or None if synthesis failed (the error is surfaced via st.error).

    Note: the file is created with delete=False, so the caller is
    responsible for removing it when done.
    """
    try:
        # Create a temporary file to receive the audio.
        # gTTS always emits MP3 data regardless of the file name, so use
        # an .mp3 suffix (the previous '.wav' suffix mislabeled the content).
        temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3')
        temp_audio_path = temp_audio.name
        temp_audio.close()  # close the handle so gTTS can write to the path

        # Initialize gTTS and generate audio
        tts = gTTS(text=story_text, lang='en', slow=False)

        # Save to the temporary file
        tts.save(temp_audio_path)

        return temp_audio_path
    except Exception as e:
        st.error(f"Error generating audio: {str(e)}")
        return None
 
67
  if uploaded_file is not None:
68
  # Save the uploaded file temporarily
69
  bytes_data = uploaded_file.getvalue()
70
+ image_temp_path = os.path.join(tempfile.gettempdir(), uploaded_file.name)
71
+ with open(image_temp_path, "wb") as file:
72
  file.write(bytes_data)
73
 
74
  # Display the uploaded image
 
77
 
78
  # Stage 1: Image to Text
79
  with st.spinner('Processing img2text...'):
80
+ extracted_text = img2text(image_temp_path)
81
  st.subheader("Extracted Text:")
82
  st.write(extracted_text)
83
 
 
88
  st.write(story)
89
 
90
  # Stage 3: Story to Audio data
91
+ audio_file_path = None
92
  with st.spinner('Generating audio data...'):
93
+ audio_file_path = text2audio(story)
94
 
95
+ # Remove the temporary image file
96
+ if os.path.exists(image_temp_path):
97
+ os.remove(image_temp_path)
98
 
99
  # Play button
100
  if st.button("Play Audio"):
101
+ if audio_file_path and os.path.exists(audio_file_path):
102
+ # Play the generated audio
103
+ with open(audio_file_path, "rb") as audio_file:
104
+ audio_bytes = audio_file.read()
105
+ st.audio(audio_bytes, format="audio/wav")
106
+
107
+ # Clean up the audio file after playing
108
+ try:
109
+ os.remove(audio_file_path)
110
+ except:
111
+ pass
112
  else:
113
  st.warning("Audio generation failed. Playing a placeholder audio.")
114
  try: