CR7CAD committed on
Commit
3fd88eb
·
verified ·
1 Parent(s): 7df9b81

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -45
app.py CHANGED
@@ -4,7 +4,6 @@ from transformers import pipeline
4
  import os
5
  import numpy as np
6
  import io
7
- import scipy.io.wavfile as wavfile
8
 
9
  # function part
10
  # img2text
@@ -44,65 +43,53 @@ def text2story(text):
44
 
45
  return story_text
46
 
47
- # text2audio - REVISED to use facebook/mms-tts-eng model
48
  def text2audio(story_text):
49
  try:
50
- # Use a smaller and more reliable TTS model
51
  synthesizer = pipeline("text-to-speech", model="facebook/mms-tts-eng")
52
 
53
- # Break the text into smaller chunks if needed (prevent timeout)
54
- max_chunk_size = 200 # characters
55
- chunks = []
 
 
 
 
 
 
56
 
57
- for i in range(0, len(story_text), max_chunk_size):
58
- chunk = story_text[i:i+max_chunk_size]
59
- # Make sure we break at word boundaries
60
- if i+max_chunk_size < len(story_text) and story_text[i+max_chunk_size] != ' ':
61
- # Find the last space in this chunk
62
- last_space = chunk.rfind(' ')
63
- if last_space != -1:
64
- chunk = chunk[:last_space]
65
-
66
- chunks.append(chunk)
67
 
68
- # Process each chunk
69
- audio_arrays = []
70
- sampling_rate = None
71
 
72
- for chunk in chunks:
73
- if not chunk.strip(): # Skip empty chunks
74
- continue
75
-
76
- speech = synthesizer(chunk)
77
- if sampling_rate is None:
78
- sampling_rate = speech["sampling_rate"]
79
-
80
- audio_arrays.append(speech["audio"])
81
 
82
- # Combine all audio chunks
83
- combined_audio = np.concatenate(audio_arrays)
 
84
 
85
- # Create a BytesIO object to store the wave file
86
- wav_buffer = io.BytesIO()
87
- wavfile.write(wav_buffer, sampling_rate, combined_audio)
88
- wav_buffer.seek(0) # Rewind the buffer
 
89
 
90
  return {
91
- "audio": wav_buffer.getvalue(),
92
- "sampling_rate": sampling_rate
93
  }
94
 
95
  except Exception as e:
96
  st.error(f"Error generating audio: {str(e)}")
97
- # Fallback to a pre-recorded audio file if available
98
- try:
99
- with open("fallback_audio.wav", "rb") as f:
100
- return {
101
- "audio": f.read(),
102
- "sampling_rate": 22050 # Common sample rate
103
- }
104
- except:
105
- return None
106
 
107
  # Function to save temporary image file
108
  def save_uploaded_image(uploaded_file):
 
4
  import os
5
  import numpy as np
6
  import io
 
7
 
8
  # function part
9
  # img2text
 
43
 
44
  return story_text
45
 
46
+ # text2audio - REVISED to use a simpler approach without scipy
47
  def text2audio(story_text):
48
  try:
49
+ # Use the facebook/mms-tts-eng model with fewer features
50
  synthesizer = pipeline("text-to-speech", model="facebook/mms-tts-eng")
51
 
52
+ # For simplicity, we'll limit the text length to avoid timeouts
53
+ # If text is too long, truncate it to a reasonable length (500 chars ~ 100 words)
54
+ max_length = 500
55
+ if len(story_text) > max_length:
56
+ last_period = story_text[:max_length].rfind('.')
57
+ if last_period > 0:
58
+ story_text = story_text[:last_period + 1]
59
+ else:
60
+ story_text = story_text[:max_length]
61
 
62
+ # Generate speech
63
+ speech = synthesizer(story_text)
 
 
 
 
 
 
 
 
64
 
65
+ # Save the audio to a file instead of using in-memory processing
66
+ # This avoids needing scipy
67
+ temp_audio_path = "temp_audio.wav"
68
 
69
+ # Convert numpy array to bytes and save
70
+ with open(temp_audio_path, "wb") as f:
71
+ # Assuming the audio is in the right format already
72
+ np.save(f, speech["audio"])
 
 
 
 
 
73
 
74
+ # Read the file back
75
+ with open(temp_audio_path, "rb") as f:
76
+ audio_data = f.read()
77
 
78
+ # Clean up
79
+ try:
80
+ os.remove(temp_audio_path)
81
+ except:
82
+ pass
83
 
84
  return {
85
+ "audio": audio_data,
86
+ "sampling_rate": speech["sampling_rate"]
87
  }
88
 
89
  except Exception as e:
90
  st.error(f"Error generating audio: {str(e)}")
91
+ # No fallback - just return None
92
+ return None
 
 
 
 
 
 
 
93
 
94
  # Function to save temporary image file
95
  def save_uploaded_image(uploaded_file):