app.py
CHANGED
@@ -5,33 +5,12 @@ import asyncio
 import tempfile
 import os
 import re # Import the regular expression module
-import struct
-import wave
-
-# Function to create a temporary silent WAV file
-def create_silent_wav(duration, temp_dir, sample_rate=44100, num_channels=1, sample_width=2):
-    """Creates a temporary WAV file containing silence.
-
-    Args:
-        duration (float): Duration of silence in seconds.
-        temp_dir (str): Directory to save the temporary file.
-        sample_rate (int): Sample rate of the audio (samples per second).
-        num_channels (int): Number of audio channels (1 for mono, 2 for stereo).
-        sample_width (int): Sample width in bytes (e.g., 2 for 16-bit).
-
-    Returns:
-        str: Path to the temporary silent WAV file.
-    """
-    num_frames = int(duration * sample_rate)
-    silent_data = b'\x00' * (num_frames * num_channels * sample_width)
 
-    temp_wav_path = os.path.join(temp_dir, f"silence_{duration}.wav")
-    with wave.open(temp_wav_path, 'wb') as wf:
-        wf.setnchannels(num_channels)
-        wf.setsampwidth(sample_width)
-        wf.setframerate(sample_rate)
-        wf.writeframes(silent_data)
-    return temp_wav_path
+
+# Get all available voices
+async def get_voices():
+    voices = await edge_tts.list_voices()
+    return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
 
 # Text-to-speech function for a single paragraph with SS handling
 async def paragraph_to_speech(text, voice, rate, pitch):
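Note: the new get_voices() inverts the old mapping. Keys are now the human-readable labels shown in the UI, and values are the bare ShortName that edge_tts.Communicate expects. A minimal sketch of the resulting dict (illustrative entries only; the real data comes from `await edge_tts.list_voices()`):

    # Hypothetical entries for illustration only
    voices = {
        "en-US-AndrewMultilingualNeural - en-US (Male)": "en-US-AndrewMultilingualNeural",
        "en-GB-SoniaNeural - en-GB (Female)": "en-GB-SoniaNeural",
    }
    # Later code recovers the ShortName from a selected label:
    short_name = "en-US-AndrewMultilingualNeural - en-US (Male)".split(" - ")[0]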
@@ -48,16 +27,15 @@ async def paragraph_to_speech(text, voice, rate, pitch):
         return None, [] # Return None for audio path and empty list for silence
 
     audio_segments = []
-
+    silence_durations = []
     parts = re.split(r'(SS\d+\.?\d*)', text)
 
     for part in parts:
         if re.match(r'SS\d+\.?\d*', part):
             try:
                 silence_duration = float(part[2:])
-                temp_dir = tempfile.gettempdir()
-                silent_wav_path = create_silent_wav(silence_duration, temp_dir)
-                audio_segments.append(silent_wav_path)
+                silence_durations.append(silence_duration)
+                audio_segments.append(None) # Placeholder for silence
             except ValueError:
                 print(f"Warning: Invalid silence duration format: {part}")
         elif part.strip():
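Note: because the SS pattern is wrapped in a capture group, re.split keeps the markers themselves in the result list, so the loop sees both spoken text and silence tokens. A quick standard-library check:

    import re

    parts = re.split(r'(SS\d+\.?\d*)', "Hello world. SS1.5 Goodbye.")
    # parts == ['Hello world. ', 'SS1.5', ' Goodbye.']
    for part in parts:
        if re.match(r'SS\d+\.?\d*', part):
            print(float(part[2:]))  # 1.5 -> pause length in seconds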
@@ -93,19 +71,21 @@ async def paragraph_to_speech(text, voice, rate, pitch):
                 current_pitch = -30
                 current_rate = -20
             else:
+                # Use the selected voice, or fall back to the default
+                #voice_short_name = (voice or default_voice).split(" - ")[0]
                 current_voice = (voice or default_voice).split(" - ")[0]
             processed_text = part[:]
             rate_str = f"{current_rate:+d}%"
             pitch_str = f"{current_pitch:+d}Hz"
             communicate = edge_tts.Communicate(processed_text, current_voice, rate=rate_str, pitch=pitch_str)
-            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
+            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
                 tmp_path = tmp_file.name
             await communicate.save(tmp_path)
             audio_segments.append(tmp_path)
         else:
             audio_segments.append(None) # Empty string
 
-    return audio_segments, []
+    return audio_segments, silence_durations
 
 # Main text-to-speech function that processes paragraphs and silence
 async def text_to_speech(text, voice, rate, pitch):
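Note: creating the file with delete=False and immediately leaving the `with` block is just a way to reserve a unique path; `communicate.save(tmp_path)` then writes over it. The `:+d` format always emits a sign, which matches the rate/pitch strings edge-tts expects:

    current_rate, current_pitch = -20, -30
    rate_str = f"{current_rate:+d}%"     # "-20%"  (0 would give "+0%")
    pitch_str = f"{current_pitch:+d}Hz"  # "-30Hz"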
@@ -118,9 +98,12 @@ async def text_to_speech(text, voice, rate, pitch):
     final_audio_segments = []
 
     for paragraph in paragraphs:
-        audio_paths, _ = await paragraph_to_speech(paragraph, voice, rate, pitch)
+        audio_paths, silence_times = await paragraph_to_speech(paragraph, voice, rate, pitch)
         if audio_paths:
-            final_audio_segments.extend(audio_paths)
+            for i, path in enumerate(audio_paths):
+                final_audio_segments.append(path)
+                if i < len(silence_times):
+                    final_audio_segments.append(silence_times[i])
 
     if not any(isinstance(item, str) for item in final_audio_segments):
         return None, None # No actual audio generated
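Note: this interleave indexes silence_times by each part's position in audio_paths, but silence_times only has one entry per SS marker, so a pause can land after the wrong part when text and markers alternate. A sketch of a placeholder-aligned merge (assuming the None placeholders appended for SS markers are the intended pause positions; empty parts also append None, so this is still approximate):

    from collections import deque

    def interleave(audio_paths, silence_times):
        pending = deque(silence_times)
        merged = []
        for path in audio_paths:
            if path is None and pending:
                merged.append(pending.popleft())  # float: seconds of silence
            elif path is not None:
                merged.append(path)               # str: temp audio file path
        return merged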
@@ -128,42 +111,20 @@ async def text_to_speech(text, voice, rate, pitch):
     if all(not isinstance(item, str) for item in final_audio_segments):
         return None, "Only silence markers found."
 
-    combined_audio_path = tempfile.mktemp(suffix=".wav")
-    with wave.open(combined_audio_path, 'wb') as outfile:
-        first_audio = True
-        sample_rate = None
-        num_channels = None
-        sample_width = None
-
-        for segment_path in final_audio_segments:
-            if isinstance(segment_path, str):
+    combined_audio_path = tempfile.mktemp(suffix=".mp3")
+    with open(combined_audio_path, 'wb') as outfile:
+        for segment in final_audio_segments:
+            if isinstance(segment, str):
                 try:
-                    with wave.open(segment_path, 'rb') as infile:
-                        current_num_channels = infile.getnchannels()
-                        current_sample_rate = infile.getframerate()
-                        current_sample_width = infile.getsampwidth()
-                        frames = infile.readframes(infile.getnframes())
-
-                    if first_audio:
-                        num_channels = current_num_channels
-                        sample_rate = current_sample_rate
-                        sample_width = current_sample_width
-                        outfile.setnchannels(num_channels)
-                        outfile.setframerate(sample_rate)
-                        outfile.setsampwidth(sample_width)
-                        first_audio = False
-                    elif (current_num_channels != num_channels or
-                          current_sample_rate != sample_rate or
-                          current_sample_width != sample_width):
-                        print(f"Warning: Audio segment {segment_path} has different format. Skipping.")
-                        continue
-
-                    outfile.writeframes(frames)
-                    os.remove(segment_path) # Clean up individual files
-                except wave.Error as e:
-                    print(f"Warning: Error reading WAV file {segment_path}: {e}")
+                    with open(segment, 'rb') as infile:
+                        outfile.write(infile.read())
+                    os.remove(segment) # Clean up individual files
                 except FileNotFoundError:
-                    print(f"Warning: Audio file not found: {segment_path}")
+                    print(f"Warning: Audio file not found: {segment}")
+            elif isinstance(segment, (int, float)):
+                # Basic silence insertion (approximate)
+                silence = b'\x00' * int(segment * 44100 * 2) # Assuming 16-bit mono at 44.1kHz
+                outfile.write(silence)
 
     return combined_audio_path, None
 
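Note: two caveats on the new combiner. tempfile.mktemp is deprecated and race-prone (tempfile.NamedTemporaryFile(delete=False) or mkstemp is the usual replacement), and the pause handling writes raw zero bytes into an MP3 stream: byte-concatenating MP3 files usually plays, but zero bytes are not valid MP3 frames, so decoders skip or resynchronize past them and the pause length is unreliable, hence "approximate" in the comment. A sketch of a decode-based merge, assuming pydub (with its ffmpeg dependency) were added to the Space:

    # Sketch only - pydub is an assumption, not part of this commit.
    from pydub import AudioSegment

    def combine(final_audio_segments, out_path):
        combined = AudioSegment.empty()
        for segment in final_audio_segments:
            if isinstance(segment, str):
                combined += AudioSegment.from_mp3(segment)  # decode each clip
            elif isinstance(segment, (int, float)):
                # true digital silence; pydub durations are in milliseconds
                combined += AudioSegment.silent(duration=int(segment * 1000))
        combined.export(out_path, format="mp3")
        return out_path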
@@ -173,12 +134,9 @@ def tts_interface(text, voice, rate, pitch):
     audio, warning = asyncio.run(text_to_speech(text, voice, rate, pitch))
     return audio, warning
 
-async def get_voices():
-    voices_list = await edge_tts.list_voices()
-    voices_dict = {v["ShortName"]: f"{v['Name']} - {v['LocaleName']} ({v['Gender']})" for v in voices_list}
-    return voices_dict
-
 # Create Gradio application
+import gradio as gr
+
 async def create_demo():
     voices = await get_voices()
     default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)" # 👈 Pick one of the available voices
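Note: create_demo presumably feeds the label keys from the relocated get_voices() into the voice selector; that part of the file sits outside this hunk. A hypothetical wiring, consistent with the new mapping:

    # Hypothetical - the actual Dropdown call is not shown in this diff.
    voice_dropdown = gr.Dropdown(
        choices=list(voices.keys()),  # "ShortName - Locale (Gender)" labels
        value=default_voice,
        label="Voice",
    )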
@@ -201,7 +159,7 @@ async def create_demo():
             gr.Audio(label="Generated Audio", type="filepath"),
             gr.Markdown(label="Warning", visible=False)
         ],
-        title="Voicecloning.be Text-to-Speech with Silence Insertion (Paragraph by Paragraph)
+        title="Voicecloning.be Text-to-Speech with Silence Insertion (Paragraph by Paragraph)",
         description=description,
         article="Process text paragraph by paragraph for smoother output and insert silence markers.",
         analytics_enabled=False,