Spaces:

djkesu
/

tortoise5c

Running

App Files Files Community

djkesu committed on Sep 27, 2023

Commit

b794742

1 Parent(s): 6eb0448

reverted back to old app.py

Browse files

Files changed (1) hide show

app.py +292 -112

app.py CHANGED Viewed

@@ -1,126 +1,306 @@
 import os
 import shutil
 import streamlit as st
-import torchaudio
-import IPython
-import base64
-from tortoise.api import TextToSpeech
-from tortoise.utils.audio import load_voice, load_voices
-# Initialize TextToSpeech model
-tts = TextToSpeech()
-# Constants
-PRESETS = ["ultra_fast", "fast", "standard", "high_quality", "very_fast"]
-UPLOAD_FOLDER = "./uploads"
-OUTPUT_FOLDER = "./output"
-voice_samples = None
-conditioning_latents = None
-# Create upload and output directories if they don't exist
-os.makedirs(UPLOAD_FOLDER, exist_ok=True)
-os.makedirs(OUTPUT_FOLDER, exist_ok=True)
-# Streamlit UI elements
-st.title("Tortoise Cloning App")
-# Upload .wav files
-st.sidebar.header("Upload Audio Samples")
-uploaded_files = st.sidebar.file_uploader(
-    "Upload Audio Samples for a New Voice",
-    accept_multiple_files=True,
-    type=["wav"],
-)
-# Create a new voice
-voice_name = st.sidebar.text_input("New Voice Name", help="Enter a name for your new voice.")
-if st.sidebar.button("Create Voice") and voice_name.strip() != "":
-    new_voice_name = voice_name.strip().replace(" ", "_")
-    voices_dir = f"./tortoise/voices/{new_voice_name}/"
-    if os.path.exists(voices_dir):
-        shutil.rmtree(voices_dir)
-    os.makedirs(voices_dir)
-    for index, uploaded_file in enumerate(uploaded_files):
-        bytes_data = uploaded_file.read()
-        with open(f"{voices_dir}voice_sample{index}.wav", "wb") as wav_file:
-            wav_file.write(bytes_data)
-    st.sidebar.success(f"Voice '{voice_name}' created successfully!")
-    voice_samples, conditioning_latents = load_voice(voice_name)
-# Input text and settings
-st.header("Text-to-Speech Generation")
-text = st.text_area(
-    "Enter Text",
-    help="Enter the text you want to convert to speech.",
-    value="Joining two modalities results in a surprising increase in generalization! What would happen if we combined them all?",
-)
-preset = st.selectbox("Preset", PRESETS, help="Select a voice preset.")
-voices = [v for v in os.listdir("tortoise/voices") if v != "cond_latent_example"]
-voice = st.selectbox("Voice", voices, help="Select a voice to use for generation.")
-# Generate speech
-if st.button("Generate Speech"):
-    if voice_name.strip() == "":
-        st.warning("Please create a voice first.")
-    else:
-        st.info("Generating speech...")
-        # Load voice samples
-        # voice_samples, conditioning_latents = load_voice(voice)
-        print(voice_samples)
-        # Generate speech with Tortoise
-        gen = tts.tts_with_preset(
-            text,
-            voice_samples=voice_samples,
-            conditioning_latents=conditioning_latents,
-            preset=preset,
         )
-        # Save and display the generated audio
-        output_path = os.path.join(OUTPUT_FOLDER, "generated.wav")
-        torchaudio.save(output_path, gen.squeeze(0).cpu(), 24000)
-        print(output_path)
-        # Log the path of the generated audio
-        st.write(f"Generated audio saved at: {output_path}")
-        # Display the generated audio
-        st.subheader("Generated Output")
-        # Create a download link for the generated audio
-        audio_base64 = base64.b64encode(open(output_path, 'rb').read()).decode('utf-8')
-        href = f'<a href="data:audio/wav;base64,{audio_base64}" download="generated.wav">Download Audio</a>'
-        st.markdown(href, unsafe_allow_html=True)
-        # Display the generated audio
-        st.audio(output_path, format="audio/wav", start_time=0)
-        # Autoplay the audio
-        st.audio(output_path, format="audio/wav", start_time=0, autoplay=True)
-        st.success("Speech generated successfully!")
-# Clean up uploaded files and output directory
-if st.sidebar.button("Clean Up"):
-    shutil.rmtree(UPLOAD_FOLDER)
-    os.makedirs(UPLOAD_FOLDER, exist_ok=True)
-    shutil.rmtree(OUTPUT_FOLDER)
-    os.makedirs(OUTPUT_FOLDER, exist_ok=True)
-    st.sidebar.success("Clean up completed!")
-# Display information
-st.sidebar.header("Information")
-st.sidebar.markdown(
-    "This app allows you to create a new voice by uploading .wav files. You can then generate speech "
-    "using the selected voice and preset. You can play the generated audio and clean up uploaded files and the output directory when needed."
-)

+# AGPL: a notification must be added stating that changes have been made to that file.
 import os
 import shutil
+from pathlib import Path
 import streamlit as st
+from random import randint
+from tortoise.api import MODELS_DIR
+from tortoise.inference import (
+    infer_on_texts,
+    run_and_save_tts,
+    split_and_recombine_text,
+)
+from tortoise.utils.diffusion import SAMPLERS
+from app_utils.filepicker import st_file_selector
+from app_utils.conf import TortoiseConfig
+from app_utils.funcs import (
+    timeit,
+    load_model,
+    list_voices,
+    load_voice_conditionings,
+)
+LATENT_MODES = [
+    "Tortoise original (bad)",
+    "average per 4.27s (broken on small files)",
+    "average per voice file (broken on small files)",
+]
+def main():
+    """Render the Tortoise TTS Streamlit page and run generation on demand.
+
+    The page has three parts:
+
+    * "Create New Voice": saves uploaded ``.wav`` files under
+      ``./tortoise/voices/<name>/`` and reruns the script.
+    * Text / voice / preset selection plus an "Advanced" expander with model,
+      optimisation, text-splitting and debug options.
+    * "Start": generates audio for each selected voice and shows a player and
+      a download button for every file returned by the inference helpers.
+
+    Returns:
+        None. All output is rendered through Streamlit.
+    """
+    conf = TortoiseConfig()
+    with st.expander("Create New Voice", expanded=True):
+        # The widget keys are kept in session_state so they can be replaced
+        # after a voice is created; new keys make Streamlit build fresh,
+        # empty widgets on the next run.
+        if "file_uploader_key" not in st.session_state:
+            st.session_state["file_uploader_key"] = str(randint(1000, 100000000))
+            st.session_state["text_input_key"] = str(randint(1000, 100000000))
+        uploaded_files = st.file_uploader(
+            "Upload Audio Samples for a New Voice",
+            accept_multiple_files=True,
+            type=["wav"],
+            key=st.session_state["file_uploader_key"]
+        )
+        voice_name = st.text_input(
+            "New Voice Name",
+            help="Enter a name for your new voice.",
+            value="",
+            key=st.session_state["text_input_key"]
+        )
+        new_voice_name = voice_name.strip().replace(" ", "_")
+        create_voice_button = st.button(
+            "Create Voice",
+            disabled=(not new_voice_name) or (not uploaded_files),
+        )
+        if create_voice_button:
+            # The name is typed by the user and becomes a directory that is
+            # deleted and recreated below, so it must be a single path
+            # component; anything else could escape tortoise/voices.
+            is_plain_name = (
+                Path(new_voice_name).name == new_voice_name
+                and new_voice_name != ".."
+            )
+            if not is_plain_name:
+                st.error("Voice name must not contain path separators.")
+            else:
+                with st.spinner(f"Creating new voice: {voice_name}"):
+                    voices_dir = f'./tortoise/voices/{new_voice_name}/'
+                    # An existing voice with the same name is replaced.
+                    if os.path.exists(voices_dir):
+                        shutil.rmtree(voices_dir)
+                    os.makedirs(voices_dir)
+                    for index, uploaded_file in enumerate(uploaded_files):
+                        bytes_data = uploaded_file.read()
+                        with open(f"{voices_dir}voice_sample{index}.wav", "wb") as wav_file:
+                            wav_file.write(bytes_data)
+                    # New keys clear the uploader and the name field.
+                    st.session_state["text_input_key"] = str(randint(1000, 100000000))
+                    st.session_state["file_uploader_key"] = str(randint(1000, 100000000))
+                    st.experimental_rerun()
+    text = st.text_area(
+        "Text",
+        help="Text to speak.",
+        value="The expressiveness of autoregressive transformers is literally nuts! I absolutely adore them.",
+    )
+    voices = [v for v in os.listdir("tortoise/voices") if v != "cond_latent_example"]
+    voice = st.selectbox(
+        "Voice",
+        voices,
+        help="Selects the voice to use for generation. See options in voices/ directory (and add your own!) "
+        "Use the & character to join two voices together. Use a comma to perform inference on multiple voices.",
+        index=0,
+    )
+    preset = st.selectbox(
+        "Preset",
+        (
+            "single_sample",
+            "ultra_fast",
+            "very_fast",
+            "ultra_fast_old",
+            "fast",
+            "standard",
+            "high_quality",
+        ),
+        help="Which voice preset to use.",
+        index=1,
+    )
+    with st.expander("Advanced"):
+        col1, col2 = st.columns(2)
+        with col1:
+            """#### Model parameters"""
+            candidates = st.number_input(
+                "Candidates",
+                help="How many output candidates to produce per-voice.",
+                value=1,
+            )
+            latent_averaging_mode = st.radio(
+                "Latent averaging mode",
+                LATENT_MODES,
+                help="How voice samples should be averaged together.",
+                index=0,
+            )
+            sampler = st.radio(
+                "Sampler",
+                #SAMPLERS,
+                ["dpm++2m", "p", "ddim"],
+                help="Diffusion sampler. Note that dpm++2m is experimental and typically requires more steps.",
+                index=1,
+            )
+            steps = st.number_input(
+                "Steps",
+                help="Override the steps used for diffusion (default depends on preset)",
+                value=10,
+            )
+            seed = st.number_input(
+                "Seed",
+                help="Random seed which can be used to reproduce results.",
+                value=-1,
+            )
+            # -1 is the "no fixed seed" sentinel in the UI.
+            if seed == -1:
+                seed = None
+            voice_fixer = st.checkbox(
+                "Voice fixer",
+                help="Use `voicefixer` to improve audio quality. This is a post-processing step which can be applied to any output.",
+                value=True,
+            )
+            """#### Directories"""
+            output_path = st.text_input(
+                "Output Path", help="Where to store outputs.", value="results/"
+            )
+        with col2:
+            """#### Optimizations"""
+            high_vram = not st.checkbox(
+                "Low VRAM",
+                help="Re-enable default offloading behaviour of tortoise",
+                value=True,
+            )
+            half = st.checkbox(
+                "Half-Precision",
+                help="Enable autocast to half precision for autoregressive model",
+                value=False,
+            )
+            kv_cache = st.checkbox(
+                "Key-Value Cache",
+                help="Enable kv_cache usage, leading to drastic speedups but worse memory usage",
+                value=True,
+            )
+            cond_free = st.checkbox(
+                "Conditioning Free",
+                help="Force conditioning free diffusion",
+                value=True,
+            )
+            # NOTE(review): this value is collected but never passed to the
+            # model below; only `cond_free` is forwarded.
+            no_cond_free = st.checkbox(
+                "Force Not Conditioning Free",
+                help="Force disable conditioning free diffusion",
+                value=False,
+            )
+            """#### Text Splitting"""
+            min_chars_to_split = st.number_input(
+                "Min Chars to Split",
+                help="Minimum number of characters to split text on",
+                min_value=50,
+                value=200,
+                step=1,
+            )
+            """#### Debug"""
+            produce_debug_state = st.checkbox(
+                "Produce Debug State",
+                help="Whether or not to produce debug_state.pth, which can aid in reproducing problems. Defaults to true.",
+                value=True,
+            )
+    # "." is the value written to the config for both checkpoints.
+    ar_checkpoint = "."
+    diff_checkpoint = "."
+    if st.button("Update Basic Settings"):
+        # EXTRA_VOICES_DIR is no longer sent: this page has no widget that
+        # defines `extra_voices_dir`, so passing it raised NameError.
+        # NOTE(review): assumes conf.update accepts a subset of keys - confirm.
+        conf.update(
+            LOW_VRAM=not high_vram,
+            AR_CHECKPOINT=ar_checkpoint,
+            DIFF_CHECKPOINT=diff_checkpoint,
         )
+    # The model itself is always loaded without explicit checkpoint paths.
+    ar_checkpoint = None
+    diff_checkpoint = None
+    tts = load_model(MODELS_DIR, high_vram, kv_cache, ar_checkpoint, diff_checkpoint)
+    if st.button("Start"):
+        # Explicit check instead of `assert`, which is stripped under -O and
+        # shows a bare traceback (e.g. when no voice exists yet).
+        if not (latent_averaging_mode and preset and voice):
+            st.error("Select a voice, a preset and a latent averaging mode before starting.")
+            return
+        def show_generation(fp, filename: str):
+            """Show an audio player and a download button for one output file.
+
+            Args:
+                fp: Path of the generated file (presumably a str or Path as
+                    returned by the inference helpers - confirm).
+                filename: Name offered to the browser for the download.
+            """
+            st.audio(str(fp), format="audio/wav")
+            # The button must receive the file's bytes; passing the path
+            # string made the download contain the path text, not audio.
+            st.download_button(
+                "Download sample",
+                Path(fp).read_bytes(),
+                file_name=filename,
+                mime="audio/wav",
+            )
+        with st.spinner(
+            f"Generating {candidates} candidates for voice {voice} (seed={seed}). You can see progress in the terminal"
+        ):
+            os.makedirs(output_path, exist_ok=True)
+            # "," separates independent voices; "&" joins voices into one.
+            for selected_voice in voice.split(","):
+                voice_sel = selected_voice.split("&")
+                voice_samples, conditioning_latents = load_voice_conditionings(
+                    voice_sel, []
+                )
+                voice_path = Path(os.path.join(output_path, selected_voice))
+                with timeit(
+                    f"Generating {candidates} candidates for voice {selected_voice} (seed={seed})"
+                ):
+                    # Only forward overrides that actually have a value.
+                    nullable_kwargs = {
+                        name: value
+                        for name, value in (
+                            ("sampler", sampler),
+                            ("diffusion_iterations", steps),
+                            ("cond_free", cond_free),
+                        )
+                        if value is not None
+                    }
+                    def call_tts(text: str):
+                        """Run the model on `text` with the options chosen in the UI."""
+                        return tts.tts_with_preset(
+                            text,
+                            k=candidates,
+                            voice_samples=voice_samples,
+                            conditioning_latents=conditioning_latents,
+                            preset=preset,
+                            use_deterministic_seed=seed,
+                            return_deterministic_state=True,
+                            cvvp_amount=0.0,
+                            half=half,
+                            latent_averaging_mode=LATENT_MODES.index(
+                                latent_averaging_mode
+                            ),
+                            **nullable_kwargs,
+                        )
+                    if len(text) < min_chars_to_split:
+                        # Short text: a single inference call.
+                        filepaths = run_and_save_tts(
+                            call_tts,
+                            text,
+                            voice_path,
+                            return_deterministic_state=True,
+                            return_filepaths=True,
+                            voicefixer=voice_fixer,
+                        )
+                    else:
+                        # Long text: split into chunks and infer on each.
+                        desired_length = int(min_chars_to_split)
+                        texts = split_and_recombine_text(
+                            text, desired_length, desired_length + 100
+                        )
+                        filepaths = infer_on_texts(
+                            call_tts,
+                            texts,
+                            voice_path,
+                            return_deterministic_state=True,
+                            return_filepaths=True,
+                            lines_to_regen=set(range(len(texts))),
+                            voicefixer=voice_fixer,
+                        )
+                    for i, fp in enumerate(filepaths):
+                        show_generation(fp, f"{selected_voice}-text-{i}.wav")
+        if produce_debug_state:
+            """Debug states can be found in the output directory"""
+if __name__ == "__main__":
+    main()