vixtts-demo

Running

App Files Community

Uhhy committed on Sep 12, 2024

Commit

6f5b5a3

verified ·

1 Parent(s): 33f1e50

Update app.py

Browse files

Files changed (1) hide show

app.py +10 -28

app.py CHANGED Viewed

@@ -58,8 +58,8 @@ if torch.cuda.is_available():
 supported_languages = config.languages
 if not "vi" in supported_languages:
     supported_languages.append("vi")
-if not "es-AR" in supported_languages:
-    supported_languages.append("es-AR")
 def normalize_vietnamese_text(text):
     text = (
@@ -76,20 +76,6 @@ def normalize_vietnamese_text(text):
     )
     return text
-def calculate_keep_len(text, lang):
-    if lang in ["ja", "zh-cn"]:
-        return -1
-    word_count = len(text.split())
-    num_punct = text.count(".") + text.count("!") + text.count("?") + text.count(",")
-    if word_count < 5:
-        return 15000 * word_count + 2000 * num_punct
-    elif word_count < 10:
-        return 13000 * word_count + 2000 * num_punct
-    return -1
 def analyze_sentiment(text):
     sia = SentimentIntensityAnalyzer()
     scores = sia.polarity_scores(text)
@@ -99,6 +85,10 @@ def change_pitch(audio_data, sampling_rate, sentiment):
     semitones = sentiment * 2
     return pyrubberband.pitch_shift(audio_data, sampling_rate, semitones)
 @spaces.GPU(duration=0)
 def predict(
     prompt,
@@ -118,12 +108,6 @@ def predict(
         metrics_text = gr.Warning("Por favor, introduce un texto más largo.")
         return (None, metrics_text)
-    if len(prompt) > 250000000:
-        metrics_text = gr.Warning(
-            f"El texto tiene {len(prompt)} caracteres. Es demasiado largo, por favor, mantenlo por debajo de 250000000 caracteres."
-        )
-        return (None, metrics_text)
     try:
         metrics_text = ""
         t_latent = time.time()
@@ -173,13 +157,12 @@ def predict(
         real_time_factor = (time.time() - t0) / out["wav"].shape[-1] * 24000
         metrics_text += f"Factor de tiempo real (RTF): {real_time_factor:.2f}\n"
-        keep_len = calculate_keep_len(prompt, language)
-        out["wav"] = out["wav"][:keep_len]
         audio_data = np.array(out["wav"])
         modified_audio = change_pitch(audio_data, 24000, sentiment)
         torchaudio.save("output.wav", torch.tensor(modified_audio).unsqueeze(0), 24000)
     except RuntimeError as e:
@@ -246,13 +229,12 @@ with gr.Blocks(analytics_enabled=False) as demo:
         with gr.Column():
             input_text_gr = gr.Textbox(
                 label="Texto a convertir a voz",
-                info="Cada frase debe tener al menos 10 palabras. Máximo 250 caracteres (alrededor de 2-3 frases).",
                 value="Hola, soy un modelo de texto a voz.",
             )
             language_gr = gr.Dropdown(
                 label="Idioma",
                 choices=[
-                    "es-AR",
                     "vi",
                     "en",
                     "es",
@@ -273,7 +255,7 @@ with gr.Blocks(analytics_enabled=False) as demo:
                     "hi",
                 ],
                 max_choices=1,
-                value="es-AR",
             )
             normalize_text = gr.Checkbox(
                 label="Normalizar texto en vietnamita",

 supported_languages = config.languages
 if not "vi" in supported_languages:
     supported_languages.append("vi")
+if not "es-ar" in supported_languages:
+    supported_languages.append("es-ar")
 def normalize_vietnamese_text(text):
     text = (
     )
     return text
 def analyze_sentiment(text):
     sia = SentimentIntensityAnalyzer()
     scores = sia.polarity_scores(text)
     semitones = sentiment * 2
     return pyrubberband.pitch_shift(audio_data, sampling_rate, semitones)
+def apply_distortion(audio_data, sentiment):
+    distortion_factor = abs(sentiment) * 0.5
+    return audio_data * (1 + distortion_factor * np.random.randn(len(audio_data)))
 @spaces.GPU(duration=0)
 def predict(
     prompt,
         metrics_text = gr.Warning("Por favor, introduce un texto más largo.")
         return (None, metrics_text)
     try:
         metrics_text = ""
         t_latent = time.time()
         real_time_factor = (time.time() - t0) / out["wav"].shape[-1] * 24000
         metrics_text += f"Factor de tiempo real (RTF): {real_time_factor:.2f}\n"
         audio_data = np.array(out["wav"])
         modified_audio = change_pitch(audio_data, 24000, sentiment)
+        modified_audio = apply_distortion(modified_audio, sentiment)
         torchaudio.save("output.wav", torch.tensor(modified_audio).unsqueeze(0), 24000)
     except RuntimeError as e:
         with gr.Column():
             input_text_gr = gr.Textbox(
                 label="Texto a convertir a voz",
                 value="Hola, soy un modelo de texto a voz.",
             )
             language_gr = gr.Dropdown(
                 label="Idioma",
                 choices=[
+                    "es-ar",
                     "vi",
                     "en",
                     "es",
                     "hi",
                 ],
                 max_choices=1,
+                value="es-ar",
             )
             normalize_text = gr.Checkbox(
                 label="Normalizar texto en vietnamita",