Spaces:
Running
Running
mrolando
committed on
Commit
•
1ffabd9
1
Parent(s):
2fd4c61
fixed space
Browse files
app.py
CHANGED
@@ -1,12 +1,10 @@
|
|
1 |
from diffusers import AudioLDMPipeline
|
2 |
import torch
|
3 |
import gradio as gr
|
4 |
-
from transformers import pipeline
|
5 |
#from googletrans import Translator
|
6 |
import os
|
7 |
|
8 |
-
|
9 |
-
|
10 |
if torch.cuda.is_available():
|
11 |
device = "cuda"
|
12 |
torch_dtype = torch.float16
|
@@ -17,8 +15,6 @@ print(device)
|
|
17 |
repo_id = "cvssp/audioldm-m-full"
|
18 |
pipe = AudioLDMPipeline.from_pretrained(repo_id, torch_dtype=torch_dtype)
|
19 |
pipe = pipe.to(device)
|
20 |
-
# pipe.unet = torch.compile(pipe.unet)
|
21 |
-
#pipe.unet = torch.compile(pipe.unet)
|
22 |
|
23 |
|
24 |
|
@@ -27,14 +23,32 @@ import base64
|
|
27 |
with open("Iso_Logotipo_Ceibal.png", "rb") as image_file:
|
28 |
encoded_image = base64.b64encode(image_file.read()).decode()
|
29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
|
31 |
def generate_sound(text,steps,audio_length,negative_prompt):
|
32 |
print(text)
|
33 |
-
# text=translate_text(text)
|
34 |
text = translate_text(text)
|
35 |
negative_prompt = translate_text(negative_prompt)
|
36 |
-
#translator = Translator()
|
37 |
-
#text=translator.translate(text, src='es',dest="en").text
|
38 |
print(text)
|
39 |
waveforms = pipe(text,
|
40 |
num_inference_steps=steps,
|
@@ -42,14 +56,6 @@ def generate_sound(text,steps,audio_length,negative_prompt):
|
|
42 |
negative_prompt = negative_prompt).audios
|
43 |
rate =16000
|
44 |
return rate, waveforms[0]
|
45 |
-
#return gr.make_waveform((rate, waveforms[0]))
|
46 |
-
|
47 |
-
es_en_translator = pipeline("translation",model = "Helsinki-NLP/opus-mt-es-en")
|
48 |
-
|
49 |
-
|
50 |
-
def translate_text(text):
|
51 |
-
text = es_en_translator(text)[0].get("translation_text")
|
52 |
-
return text
|
53 |
|
54 |
with gr.Blocks(title="Uso de AI para la generación de sonidos a partir de texto.") as demo:
|
55 |
gr.Markdown("""
|
@@ -79,7 +85,7 @@ with gr.Blocks(title="Uso de AI para la generación de sonidos a partir de texto
|
|
79 |
with gr.Column():
|
80 |
audio_len = gr.Slider(label="Duración del sonido", minimum=1, maximum=30, value=5, step = 1,
|
81 |
info="Cuánto mayor sonido, mayor será el tiempo de procesamiento.")
|
82 |
-
steps = gr.Slider(label="Paos de Inferencia", minimum=1, maximum=100, value=
|
83 |
info="Al aumentar los pasos de inferencia se puede acercar más a la descripción del texto pero con un mayor tiempo de procesamiento.")
|
84 |
with gr.Row():
|
85 |
examples = gr.Examples(inputs=[prompt,negative_prompt],examples=[["Un martillo golpeando madera","low quality"]])
|
@@ -89,5 +95,4 @@ with gr.Blocks(title="Uso de AI para la generación de sonidos a partir de texto
|
|
89 |
|
90 |
btn.click(fn=generate_sound, inputs=[prompt,steps,audio_len,negative_prompt], outputs=[output]) #steps,guidance,width,height]
|
91 |
|
92 |
-
gr.close_all()
|
93 |
demo.launch()
|
|
|
1 |
from diffusers import AudioLDMPipeline
|
2 |
import torch
|
3 |
import gradio as gr
|
4 |
+
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
|
5 |
#from googletrans import Translator
|
6 |
import os
|
7 |
|
|
|
|
|
8 |
if torch.cuda.is_available():
|
9 |
device = "cuda"
|
10 |
torch_dtype = torch.float16
|
|
|
15 |
repo_id = "cvssp/audioldm-m-full"
|
16 |
pipe = AudioLDMPipeline.from_pretrained(repo_id, torch_dtype=torch_dtype)
|
17 |
pipe = pipe.to(device)
|
|
|
|
|
18 |
|
19 |
|
20 |
|
|
|
23 |
with open("Iso_Logotipo_Ceibal.png", "rb") as image_file:
|
24 |
encoded_image = base64.b64encode(image_file.read()).decode()
|
25 |
|
26 |
+
# es_en_translator = pipeline("translation",model = "Helsinki-NLP/opus-mt-es-en")
|
27 |
+
# def translate_text(text):
|
28 |
+
# text = es_en_translator(text)[0].get("translation_text")
|
29 |
+
# return text
|
30 |
+
CKPT = "facebook/nllb-200-distilled-600M"
|
31 |
+
|
32 |
+
model = AutoModelForSeq2SeqLM.from_pretrained(CKPT)
|
33 |
+
tokenizer = AutoTokenizer.from_pretrained(CKPT)
|
34 |
+
|
35 |
+
def translate_text(text):
|
36 |
+
translation_pipeline = pipeline("translation",
|
37 |
+
model=model,
|
38 |
+
tokenizer=tokenizer,
|
39 |
+
src_lang="spa_Latn",
|
40 |
+
tgt_lang="eng_Latn",
|
41 |
+
max_length=400,
|
42 |
+
device=device)
|
43 |
+
|
44 |
+
result = translation_pipeline(text)
|
45 |
+
return result[0]['translation_text']
|
46 |
+
|
47 |
|
48 |
def generate_sound(text,steps,audio_length,negative_prompt):
|
49 |
print(text)
|
|
|
50 |
text = translate_text(text)
|
51 |
negative_prompt = translate_text(negative_prompt)
|
|
|
|
|
52 |
print(text)
|
53 |
waveforms = pipe(text,
|
54 |
num_inference_steps=steps,
|
|
|
56 |
negative_prompt = negative_prompt).audios
|
57 |
rate =16000
|
58 |
return rate, waveforms[0]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
|
60 |
with gr.Blocks(title="Uso de AI para la generación de sonidos a partir de texto.") as demo:
|
61 |
gr.Markdown("""
|
|
|
85 |
with gr.Column():
|
86 |
audio_len = gr.Slider(label="Duración del sonido", minimum=1, maximum=30, value=5, step = 1,
|
87 |
info="Cuánto mayor sonido, mayor será el tiempo de procesamiento.")
|
88 |
+
steps = gr.Slider(label="Paos de Inferencia", minimum=1, maximum=100, value=15,step =1 ,
|
89 |
info="Al aumentar los pasos de inferencia se puede acercar más a la descripción del texto pero con un mayor tiempo de procesamiento.")
|
90 |
with gr.Row():
|
91 |
examples = gr.Examples(inputs=[prompt,negative_prompt],examples=[["Un martillo golpeando madera","low quality"]])
|
|
|
95 |
|
96 |
btn.click(fn=generate_sound, inputs=[prompt,steps,audio_len,negative_prompt], outputs=[output]) #steps,guidance,width,height]
|
97 |
|
|
|
98 |
demo.launch()
|