texto_a_sonido

Running

texto_a_sonido / app.py

mrolando

fixed space

6cee433 about 1 year ago

4.43 kB

	from diffusers import AudioLDMPipeline
	import torch
	import gradio as gr
	from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
	#from googletrans import Translator
	import os

	# Run on the GPU in half precision when CUDA is available; otherwise fall
	# back to the CPU in full precision.
	device = "cuda" if torch.cuda.is_available() else "cpu"
	torch_dtype = torch.float16 if device == "cuda" else torch.float32
	print(device)

	# Text-to-audio checkpoint loaded once at start-up and moved to the chosen device.
	repo_id = "cvssp/audioldm-s-full-v2"
	pipe = AudioLDMPipeline.from_pretrained(repo_id, torch_dtype=torch_dtype).to(device)



	import base64

	# Load the logo from disk and keep it as base64 text so it can be inlined
	# into the page header as a data URI.
	with open("Iso_Logotipo_Ceibal.png", "rb") as image_file:
	    _logo_bytes = image_file.read()
	encoded_image = base64.b64encode(_logo_bytes).decode()

	# es_en_translator = pipeline("translation",model = "Helsinki-NLP/opus-mt-es-en")
	# def translate_text(text):
	# text = es_en_translator(text)[0].get("translation_text")
	# return text
	# Checkpoint used by translate_text for the Spanish -> English step.
	CKPT = "facebook/nllb-200-distilled-600M"

	# Tokenizer and seq2seq model both come from the same checkpoint and are
	# loaded once, at import time.
	tokenizer = AutoTokenizer.from_pretrained(CKPT)
	model = AutoModelForSeq2SeqLM.from_pretrained(CKPT)

	# Translation pipeline shared by all calls; built lazily on first use.
	_translation_pipeline = None


	def translate_text(text):
	    """Translate Spanish text to English.

	    The Hugging Face translation pipeline is created the first time it is
	    needed and reused afterwards, instead of being rebuilt on every call.

	    Args:
	        text: Text to translate; the pipeline is configured with
	            ``src_lang="spa_Latn"`` and ``tgt_lang="eng_Latn"``.

	    Returns:
	        The ``translation_text`` of the first pipeline result, or ``text``
	        itself when it is ``None``, empty or whitespace-only.
	    """
	    global _translation_pipeline

	    # Nothing to translate: skip the model instead of feeding it empty input.
	    if not text or not text.strip():
	        return text

	    if _translation_pipeline is None:
	        _translation_pipeline = pipeline(
	            "translation",
	            model=model,
	            tokenizer=tokenizer,
	            src_lang="spa_Latn",
	            tgt_lang="eng_Latn",
	            max_length=400,
	            device=device,
	        )

	    result = _translation_pipeline(text)
	    return result[0]['translation_text']


	def generate_sound(text,negative_prompt):
	    """Generate an audio clip from a text description.

	    The prompt (and the negative prompt, when one is given) is translated
	    with ``translate_text`` before being passed to the audio pipeline.

	    Args:
	        text: Description of the sound to generate.
	        negative_prompt: Description of what the result should avoid. When it
	            is ``None``, empty or whitespace-only, no negative prompt is used.

	    Returns:
	        A ``(rate, waveform)`` tuple: the fixed rate 16000 and the first
	        waveform returned by the pipeline.
	    """
	    steps = 17
	    audio_length = 5
	    rate = 16000  # presumably the model's output sample rate -- TODO confirm

	    print(text)
	    text = translate_text(text)
	    print(text)

	    # A blank negative prompt must not go through the translator: the model
	    # may emit text for empty input, which would then steer the generation.
	    if negative_prompt and negative_prompt.strip():
	        negative_prompt = translate_text(negative_prompt)
	    else:
	        negative_prompt = None

	    waveforms = pipe(
	        text,
	        num_inference_steps=steps,
	        audio_length_in_s=audio_length,
	        negative_prompt=negative_prompt,
	    ).audios
	    return rate, waveforms[0]

	# Gradio UI: a header with the inlined logo, the prompt and its submit button
	# on the left, the generated audio on the right.
	with gr.Blocks(title="Uso de AI para la generación de sonidos a partir de texto.") as demo:
	    # The header text is kept flush with the margin so its content matches the
	    # original string. The logo is a PNG, so the data URI declares image/png.
	    gr.Markdown("""
	<center>
	<h1>
	Uso de AI para la generación de sonidos a partir de texto.
	</h1>
	<img src='data:image/png;base64,{}' width=200px>
	<h3>
	Con este espacio podrás generar sonidos a partir de texto, intentá ser lo más descriptivo/a posible en el texto. Se puede usar directamente o podés cambiar ajustes, que impacto tiene cada uno está detallado en su descripción. Cambiá valores y mirá los resultados!
	</h3>
	<h4>El texto se traduce del español al inglés para alimentar al modelo, también se puede escribir el texto de entrada en inglés.</h4>
	</center>
	""".format(encoded_image))
	    with gr.Row():
	        with gr.Column():
	            gr.Markdown("Primero debes ingresar el texto para generar el sonido:")
	            with gr.Row():
	                with gr.Column(scale=4):
	                    prompt = gr.Textbox(label="Texto base para generar el sonido") #Give prompt some real estate
	                with gr.Column(scale=1, min_width=50):
	                    btn = gr.Button("Generar") #Submit button side by side!
	            with gr.Row():
	                with gr.Accordion("Opciones avanzadas", open=False): #Let's hide the advanced options!
	                    negative_prompt = gr.Textbox(label="Texto negativo para la generación", info='Al ingresar texto en este campo el modelo intentará alejarse lo mas posible del mismo, este puede ser "baja calidad"')
	                    # NOTE: duration / inference-step sliders are disabled; generate_sound
	                    # uses fixed values for both.
	                    # with gr.Row():
	                    # with gr.Column():
	                    # audio_len = gr.Slider(label="Duración del sonido", minimum=1, maximum=30, value=5, step = 1,
	                    # info="Cuánto mayor sonido, mayor será el tiempo de procesamiento.")
	                    # steps = gr.Slider(label="Paos de Inferencia", minimum=1, maximum=100, value=15,step =1 ,
	                    # info="Al aumentar los pasos de inferencia se puede acercar más a la descripción del texto pero con un mayor tiempo de procesamiento.")
	            with gr.Row():
	                examples = gr.Examples(inputs=[prompt,negative_prompt],examples=[["Un martillo golpeando madera","low quality"]])

	        with gr.Column():
	            output = gr.Audio(label="Resultado") #Move the output up too

	    # Wire the button to the generator; it must be registered inside the Blocks context.
	    btn.click(fn=generate_sound, inputs=[prompt,negative_prompt], outputs=[output]) #steps,guidance,width,height]

	demo.launch()