"""Demo script: turn an audio clip into text, then render images from that text.

The flow is: build an ``AudioPipeline``, call ``audio2txt`` on a demo WAV file,
initialise the text-to-image pipeline, call ``predict`` with the resulting text
and a negative prompt, and save every returned image as a PNG.
"""

from pathlib import Path

from audio_to_text import AudioPipeline
from text_to_img import init_text2img_pipe, predict

# Inputs consumed by AudioPipeline / audio2txt.
AUDIO_TEXT_PATH = '/home/user/app/dedup_audio_text_80.json'
AUDIO_TEXT_EMBEDDINGS_PATH = '/home/user/app/audio_text_embeddings_cpu.safetensors'
DEMO_AUDIO_PATH = '/home/user/app/demo.wav'

# Directory that receives the generated image_<idx>.png files.
OUTPUT_DIR = Path("/root/autodl-tmp")

# Terms joined with single spaces and passed as the second argument to
# ``predict``.
# NOTE(review): "sketches" appears twice; it is kept as-is so the joined
# prompt string stays identical to the original script.
NEGATIVE_PROMPT = [
    "(watermark:2)",
    "signature",
    "username",
    "(text:2)",
    "website",
    "(worst quality:2)",
    "(low quality:2)",
    "(normal quality:2)",
    "polar lowres",
    "jpeg",
    "((monochrome))",
    "((grayscale))",
    "sketches",
    "Paintings",
    "(blurry:2)",
    "cropped",
    "lowres",
    "error",
    "sketches",
    "(duplicate:1.331)",
    "(morbid:1.21)",
    "(mutilated:1.21)",
    "(tranny:1.331)",
    "(bad proportions:1.331)",
]


def main():
    """Run the audio -> text -> image demo and write the images to disk.

    Raises:
        OSError: if ``OUTPUT_DIR`` cannot be created (for example because of
            insufficient permissions). Errors raised by the project pipelines
            propagate unchanged.
    """
    # Create the output directory before any model work so that a missing or
    # unwritable destination fails immediately rather than after inference.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    pipeline = AudioPipeline(
        audio_text_path=AUDIO_TEXT_PATH,
        audio_text_embeddings_path=AUDIO_TEXT_EMBEDDINGS_PATH,
    )
    text = pipeline.audio2txt(DEMO_AUDIO_PATH)

    text2img_pipeline = init_text2img_pipe()
    images = predict(text, " ".join(NEGATIVE_PROMPT), text2img_pipeline)

    for idx, image in enumerate(images):
        # str() keeps the argument a plain path string, as in the original
        # script. Presumably these are PIL images; confirm against predict().
        image.save(str(OUTPUT_DIR / f"image_{idx}.png"))


if __name__ == "__main__":
    main()