import os

from audio_to_text import AudioPipeline
from text_to_img import init_text2img_pipe, predict

# Weighted negative-prompt terms. They are joined with single spaces into one
# string before being passed to predict().
# NOTE(review): "sketches" appears twice; both entries are kept so the joined
# prompt string stays exactly as it was.
NEGATIVE_PROMPT = (
    "(watermark:2)", "signature", "username", "(text:2)", "website",
    "(worst quality:2)", "(low quality:2)", "(normal quality:2)", "polar lowres", "jpeg",
    "((monochrome))", "((grayscale))", "sketches", "Paintings",
    "(blurry:2)", "cropped", "lowres", "error", "sketches",
    "(duplicate:1.331)", "(morbid:1.21)", "(mutilated:1.21)", "(tranny:1.331)",
    "(bad proportions:1.331)",
)

# Inputs handed to AudioPipeline and the audio clip used for the demo run.
AUDIO_TEXT_PATH = '/home/user/app/dedup_audio_text_80.json'
AUDIO_TEXT_EMBEDDINGS_PATH = '/home/user/app/audio_text_embeddings_cpu.safetensors'
DEMO_AUDIO_PATH = '/home/user/app/demo.wav'

# Directory that receives the generated image_<idx>.png files.
OUTPUT_DIR = "/root/autodl-tmp"


def main() -> None:
    """Run the audio -> text -> image demo and save every returned image.

    The demo audio file is passed to ``AudioPipeline.audio2txt``; the value
    it returns is given to ``predict`` together with the space-joined
    negative prompt and the pipeline from ``init_text2img_pipe``. Each item
    yielded by ``predict`` is saved as ``image_<idx>.png`` in ``OUTPUT_DIR``.

    Raises:
        OSError: If ``OUTPUT_DIR`` cannot be created.
    """
    # Create the output directory before any model work so that a missing or
    # unwritable location fails immediately instead of after generation,
    # when the results would be lost at save time.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    pipeline = AudioPipeline(audio_text_path=AUDIO_TEXT_PATH,
                             audio_text_embeddings_path=AUDIO_TEXT_EMBEDDINGS_PATH)
    text = pipeline.audio2txt(DEMO_AUDIO_PATH)

    text2img_pipeline = init_text2img_pipe()
    images = predict(text, " ".join(NEGATIVE_PROMPT), text2img_pipeline)
    for idx, image in enumerate(images):
        # Path is built as a plain string, identical to the original f-string.
        image.save(f"{OUTPUT_DIR}/image_{idx}.png")


if __name__ == "__main__":
    main()