import gradio as gr

# Lookup table of selectable preset images: each name maps to the path of its
# gallery image ("webp") and the path of its reference audio clip ("mp3").
image_data = dict(
    sikokumetan=dict(
        webp="default/sikokumetan.webp",
        mp3="default/sikokumetan.mp3",
    ),
)

def on_image_select(image_name):
    """Return the MP3 path registered for ``image_name``.

    ``image_name`` must be a key of the module-level ``image_data`` table;
    an unknown name raises ``KeyError``.
    """
    entry = image_data[image_name]
    mp3_path = entry["mp3"]
    return mp3_path

def voice_conversion(source_audio, reference_audio, steps, length_adjustment, cfg, use_f0_model, auto_f0, pitch_shift):
    """Placeholder for the voice-conversion routine.

    The arguments are supplied positionally by the UI, in the same order as
    the ``inputs`` component list built in the ``__main__`` section. The
    conversion logic is not implemented yet: the function ignores all of its
    arguments and returns ``None``.
    """
    return None

if __name__ == "__main__":
    # Script entry point: build the voice-conversion UI, add a gallery of
    # preset images that fill in the reference audio, and launch the app.
    description = ("Zero-shot音声変換モデル（学習不要）。ローカルでの利用方法は[GitHubリポジトリ](https://github.com/Plachtaa/seed-vc)をご覧ください。"
                   "参考音声が25秒を超える場合、自動的に25秒にクリップされます。"
                   "また、元音声と参考音声の合計時間が30秒を超える場合、元音声は分割処理されます。")

    # Input components, in the positional order voice_conversion receives them.
    # They are created outside any Blocks context on purpose: that leaves them
    # unrendered so that gr.Interface below can render them in its own layout.
    inputs = [
        gr.Audio(type="filepath", label="元音声"),
        gr.Audio(type="filepath", label="参考音声"),
        gr.Slider(minimum=1, maximum=200, value=10, step=1, label="拡散ステップ数", info="デフォルトは10、50～100が最適な品質"),
        gr.Slider(minimum=0.5, maximum=2.0, step=0.1, value=1.0, label="長さ調整", info="1.0未満で速度を上げ、1.0以上で速度を遅くします"),
        gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.7, label="推論CFG率", info="わずかな影響があります"),
        gr.Checkbox(label="F0条件付きモデルを使用", value=False, info="歌声変換には必須です"),
        gr.Checkbox(label="F0自動調整", value=True, info="F0をおおよそ調整して目標音声に合わせます。F0条件付きモデル使用時にのみ有効です"),
        gr.Slider(label='音程変換', minimum=-24, maximum=24, step=1, value=0, info="半音単位の音程変換。F0条件付きモデル使用時にのみ有効です"),
    ]

    # One row per example, with values in the same order as `inputs`.
    examples = [["examples/source/yae_0.wav", "examples/reference/dingzhen_0.wav", 25, 1.0, 0.7, False, True, 0],
                ["examples/source/jay_0.wav", "examples/reference/azuma_0.wav", 25, 1.0, 0.7, True, True, 0],
                ["examples/source/Wiz Khalifa,Charlie Puth - See You Again [vocals]_[cut_28sec].wav",
                 "examples/reference/teio_0.wav", 100, 1.0, 0.7, True, False, 0],
                ["examples/source/TECHNOPOLIS - 2085 [vocals]_[cut_14sec].wav",
                 "examples/reference/trump_0.wav", 50, 1.0, 0.7, True, False, -12],
                ]

    outputs = [gr.Audio(label="ストリーム出力音声", streaming=True, format='mp3'),
               gr.Audio(label="完全出力音声", streaming=False, format='wav')]

    # The select event reports the clicked image by its position in the
    # gallery, so keep the names and the image paths in the same order.
    gallery_names = list(image_data)
    gallery_images = [image_data[name]["webp"] for name in gallery_names]

    def _on_gallery_select(evt: gr.SelectData):
        """Return the reference MP3 path for the gallery image that was clicked."""
        # The gr.SelectData annotation is what makes Gradio pass the event
        # payload; for a Gallery, evt.index is the position of the image.
        return on_image_select(gallery_names[evt.index])

    # Event listeners can only be registered inside a gr.Blocks context, and
    # the gallery has to live in the same Blocks as the Interface to be shown.
    with gr.Blocks(title="Seed Voice Conversion") as demo:
        gr.Interface(fn=voice_conversion,
                     description=description,
                     inputs=inputs,
                     outputs=outputs,
                     title="Seed Voice Conversion",
                     examples=examples,
                     cache_examples=False,
                     )

        # Preset images; clicking one loads its audio clip as the reference.
        gallery = gr.Gallery(
            value=gallery_images,
            label="選択した画像に基づく参考音声",
            elem_id="image_gallery",
            # Display-only: uploads would add images that have no entry in
            # image_data and break the index -> name mapping above.
            interactive=False,
            columns=2,  # lay the images out in two columns
        )

        # Registered after the Interface so the reference-audio component
        # (inputs[1]) is already part of this Blocks when it is used as output.
        gallery.select(fn=_on_gallery_select, inputs=None, outputs=inputs[1])

    demo.launch()