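# Gradio Space for DeepAudio-V1: given an input video, generate a matching
# soundtrack with MMAudio (video-to-audio) and a dubbed speech track with
# F5-TTS (video-to-speech).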
import os
try:
    import torchaudio
except ImportError:
    # First run on the Space: install F5-TTS (editable) and upgrade Gradio.
    os.system("cd ./F5-TTS; pip install -e .")
    os.system("pip install -U gradio")
import spaces
import logging
from datetime import datetime
from pathlib import Path
import gradio as gr
import torch
import torchaudio
import tempfile
import requests
import shutil
import numpy as np
from huggingface_hub import hf_hub_download
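
# Fetch pretrained checkpoints from the lshzhm/DeepAudio-V1 repo on the
# Hugging Face Hub and move them into the layout the MMAudio and F5-TTS
# pipelines expect.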
if True:
    # MMAudio main checkpoint.
    model_path = "./MMAudio/weights/"
    file_path = hf_hub_download(repo_id="lshzhm/DeepAudio-V1", filename="MMAudio/mmaudio_small_44k.pth", local_dir=model_path)
    print(f"Model saved at: {file_path}")
    shutil.move("./MMAudio/weights/MMAudio/mmaudio_small_44k.pth", "./MMAudio/weights/")

    # MMAudio external weights (v1-44 and Synchformer).
    model_path = "./MMAudio/ext_weights/"
    file_path = hf_hub_download(repo_id="lshzhm/DeepAudio-V1", filename="MMAudio/v1-44.pth", local_dir=model_path)
    print(f"Model saved at: {file_path}")
    shutil.move("./MMAudio/ext_weights/MMAudio/v1-44.pth", "./MMAudio/ext_weights/")

    file_path = hf_hub_download(repo_id="lshzhm/DeepAudio-V1", filename="MMAudio/synchformer_state_dict.pth", local_dir=model_path)
    print(f"Model saved at: {file_path}")
    shutil.move("./MMAudio/ext_weights/MMAudio/synchformer_state_dict.pth", "./MMAudio/ext_weights/")

    # F5-TTS video-to-speech checkpoint.
    model_path = "./F5-TTS/ckpts/v2c/"
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    file_path = hf_hub_download(repo_id="lshzhm/DeepAudio-V1", filename="v2c_s44.pt", local_dir=model_path)
    print(f"Model saved at: {file_path}")
log = logging.getLogger()

import sys

# MMAudio video-to-audio: load the pipeline once at startup.
sys.path.insert(0, "./MMAudio/")
from demo import v2a_load, v2a_infer
v2a_loaded = v2a_load()

# F5-TTS video-to-speech inference entry point.
sys.path.insert(0, "./F5-TTS/src/")
from f5_tts.infer.infer_cli_test import v2s_infer
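
# Full pipeline behind the Gradio UI: stage the inputs as local files, run
# MMAudio video-to-audio, then run F5-TTS video-to-speech using the generated
# soundtrack, and return both output videos.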
#@spaces.GPU(duration=120)
def video_to_audio_and_speech(video: gr.Video, prompt: str, v2a_num_steps: int, text: str, audio_prompt: gr.Audio, text_prompt: str, v2s_num_steps: int):
    # Stage inputs in temporary files so both pipelines work from local paths.
    video_path = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4').name
    audio_p_path = tempfile.NamedTemporaryFile(delete=False, suffix='.wav').name
    output_dir = os.path.dirname(video_path)
    video_save_path = str(output_dir) + "/" + str(video_path).replace("/", "__").strip(".") + ".mp4"
    print("paths", video, video_path, output_dir, video_save_path)

    # Fetch or copy the input video.
    if video.startswith("http"):
        data = requests.get(video, timeout=60).content
        with open(video_path, "wb") as fw:
            fw.write(data)
    else:
        shutil.copy(video, video_path)

    # Normalize the speech prompt to a wav file. A (sample_rate, ndarray) tuple
    # from gr.Audio is assumed to be 16-bit integer PCM, hence the /32768 scaling.
    if isinstance(audio_prompt, tuple):
        sr, data = audio_prompt
        print("paths", audio_prompt, audio_p_path, data.shape, data.max(), data.min(), type(data))
        torchaudio.save(audio_p_path, torch.from_numpy(data.reshape(1, -1) / 32768.0).to(torch.float32), sr)
    elif audio_prompt.startswith("http"):
        data = requests.get(audio_prompt, timeout=60).content
        with open(audio_p_path, "wb") as fw:
            fw.write(data)
    else:
        shutil.copy(audio_prompt, audio_p_path)

    # Video-to-audio (MMAudio); equivalent CLI call:
    #   cd ./MMAudio; python ./demo.py --variant small_44k --output <output_dir> --video <video_path> \
    #       [--prompt <prompt>] --calc_energy 1 --num_steps <v2a_num_steps>
    v2a_infer(output_dir, video_path, prompt, v2a_num_steps, v2a_loaded)
    video_gen = video_save_path[:-4] + ".mp4.gen.mp4"

    # Video-to-speech (F5-TTS); equivalent CLI call:
    #   python ./F5-TTS/src/f5_tts/infer/infer_cli_test.py --output_dir <output_dir> --start 0 --end 1 \
    #       --ckpt_file ./F5-TTS/ckpts/v2c/v2c_s44.pt --v2a_path <output_dir> --wav_p <audio_p_path> \
    #       --txt_p "<text_prompt>" --video <video_save_path> --v2a_wav <video_save_path minus .mp4>.flac \
    #       --txt "<text>" --nfe_step <v2s_num_steps>
    v2s_infer(output_dir, output_dir, audio_p_path, text_prompt, video_save_path, video_save_path[:-4] + ".flac", text, v2s_num_steps)

    return video_save_path, video_gen
video_to_audio_and_speech_tab = gr.Interface(
    fn=video_to_audio_and_speech,
    description="""
    Project page: <a href="https://acappemin.github.io/DeepAudio-V1.github.io">https://acappemin.github.io/DeepAudio-V1.github.io</a><br>
    Code: <a href="https://github.com/acappemin/DeepAudio-V1">https://github.com/acappemin/DeepAudio-V1</a><br>
    """,
    inputs=[
        gr.Video(label="Input Video"),
        gr.Text(label='Video-to-Audio Text Prompt'),
        gr.Number(label='Video-to-Audio Num Steps', value=25, precision=0, minimum=1),
        gr.Text(label='Video-to-Speech Transcription'),
        gr.Audio(label='Video-to-Speech Speech Prompt'),
        gr.Text(label='Video-to-Speech Speech Prompt Transcription'),
        gr.Number(label='Video-to-Speech Num Steps', value=32, precision=0, minimum=1),
    ],
    outputs=[
        gr.Video(label="Video-to-Audio Output"),
        gr.Video(label="Video-to-Speech Output"),
    ],
    cache_examples=False,
    title='Video-to-Audio-and-Speech',
    examples=[
        [
            './tests/0235.mp4',
            '',
            25,
            "Who finally decided to show up for work Yay",
            './tests/Gobber-00-0778.wav',
            "I've still got a few knocking around in here",
            32,
        ],
        [
            './tests/0778.mp4',
            '',
            25,
            "I've still got a few knocking around in here",
            './tests/Gobber-00-0235.wav',
            "Who finally decided to show up for work Yay",
            32,
        ],
    ])
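
# Direct (non-UI) usage sketch, assuming the bundled test assets are present;
# mirrors the first example above:
#   video_to_audio_and_speech(
#       './tests/0235.mp4', '', 25,
#       "Who finally decided to show up for work Yay",
#       './tests/Gobber-00-0778.wav',
#       "I've still got a few knocking around in here", 32)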
if __name__ == "__main__":
    gr.TabbedInterface([video_to_audio_and_speech_tab], ['Video-to-Audio-and-Speech']).queue(max_size=1).launch()