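# Gradio demo app for DeepAudio-V1: generates a foley/audio track (MMAudio)
# and a dubbed speech track (F5-TTS) for an input video.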
import os
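
# Bootstrap for the hosted Space: a missing torchaudio import signals that the
# bundled F5-TTS package (and its dependencies) has not been installed yet, so
# install it in editable mode; gradio is then upgraded in place.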
try:
    import torchaudio
except ImportError:
    os.system("cd ./F5-TTS; pip install -e .")
os.system("pip install -U gradio")


import spaces  # HF Spaces GPU helper; imported before torch, per ZeroGPU convention

import logging
import shutil
import sys
import tempfile

import gradio as gr
import numpy as np
import requests
import torch
import torchaudio

from huggingface_hub import hf_hub_download

# Download pretrained weights from the DeepAudio-V1 Hub repo. hf_hub_download
# preserves the repo's "MMAudio/" subfolder inside local_dir, so each
# checkpoint is moved up afterwards to the flat layout the code expects.
model_path = "./MMAudio/weights/"

file_path = hf_hub_download(repo_id="lshzhm/DeepAudio-V1", filename="MMAudio/mmaudio_small_44k.pth", local_dir=model_path)
print(f"Model saved at: {file_path}")
shutil.move("./MMAudio/weights/MMAudio/mmaudio_small_44k.pth", "./MMAudio/weights/")

model_path = "./MMAudio/ext_weights/"

file_path = hf_hub_download(repo_id="lshzhm/DeepAudio-V1", filename="MMAudio/v1-44.pth", local_dir=model_path)
print(f"Model saved at: {file_path}")
shutil.move("./MMAudio/ext_weights/MMAudio/v1-44.pth", "./MMAudio/ext_weights/")

file_path = hf_hub_download(repo_id="lshzhm/DeepAudio-V1", filename="MMAudio/synchformer_state_dict.pth", local_dir=model_path)
print(f"Model saved at: {file_path}")
shutil.move("./MMAudio/ext_weights/MMAudio/synchformer_state_dict.pth", "./MMAudio/ext_weights/")

# F5-TTS checkpoint for the video-to-speech stage.
model_path = "./F5-TTS/ckpts/v2c/"
os.makedirs(model_path, exist_ok=True)

file_path = hf_hub_download(repo_id="lshzhm/DeepAudio-V1", filename="v2c_s44.pt", local_dir=model_path)
print(f"Model saved at: {file_path}")


log = logging.getLogger()


# The MMAudio and F5-TTS repos are vendored as subdirectories; put them on
# sys.path so their inference entry points can be imported directly.
sys.path.insert(0, "./MMAudio/")
from demo import v2a_load, v2a_infer

# Load the video-to-audio model once at startup and reuse it across requests.
v2a_loaded = v2a_load()

sys.path.insert(0, "./F5-TTS/src/")
from f5_tts.infer.infer_cli_test import v2s_infer


#@spaces.GPU(duration=120)
def video_to_audio_and_speech(video: gr.Video, prompt: str, v2a_num_steps: int, text: str, audio_prompt: gr.Audio, text_prompt: str, v2s_num_steps: int):
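    """Two-stage dubbing for a single input video.

    Stage 1 (MMAudio) generates an audio track for the video; stage 2
    (F5-TTS) generates speech for `text`, conditioned on the speech prompt
    and its transcription. Returns the v2a-dubbed and v2s-dubbed video paths.
    """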

    video_path = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
    audio_p_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name

    # Derive the save path from the unique temp name ("/" flattened to "__")
    # so per-request outputs don't collide.
    output_dir = os.path.dirname(video_path)
    video_save_path = os.path.join(output_dir, video_path.replace("/", "__").strip(".") + ".mp4")

    print("paths", video, video_path, output_dir, video_save_path)
    if isinstance(audio_prompt, tuple):
        # Only numpy-style prompts carry an array; guard the debug print so a
        # filepath or URL prompt doesn't crash here.
        print("paths", audio_prompt, audio_p_path, audio_prompt[1].shape, audio_prompt[1].max(), audio_prompt[1].min(), type(audio_prompt[1]))

    # Fetch the input video: download if given a URL, otherwise copy the
    # local file into place.
    if video.startswith("http"):
        data = requests.get(video, timeout=60).content
        with open(video_path, "wb") as fw:
            fw.write(data)
    else:
        shutil.copy(video, video_path)

    # Normalize the speech prompt to a wav on disk. Gradio delivers uploaded
    # audio as (sample_rate, int16 ndarray); it is flattened to one channel
    # and rescaled to float32 in [-1, 1]. URLs and paths are handled as above.
    if isinstance(audio_prompt, tuple):
        sr, data = audio_prompt
        wav = torch.from_numpy(data.astype(np.float32).reshape(1, -1) / 32768.0)
        torchaudio.save(audio_p_path, wav, sr)
    elif audio_prompt.startswith("http"):
        data = requests.get(audio_prompt, timeout=60).content
        with open(audio_p_path, "wb") as fw:
            fw.write(data)
    else:
        shutil.copy(audio_prompt, audio_p_path)
    
    # Stage 1: video-to-audio. Equivalent CLI (kept for reference):
    #   cd ./MMAudio && python ./demo.py --variant small_44k --output <output_dir> \
    #       --video <video_path> [--prompt <prompt>] --calc_energy 1 --num_steps <v2a_num_steps>
    v2a_infer(output_dir, video_path, prompt, v2a_num_steps, v2a_loaded)
    
    
    # Path of the speech-dubbed video that the v2s stage writes next to the
    # v2a output (".gen.mp4" suffix).
    video_gen = video_save_path[:-4] + ".mp4.gen.mp4"

    # Stage 2: video-to-speech. The stage-1 audio track (the ".flac" next to
    # video_save_path) is passed along as the v2a waveform. Equivalent CLI:
    #   python ./F5-TTS/src/f5_tts/infer/infer_cli_test.py --output_dir <output_dir> \
    #       --start 0 --end 1 --ckpt_file ./F5-TTS/ckpts/v2c/v2c_s44.pt \
    #       --v2a_path <output_dir> --wav_p <audio_p_path> --txt_p "<text_prompt>" \
    #       --video <video_save_path> --v2a_wav <...>.flac --txt "<text>" --nfe_step <v2s_num_steps>
    v2s_infer(output_dir, output_dir, audio_p_path, text_prompt, video_save_path, video_save_path[:-4] + ".flac", text, v2s_num_steps)
    
    
    return video_save_path, video_gen


video_to_audio_and_speech_tab = gr.Interface(
    fn=video_to_audio_and_speech,
    description="""
    Project page: <a href="https://acappemin.github.io/DeepAudio-V1.github.io">https://acappemin.github.io/DeepAudio-V1.github.io</a><br>
    Code: <a href="https://github.com/acappemin/DeepAudio-V1">https://github.com/acappemin/DeepAudio-V1</a><br>
    """,
    inputs=[
        gr.Video(label="Input Video"),
        gr.Text(label='Video-to-Audio Text Prompt'),
        gr.Number(label='Video-to-Audio Num Steps', value=25, precision=0, minimum=1),
        gr.Text(label='Video-to-Speech Transcription'),
        gr.Audio(label='Video-to-Speech Speech Prompt'),
        gr.Text(label='Video-to-Speech Speech Prompt Transcription'),
        gr.Number(label='Video-to-Speech Num Steps', value=32, precision=0, minimum=1),
    ],
    outputs=[
        gr.Video(label="Video-to-Audio Output"),
        gr.Video(label="Video-to-Speech Output"),
    ],
    cache_examples=False,
    title='Video-to-Audio-and-Speech',
    examples=[
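        # Each row: video, v2a text prompt, v2a steps, target transcription,
        # speech prompt audio, speech prompt transcription, v2s steps.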
        [
            './tests/0235.mp4',
            '',
            25,
            "Who finally decided to show up for work Yay",
            './tests/Gobber-00-0778.wav',
            "I've still got a few knocking around in here",
            32,
        ],
        [
            './tests/0778.mp4',
            '',
            25,
            "I've still got a few knocking around in here",
            './tests/Gobber-00-0235.wav',
            "Who finally decided to show up for work Yay",
            32,
        ],
    ])


if __name__ == "__main__":
    # Single tab; queue(max_size=1) caps the queue at one pending request.
    gr.TabbedInterface([video_to_audio_and_speech_tab], ['Video-to-Audio-and-Speech']).queue(max_size=1).launch()