File size: 5,011 Bytes
b71b6bf
 
 
 
 
 
b40da7f
 
 
 
b71b6bf
466df4f
b71b6bf
7b4a4e7
b71b6bf
 
 
 
 
 
 
 
 
 
 
 
 
 
abd5868
 
 
e932cdf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ae4e904
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99578a5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b71b6bf
 
ae4e904
 
 
b71b6bf
 
c14bad7
 
b71b6bf
 
c14bad7
b71b6bf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0e7972e
e932cdf
 
99578a5
 
 
b71b6bf
 
 
 
 
 
4c7d0b3
99578a5
4c7d0b3
b71b6bf
 
 
99578a5
b71b6bf
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import os
import shutil
import subprocess
import threading

import gradio as gr
import huggingface_hub
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import wavfile

# Download the model checkpoint from the Hugging Face Hub into ./ckpt
# (already-downloaded files are skipped, so this is cheap on restart).
huggingface_hub.snapshot_download(
    repo_id='ariesssxu/vta-ldm-clip4clip-v-large',
    local_dir='./ckpt/vta-ldm-clip4clip-v-large'
)

def stream_output(pipe):
    """Forward every line read from *pipe* to stdout until EOF."""
    while True:
        line = pipe.readline()
        if not line:  # readline() returns '' at EOF on a text pipe
            break
        print(line, end='')

def print_directory_contents(path):
    """Pretty-print the directory tree rooted at *path* (4 spaces per level)."""
    for root, _dirs, files in os.walk(path):
        depth = root.replace(path, '').count(os.sep)
        pad = '    ' * depth
        print(f"{pad}{os.path.basename(root)}/")
        child_pad = '    ' * (depth + 1)
        for name in files:
            print(f"{child_pad}{name}")

# Log the checkpoint directory tree so the download can be verified in the logs.
print_directory_contents('./ckpt')

def get_wav_files(path):
    """Recursively collect .wav files under *path*, logging the tree as it goes.

    Args:
        path: root directory to scan.

    Returns:
        Sorted list of paths to files whose name ends in ``.wav`` (any case).
        Sorting makes the result deterministic — os.walk's order is
        OS-dependent, and callers pick ``result[0]``.
    """
    wav_files = []
    for root, dirs, files in os.walk(path):
        level = root.replace(path, '').count(os.sep)
        indent = ' ' * 4 * level
        print(f"{indent}{os.path.basename(root)}/")
        subindent = ' ' * 4 * (level + 1)
        for f in files:
            file_path = os.path.join(root, f)
            if f.lower().endswith('.wav'):
                wav_files.append(file_path)
                print(f"{subindent}{file_path}")
            else:
                print(f"{subindent}{f}")
    return sorted(wav_files)

def check_outputs_folder(folder_path):
    """Empty *folder_path* in place, leaving the folder itself intact.

    Files and symlinks are unlinked; subdirectories are removed recursively.
    Individual failures are logged and skipped (best-effort cleanup).
    Requires the file-level ``import shutil`` — the original code used
    ``shutil`` without importing it, which raised NameError on the first
    subdirectory encountered.
    """
    if os.path.exists(folder_path) and os.path.isdir(folder_path):
        for filename in os.listdir(folder_path):
            file_path = os.path.join(folder_path, filename)
            try:
                if os.path.isfile(file_path) or os.path.islink(file_path):
                    os.unlink(file_path)  # remove file or symlink
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path)  # remove directory tree
            except Exception as e:
                # Best-effort: report and keep cleaning the remaining entries.
                print(f'Failed to delete {file_path}. Reason: {e}')
    else:
        print(f'The folder {folder_path} does not exist.')

def plot_spectrogram(wav_file, output_image):
    """Render a spectrogram of *wav_file* and save it to *output_image*.

    Stereo input is averaged down to mono before plotting.
    """
    sample_rate, samples = wavfile.read(wav_file)

    # Mix a 2-channel signal down to mono; specgram expects a 1-D array.
    if len(samples.shape) == 2:
        samples = samples.mean(axis=1)

    plt.figure(figsize=(10, 4))
    plt.specgram(samples, Fs=sample_rate, NFFT=1024, noverlap=512, cmap='inferno', aspect='auto')
    plt.title('Spectrogram')
    plt.xlabel('Time [s]')
    plt.ylabel('Frequency [Hz]')
    plt.colorbar(label='Intensity [dB]')

    # Write the figure out and release it so repeated calls don't leak figures.
    plt.savefig(output_image)
    plt.close()

def infer(video_in):
    """Run video-to-audio inference on the uploaded video.

    Args:
        video_in: filesystem path of the Gradio temp video file.

    Returns:
        Tuple of (path to generated .wav, path to spectrogram PNG).

    Raises:
        gr.Error: if the inference subprocess produced no .wav output
            (the original code crashed with a bare IndexError here).
    """
    # Start from a clean slate: the inference script writes into ./outputs/tmp.
    check_outputs_folder('./outputs/tmp')

    # The inference script consumes a *directory* of videos, so hand it the
    # parent folder of the Gradio temp file.
    print(f"VIDEO IN PATH: {video_in}")
    folder_path = os.path.dirname(video_in)

    # Execute the inference command as a list (no shell involved).
    command = ['python', 'inference_from_video.py', '--original_args', 'ckpt/vta-ldm-clip4clip-v-large/summary.jsonl', '--model', 'ckpt/vta-ldm-clip4clip-v-large/pytorch_model_2.bin', '--data_path', folder_path]
    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, bufsize=1)

    # Drain stdout and stderr concurrently so a full pipe buffer can't
    # deadlock the child process.
    stdout_thread = threading.Thread(target=stream_output, args=(process.stdout,))
    stderr_thread = threading.Thread(target=stream_output, args=(process.stderr,))
    stdout_thread.start()
    stderr_thread.start()

    # Wait for the process to complete and the reader threads to finish.
    process.wait()
    stdout_thread.join()
    stderr_thread.join()

    print("Inference script finished with return code:", process.returncode)

    # Results are stored under "./outputs/tmp" by default; log the tree.
    print_directory_contents('./outputs/tmp')
    # sorted() makes which file we pick deterministic across runs/OSes.
    wave_files = sorted(get_wav_files('./outputs/tmp'))
    print(wave_files)

    if not wave_files:
        # Surface a clear message in the UI instead of an IndexError traceback.
        raise gr.Error("Inference produced no audio output — check the logs above.")

    plot_spectrogram(wave_files[0], 'spectrogram.png')

    return wave_files[0], 'spectrogram.png'

# Gradio UI: upload a video, run inference on submit, then play the generated
# audio and display its spectrogram.
with gr.Blocks() as demo:
    with gr.Column(elem_id="col-container"):
        gr.Markdown("# Video-To-Audio")
        video_in = gr.Video(label='Video IN')
        submit_btn = gr.Button("Submit")
        output_sound = gr.Audio(label="Audio OUT")
        output_spectrogram = gr.Image(label='Spectrogram')
        #output_sound = gr.Textbox(label="Audio OUT")
    # infer returns (wav_path, spectrogram_path), matching outputs order.
    submit_btn.click(
        fn = infer,
        inputs = [video_in],
        outputs = [output_sound, output_spectrogram],
        show_api = False
    )
demo.launch(show_api=False, show_error=True)