Spaces:
Build error
Build error
File size: 7,933 Bytes
72d6257 8a09121 72d6257 8a09121 72d6257 b6c3fcf 72d6257 b6c3fcf 72d6257 b6c3fcf 72d6257 b6c3fcf 72d6257 8a09121 18cc1c5 8a09121 9f217c9 8a09121 9f217c9 8a09121 72d6257 8a09121 72d6257 8a09121 9f217c9 8a09121 72d6257 8a09121 72d6257 8a09121 18cc1c5 8a09121 72d6257 8a09121 72d6257 8a09121 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 |
"""Streamlit demo to visualize auto-annotated Foley segments from movie clips."""
import os
from os.path import join, exists, dirname, abspath, basename
import json
from glob import glob
from tqdm import tqdm
import numpy as np
import pandas as pd
import streamlit as st
from moviepy.video.io.VideoFileClip import VideoFileClip
import warnings
# Suppress every warning category for the lifetime of the process.
warnings.simplefilter("ignore")

# Absolute path of this script, and the directory two levels above it.
curr_filepath = abspath(__file__)
repo_path = dirname(dirname(curr_filepath))
def load_json(path: str) -> dict:
    """Read the JSON file at `path` and return the parsed content."""
    # The file is opened in binary mode; the json module decodes the bytes.
    with open(path, 'rb') as handle:
        return json.loads(handle.read())
def tqdm_iterator(items, desc=None, bar_format=None, **kwargs):
    """Wrap `items` in a tqdm progress bar with a compact default layout.

    Args:
        items: iterable to wrap.
        desc: optional description shown in front of the bar.
        bar_format: tqdm bar format string. When None, a format with a
            10-character wide bar is used.
        **kwargs: forwarded unchanged to `tqdm` (e.g. `total`).

    Returns:
        The tqdm iterator wrapping `items`.
    """
    # Clears tqdm's internal registry of live bars before creating a new one.
    # NOTE(review): presumably so stale bars from earlier runs do not shift
    # the position of the new bar; this touches a private attribute - confirm.
    tqdm._instances.clear()

    # Honour a caller-supplied format; previously the argument was accepted
    # but always overridden by the hard-coded default.
    if bar_format is None:
        bar_format = '{l_bar}{bar:10}{r_bar}{bar:-10b}'

    return tqdm(items, desc=desc, bar_format=bar_format, **kwargs)
def get_data_root_from_hostname(hostname=None):
    """Return the dataset root directory configured for a machine.

    Args:
        hostname: host name to look up. When None (the default), the name
            of the current machine from `socket.gethostname()` is used.

    Returns:
        The data root of the first entry whose key is a prefix of the host
        name, or the string "NA" when no entry matches.
    """
    import socket

    data_root_lib = {
        "diva": "/ssd/pbagad/datasets/",
        "node": "/var/scratch/pbagad/datasets/",
        "fs4": "/var/scratch/pbagad/datasets/",
    }
    if hostname is None:
        hostname = socket.gethostname()

    # Match on key prefix rather than on the first four characters: slicing
    # to a fixed width could only ever match the 3-character key "fs4" when
    # the host name was exactly "fs4".
    for prefix, data_root in data_root_lib.items():
        if hostname.startswith(prefix):
            return data_root
    return "NA"
def load_clips_df(df_path, data_dir, verbose=True, use_local=False):
    """Load the clips CSV and attach video/audio/annotation paths per clip.

    Args:
        df_path: path to a CSV file that has a `videoid` column.
        data_dir: directory containing the `pytube_videos`, `pytube_audio`
            and `annotations` sub-directories.
        verbose: when True, print the column names and the number of clips
            after each stage. When False nothing is printed.
        use_local: when True, drop rows whose video, audio or annotation
            file does not exist on disk.

    Returns:
        DataFrame with the added columns `video_path`, `audio_path` and
        `annot_path`.

    Raises:
        AssertionError: if `df_path` does not exist.
    """
    assert exists(df_path), f"File {df_path} does not exist"
    df = pd.read_csv(df_path)
    if verbose:
        # Debug output is now gated on `verbose` like every other print here.
        print(df.columns)
        print("Number of clips:", len(df))

    # (new column, sub-directory of data_dir, file extension, message label)
    resources = [
        ("video_path", "pytube_videos", "mp4", "videos"),
        ("audio_path", "pytube_audio", "wav", "audio"),
        ("annot_path", "annotations", "json", "annotations"),
    ]
    for column, subdir, ext, label in resources:
        df[column] = df["videoid"].apply(
            lambda x, subdir=subdir, ext=ext: join(data_dir, subdir, f"{x}.{ext}")
        )
        if use_local:
            # filter out clips that are not downloaded; copy so that the next
            # column assignment does not write into a slice of the old frame
            df = df[df[column].apply(exists)].copy()
        if verbose:
            print(f"Number of clips (with {label} available):", len(df))
    return df
def summarize_classification_probs(silence, probs):
    """Format the silence value and the (label, value) pairs as one line.

    The result starts with "Silence: <silence>" followed by one
    "<Label>: <value>" entry per pair in `probs`, joined by " | ".
    """
    parts = [f"Silence: {silence}"]
    for label, prob in probs:
        parts.append(f"{label.capitalize()}: {prob}")
    return " | ".join(parts)
def cut_video_in_segments(video_path, segments):
    """Write each (start, end) segment of a video to its own mp4 file.

    Args:
        video_path: path of the source video.
        segments: sequence of (start, end) pairs passed to `subclip`.

    Returns:
        List of written clip paths, `~/tmp/clip_<i>.mp4`, one per segment
        and in the same order as `segments`.
    """
    tmp_dir = os.path.join(os.path.expanduser("~"), "tmp")
    # The output directory may not exist yet; writing would fail without it.
    os.makedirs(tmp_dir, exist_ok=True)
    clip_paths = [f"{tmp_dir}/clip_{i}.mp4" for i in range(len(segments))]

    video = VideoFileClip(video_path)
    try:
        iterator = tqdm_iterator(
            zip(segments, clip_paths), total=len(segments), desc="Preparing clips",
        )
        for (start, end), clip_path in iterator:
            video.subclip(start, end).write_videofile(
                clip_path, logger=None, verbose=False,
            )
    finally:
        # Release the reader held by the source clip even if a write fails.
        video.close()
    return clip_paths
def process_sample(row):
    """Cut the kept segments of one clip and build a label per segment.

    Args:
        row: mapping with a "video_path" entry and an "annot_filtered" entry
            holding the path of the clip's annotation JSON file.

    Returns:
        Tuple `(clip_paths, labels, segments, durations)` restricted to the
        entries whose flag in the annotation's "keep_status" list is truthy.
    """
    video_path = row["video_path"]
    annot = load_json(row["annot_filtered"])

    # Filter every per-segment list with the keep flags in a single pass
    # (instead of testing each index for membership in a list of indices).
    keep_status = annot["keep_status"]
    keys = ["non_speech_segments", "silence_prob", "audiomae_on_audioset", "duration"]
    kept = {
        k: [x for x, flag in zip(annot[k], keep_status) if flag]
        for k in keys
    }

    segments = kept["non_speech_segments"]
    labels = [
        summarize_classification_probs(
            kept["silence_prob"][i], kept["audiomae_on_audioset"][i]
        ) for i in range(len(segments))
    ]
    clip_paths = cut_video_in_segments(video_path, segments)
    return clip_paths, labels, segments, kept["duration"]
def make_grid(cols,rows):
    """Build a grid of Streamlit column objects.

    Creates `cols` containers and splits each one with `st.columns(rows)`,
    so the result is indexed as `grid[i][j]` with `i < cols` and `j < rows`.
    """
    grid = []
    for _ in range(cols):
        with st.container():
            grid.append(st.columns(rows))
    return grid
if __name__ == "__main__":
    # Streamlit app: shows up to nine Foley segments of one randomly chosen
    # clip as embedded YouTube players, plus a few dataset statistics.
    st.set_page_config(layout="wide")
    st.title("Foley Segments from Condensed Movies Dataset 🎬")
    st.markdown(
        "> **Note**: This demo shows cut out segments from clips in the [Condensed Movies](https://www.robots.ox.ac.uk/~vgg/data/condensed-movies/) dataset. "
        "The segments are adjudged to have Foley sounds based on AudioMAE predictions on AudioSet classes. "
        "The segments with duration not in [2s, 30s] or those with silence or high probability of speech/music sounds are filtered out. "
        "However, segments can still be noisy. Furthermore, even if a clip has "
        "Foley, there can still be background music/score which we have not removed yet."
    )
    st.markdown(
        """> <span style="color:red">Warning</span>: Currently, each clip can be played only once. Replaying starts the clip from beginning of the video.""",
        unsafe_allow_html=True
    )
    st.markdown(
        "**Instructions**: Click the **Reload** button to see segments from a new clip. "
        "Reloading the page is not necessary."
    )

    csv_path = "./clips.csv"
    annot_paths = glob(join(".", "annotations_*", "*_filtered.json"))
    per_video_width = 360
    per_video_height = 240
    print("Total number of clips: {}".format(len(annot_paths)))

    if "data" not in st.session_state:
        # Load the annotations once per session. Video ids and annotations
        # are collected together so that both lists stay index-aligned after
        # clips without any kept segment are dropped.
        video_ids, data = [], []
        num_foley_segments = 0
        for path in annot_paths:
            clip_annot = load_json(path)
            num_kept = sum(clip_annot["keep_status"])
            if num_kept > 0:
                video_ids.append(basename(path).split("_filtered.json")[0])
                data.append(clip_annot)
                num_foley_segments += num_kept

        # get movie titles, aligned with `video_ids`
        df = pd.read_csv(csv_path)
        csv_titles = df["title"].values
        title_by_id = {}
        if "videoid" in df.columns:
            # NOTE(review): assumes the CSV `videoid` column holds the same
            # ids as the annotation file names - confirm.
            title_by_id = dict(zip(df["videoid"].astype(str), csv_titles))
        titles = []
        for pos, vid in enumerate(video_ids):
            if vid in title_by_id:
                titles.append(title_by_id[vid])
            elif pos < len(csv_titles):
                # Fall back to the positional lookup used when no id matches.
                titles.append(csv_titles[pos])
            else:
                titles.append("Unknown")

        # store variables
        st.session_state.titles = titles
        st.session_state.video_ids = video_ids
        st.session_state.data = data
        st.session_state.num_foley_segments = num_foley_segments

    # Clicking the button reruns the script, which draws a new clip below.
    st.button("Reload")

    if not st.session_state.data:
        # Without this guard np.random.randint(0, 0) raises ValueError.
        st.warning("No clips with Foley segments were found.")
        st.stop()

    index = np.random.randint(0, len(st.session_state.data))

    # Gather data. The annotation cached in session state is left untouched:
    # filtering it in place (and deleting "keep_status") would break the next
    # rerun that draws the same clip.
    annot = st.session_state.data[index]
    video_id = st.session_state.video_ids[index]
    keep_status = annot["keep_status"]
    keys = ["non_speech_segments", "silence_prob", "audiomae_on_audioset", "duration"]
    kept = {
        k: [x for x, flag in zip(annot[k], keep_status) if flag]
        for k in keys
    }
    segments, durations = kept["non_speech_segments"], kept["duration"]
    labels = [
        summarize_classification_probs(
            kept["silence_prob"][i], kept["audiomae_on_audioset"][i]
        ) for i in range(len(segments))
    ]

    movie = st.session_state.titles[index]
    st.markdown(f"Showing Foley segments from a clip in movie: **{movie}**")

    # Create a grid of videos
    grid = make_grid(3, 3)

    # Add videos to the grid (at most 9, filled row by row)
    for idx in range(0, min(len(segments), 9)):
        i, j = idx // 3, idx % 3
        start, end = segments[idx]
        duration = durations[idx]
        cell = grid[i][j]
        cell.caption(f"Segment duration: {duration}")
        # The embed URL takes whole seconds for the start/end offsets.
        url = f"https://www.youtube.com/embed/{video_id}?start={int(start)}&end={int(end)}"
        html_code = (
            "\n"
            f'<iframe height="{per_video_height}" width="{per_video_width}" src="{url}" frameborder="0" allowfullscreen></iframe>'
            "\n"
        )
        cell.markdown(html_code, unsafe_allow_html=True)
        cell.caption(f"{labels[idx]}")

    st.markdown("##### Some stats")
    st.write(f"Total number of unique clips: {len(st.session_state.data)}")
    st.write("Total number of foley segments: {}".format(st.session_state.num_foley_segments))
|