"""Streamlit demo to visualize auto-annotated Foley segments from movie clips.""" | |
import os | |
from os.path import join, exists, dirname, abspath, basename | |
import json | |
from glob import glob | |
from tqdm import tqdm | |
import numpy as np | |
import pandas as pd | |
import streamlit as st | |
from moviepy.video.io.VideoFileClip import VideoFileClip | |
import warnings | |
warnings.simplefilter(action='ignore') | |
curr_filepath = abspath(__file__) | |
repo_path = dirname(dirname(curr_filepath)) | |
def load_json(path: str) -> dict:
    """Helper to load a JSON file."""
    with open(path, 'r') as f:
        data = json.load(f)
    return data
def tqdm_iterator(items, desc=None, bar_format=None, **kwargs):
    # Clear stale progress bars (uses tqdm's private instance registry)
    tqdm._instances.clear()
    iterator = tqdm(
        items,
        desc=desc,
        bar_format=bar_format or '{l_bar}{bar:10}{r_bar}{bar:-10b}',
        **kwargs,
    )
    return iterator
def get_data_root_from_hostname():
    import socket

    data_root_lib = {
        "diva": "/ssd/pbagad/datasets/",
        "node": "/var/scratch/pbagad/datasets/",
        "fs4": "/var/scratch/pbagad/datasets/",
    }
    # Match on the first 4 characters of the hostname (e.g. cluster node prefixes)
    hostname = socket.gethostname()[:4]
    data_root = data_root_lib.get(hostname, "NA")
    return data_root
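# Example (hypothetical hostnames): on a machine named "node205", the prefix
# "node" resolves to "/var/scratch/pbagad/datasets/"; unknown hosts yield "NA".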
def load_clips_df(df_path, data_dir, verbose=True, use_local=False):
    """Load the clips CSV and attach video/audio/annotation paths."""
    assert exists(df_path), f"File {df_path} does not exist"
    df = pd.read_csv(df_path)
    if verbose:
        print("Number of clips:", len(df))

    # Attach expected file paths; optionally filter out clips not downloaded locally
    df["video_path"] = df["videoid"].apply(lambda x: join(data_dir, "pytube_videos", f"{x}.mp4"))
    if use_local:
        df = df[df["video_path"].apply(exists)]
        if verbose:
            print("Number of clips (with videos available):", len(df))

    df["audio_path"] = df["videoid"].apply(lambda x: join(data_dir, "pytube_audio", f"{x}.wav"))
    if use_local:
        df = df[df["audio_path"].apply(exists)]
        if verbose:
            print("Number of clips (with audio available):", len(df))

    df["annot_path"] = df["videoid"].apply(lambda x: join(data_dir, "annotations", f"{x}.json"))
    if use_local:
        df = df[df["annot_path"].apply(exists)]
        if verbose:
            print("Number of clips (with annotations available):", len(df))

    return df
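# Usage sketch (the data_dir below is illustrative, not part of this repo):
# df = load_clips_df("./clips.csv", "/ssd/pbagad/datasets/CondensedMovies", use_local=True)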
def summarize_classification_probs(silence, probs):
    summary = [f"Silence: {silence}"]
    summary += [f"{label.capitalize()}: {p}" for (label, p) in probs]
    return " | ".join(summary)
def cut_video_in_segments(video_path, segments):
    """Cut `video_path` into one file per (start, end) segment; return the paths."""
    video = VideoFileClip(video_path)
    tmp_dir = os.path.join(os.path.expanduser("~"), "tmp")
    os.makedirs(tmp_dir, exist_ok=True)
    clip_paths = [os.path.join(tmp_dir, f"clip_{i}.mp4") for i in range(len(segments))]
    iterator = tqdm_iterator(
        zip(segments, clip_paths), total=len(segments), desc="Preparing clips",
    )
    # write_videofile returns None, so use a plain loop instead of a comprehension
    for (start, end), path in iterator:
        video.subclip(start, end).write_videofile(path, logger=None, verbose=False)
    video.close()
    return clip_paths
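# Usage sketch (illustrative path and timestamps): cut two 5s windows;
# writes ~/tmp/clip_0.mp4 and ~/tmp/clip_1.mp4 and returns their paths.
# clip_paths = cut_video_in_segments("trailer.mp4", [(0.0, 5.0), (12.5, 17.5)])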
def process_sample(row):
    """Load filtered annotations for one clip and cut out its Foley segments."""
    video_path = row["video_path"]
    audio_path = row["audio_path"]  # unused, kept for reference
    annot = load_json(row["annot_filtered"])

    # Keep only the segments flagged for retention
    seg_indices = [i for i, flag in enumerate(annot["keep_status"]) if flag]
    keys = ["non_speech_segments", "silence_prob", "audiomae_on_audioset", "duration"]
    for k in keys:
        annot[k] = [x for i, x in enumerate(annot[k]) if i in seg_indices]
    del annot["keep_status"]

    labels = [
        summarize_classification_probs(
            annot["silence_prob"][i], annot["audiomae_on_audioset"][i]
        )
        for i in range(len(annot["non_speech_segments"]))
    ]
    clip_paths = cut_video_in_segments(video_path, annot["non_speech_segments"])
    return clip_paths, labels, annot["non_speech_segments"], annot["duration"]
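# Usage sketch (assumes a row from load_clips_df with an added "annot_filtered"
# column pointing at a filtered-annotation JSON):
# clip_paths, labels, segments, durations = process_sample(df.iloc[0])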
def make_grid(n_rows, n_cols):
    """Create an n_rows x n_cols grid of Streamlit columns."""
    grid = [0] * n_rows
    for i in range(n_rows):
        with st.container():
            grid[i] = st.columns(n_cols)
    return grid
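# Usage sketch: a 3x3 grid of Streamlit cells, addressed as grid[row][col];
# grid = make_grid(3, 3)
# grid[0][0].write("top-left cell")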
if __name__ == "__main__": | |
# Streamlit app code | |
st.set_page_config(layout="wide") | |
st.title("Foley Segments from Condensed Movies Dataset π¬") | |
st.markdown( | |
"> **Note**: This demo shows cut out segments from clips in the [Condensed Movies](https://www.robots.ox.ac.uk/~vgg/data/condensed-movies/) dataset. "\ | |
"The segments are adjudged to have Foley sounds based on AudioMAE predictions on AudioSet classes. "\ | |
"The segments are cut into fixed 5s long chunks. Those with silence or high probability of speech/music sounds are filtered out. "\ | |
"However, segments can still be noisy. Furthermore, even if a clip has "\ | |
"Foley, there can still be background music/score which we have not removed yet." | |
) | |
st.markdown( | |
"""> <span style="color:red">Warning</span>: Currently, each clip can be played only once. Replaying starts the clip from beginning of the video.""", | |
unsafe_allow_html=True | |
) | |
st.markdown( | |
"**Instructions**: Click the **Reload** button to see segments from a new clip. "\ | |
"Reloading the page is not necessary." | |
) | |
    csv_path = "./clips.csv"
    ann_dirname = "annotations-v2.0"
    ext = ".json"
    # Collect annotation JSONs from all matching annotation directories
    ann_dirs = glob(join(".", f"{ann_dirname}_*"))
    annot_paths = glob(join(".", f"{ann_dirname}_*", f"*{ext}"))
    per_video_width = 360
    per_video_height = 240
    print("Total number of clips: {}".format(len(annot_paths)))
if "data" not in st.session_state: | |
# Store video ids | |
video_ids = [basename(x).split(ext)[0] for x in annot_paths] | |
# Load annotation data | |
data = [load_json(p) for p in annot_paths] | |
# Filter those examples with no foley segments | |
indices = [i for i, d in enumerate(data) if len(d["segments"]) > 0] | |
data = [d for i, d in enumerate(data) if i in indices] | |
video_ids = [video_ids[i] for i in indices] | |
# Compute the total number of Foley segments | |
num_foley_segments = np.sum([len(d["segments"]) for d in data]) | |
# get movie titles | |
df = pd.read_csv(csv_path) | |
df = df[df["videoid"].isin(video_ids)] | |
# reorder rows to match video_ids | |
df = df.set_index("videoid").loc[video_ids].reset_index() | |
titles = df["title"].values | |
# store variables | |
st.session_state.titles = titles | |
st.session_state.video_ids = video_ids | |
st.session_state.data = data | |
st.session_state.num_foley_segments = num_foley_segments | |
st.markdown("##### Some stats") | |
st.write(f"Total number of unique clips: {len(st.session_state.data)}") | |
st.write("Total number of foley segments: {}".format(st.session_state.num_foley_segments)) | |
st.markdown("---") | |
    # Streamlit reruns the script on every interaction, so pressing the
    # Reload button is enough to draw a fresh random index.
    st.button("Reload")
    index = np.random.randint(0, len(st.session_state.data))
    # Gather data for the selected clip
    annot = st.session_state.data[index]
    video_id = st.session_state.video_ids[index]
    title = st.session_state.titles[index]
    segments = annot["segments"]
    durations = [np.round(y - x, 2) for (x, y) in annot["segments"]]
    labels = annot["labels"]
st.markdown(f"Showing Foley segments from a clip in movie: **{title}**") | |
# Create a grid of videos | |
grid = make_grid(3, 3) | |
# Add videos to the grid | |
for idx in range(0, min(len(segments), 9)): | |
i, j = idx // 3, idx % 3 | |
start, end = segments[idx] | |
duration = durations[idx] | |
grid[i][j].caption(f"Segment duration: {duration}") | |
url = f"https://www.youtube.com/embed/{video_id}?start={int(start)}&end={int(end)}" | |
html_code = f""" | |
<iframe height="{per_video_height}" width="{per_video_width}" src="{url}" frameborder="0" allowfullscreen></iframe> | |
""" | |
grid[i][j].markdown(html_code, unsafe_allow_html=True) | |
grid[i][j].caption(f"{labels[idx]}") | |