Spaces:
Build error
Build error
New app loaded
Browse files- app.py +49 -30
- app_old.py +226 -0
app.py
CHANGED
@@ -136,7 +136,7 @@ if __name__ == "__main__":
|
|
136 |
st.markdown(
|
137 |
"> **Note**: This demo shows cut out segments from clips in the [Condensed Movies](https://www.robots.ox.ac.uk/~vgg/data/condensed-movies/) dataset. "\
|
138 |
"The segments are adjudged to have Foley sounds based on AudioMAE predictions on AudioSet classes. "\
|
139 |
-
"The segments
|
140 |
"However, segments can still be noisy. Furthermore, even if a clip has "\
|
141 |
"Foley, there can still be background music/score which we have not removed yet."
|
142 |
)
|
@@ -150,34 +150,50 @@ if __name__ == "__main__":
|
|
150 |
)
|
151 |
|
152 |
csv_path = "./clips.csv"
|
153 |
-
|
154 |
-
|
|
|
|
|
155 |
per_video_width = 360
|
156 |
per_video_height = 240
|
157 |
print("Total number of clips: {}".format(len(annot_paths)))
|
158 |
|
159 |
|
160 |
if "data" not in st.session_state:
|
161 |
-
|
162 |
-
|
|
|
163 |
|
164 |
-
#
|
165 |
data = [load_json(p) for p in annot_paths]
|
166 |
-
|
167 |
-
|
168 |
-
|
|
|
|
|
|
|
|
|
|
|
169 |
|
170 |
# get movie titles
|
171 |
df = pd.read_csv(csv_path)
|
|
|
|
|
|
|
172 |
titles = df["title"].values
|
173 |
-
|
174 |
# store variables
|
175 |
st.session_state.titles = titles
|
176 |
st.session_state.video_ids = video_ids
|
177 |
st.session_state.data = data
|
178 |
st.session_state.num_foley_segments = num_foley_segments
|
179 |
|
180 |
-
|
|
|
|
|
|
|
|
|
|
|
181 |
reload_button = st.button("Reload")
|
182 |
index = np.random.randint(0, len(st.session_state.data))
|
183 |
if reload_button:
|
@@ -186,21 +202,27 @@ if __name__ == "__main__":
|
|
186 |
# Gather data
|
187 |
annot = st.session_state.data[index]
|
188 |
video_id = st.session_state.video_ids[index]
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
]
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
204 |
|
205 |
# Create a grid of videos
|
206 |
grid = make_grid(3, 3)
|
@@ -220,7 +242,4 @@ if __name__ == "__main__":
|
|
220 |
grid[i][j].markdown(html_code, unsafe_allow_html=True)
|
221 |
grid[i][j].caption(f"{labels[idx]}")
|
222 |
|
223 |
-
|
224 |
-
st.markdown("##### Some stats")
|
225 |
-
st.write(f"Total number of unique clips: {len(st.session_state.data)}")
|
226 |
-
st.write("Total number of foley segments: {}".format(st.session_state.num_foley_segments))
|
|
|
136 |
st.markdown(
|
137 |
"> **Note**: This demo shows cut out segments from clips in the [Condensed Movies](https://www.robots.ox.ac.uk/~vgg/data/condensed-movies/) dataset. "\
|
138 |
"The segments are adjudged to have Foley sounds based on AudioMAE predictions on AudioSet classes. "\
|
139 |
+
"The segments are cut into fixed 5s long chunks. Those with silence or high probability of speech/music sounds are filtered out. "\
|
140 |
"However, segments can still be noisy. Furthermore, even if a clip has "\
|
141 |
"Foley, there can still be background music/score which we have not removed yet."
|
142 |
)
|
|
|
150 |
)
|
151 |
|
152 |
csv_path = "./clips.csv"
|
153 |
+
ann_dirname = "annotations-v2.0"
|
154 |
+
ext = ".json"
|
155 |
+
ann_dirs = glob(join(".", f"{ann_dirname}_", "*"))
|
156 |
+
annot_paths = glob(join(".", f"{ann_dirname}_*", f"*{ext}"))
|
157 |
per_video_width = 360
|
158 |
per_video_height = 240
|
159 |
print("Total number of clips: {}".format(len(annot_paths)))
|
160 |
|
161 |
|
162 |
if "data" not in st.session_state:
|
163 |
+
|
164 |
+
# Store video ids
|
165 |
+
video_ids = [basename(x).split(ext)[0] for x in annot_paths]
|
166 |
|
167 |
+
# Load annotation data
|
168 |
data = [load_json(p) for p in annot_paths]
|
169 |
+
|
170 |
+
# Filter those examples with no foley segments
|
171 |
+
indices = [i for i, d in enumerate(data) if len(d["segments"]) > 0]
|
172 |
+
data = [d for i, d in enumerate(data) if i in indices]
|
173 |
+
video_ids = [video_ids[i] for i in indices]
|
174 |
+
|
175 |
+
# Compute the total number of Foley segments
|
176 |
+
num_foley_segments = np.sum([len(d["segments"]) for d in data])
|
177 |
|
178 |
# get movie titles
|
179 |
df = pd.read_csv(csv_path)
|
180 |
+
df = df[df["videoid"].isin(video_ids)]
|
181 |
+
# reorder rows to match video_ids
|
182 |
+
df = df.set_index("videoid").loc[video_ids].reset_index()
|
183 |
titles = df["title"].values
|
184 |
+
|
185 |
# store variables
|
186 |
st.session_state.titles = titles
|
187 |
st.session_state.video_ids = video_ids
|
188 |
st.session_state.data = data
|
189 |
st.session_state.num_foley_segments = num_foley_segments
|
190 |
|
191 |
+
|
192 |
+
st.markdown("##### Some stats")
|
193 |
+
st.write(f"Total number of unique clips: {len(st.session_state.data)}")
|
194 |
+
st.write("Total number of foley segments: {}".format(st.session_state.num_foley_segments))
|
195 |
+
st.markdown("---")
|
196 |
+
|
197 |
reload_button = st.button("Reload")
|
198 |
index = np.random.randint(0, len(st.session_state.data))
|
199 |
if reload_button:
|
|
|
202 |
# Gather data
|
203 |
annot = st.session_state.data[index]
|
204 |
video_id = st.session_state.video_ids[index]
|
205 |
+
title = st.session_state.titles[index]
|
206 |
+
|
207 |
+
segments = annot["segments"]
|
208 |
+
durations = [np.round(y - x, 2) for [x, y] in annot["segments"]]
|
209 |
+
labels = annot["labels"]
|
210 |
+
|
211 |
+
# seg_indices = [i for i, flag in enumerate(annot["keep_status"]) if flag]
|
212 |
+
# keys = ["non_speech_segments", "silence_prob", "audiomae_on_audioset", "duration"]
|
213 |
+
# for k in keys:
|
214 |
+
# annot[k] = [x for i, x in enumerate(annot[k]) if i in seg_indices]
|
215 |
+
# del annot["keep_status"]
|
216 |
+
# labels = [
|
217 |
+
# summarize_classification_probs(
|
218 |
+
# annot["silence_prob"][i], annot["audiomae_on_audioset"][i]
|
219 |
+
# ) for i in range(len(annot["non_speech_segments"]))
|
220 |
+
# ]
|
221 |
+
# segments, durations = annot["non_speech_segments"], annot["duration"]
|
222 |
+
# movie = st.session_state.titles[index]
|
223 |
+
|
224 |
+
|
225 |
+
st.markdown(f"Showing Foley segments from a clip in movie: **{title}**")
|
226 |
|
227 |
# Create a grid of videos
|
228 |
grid = make_grid(3, 3)
|
|
|
242 |
grid[i][j].markdown(html_code, unsafe_allow_html=True)
|
243 |
grid[i][j].caption(f"{labels[idx]}")
|
244 |
|
245 |
+
|
|
|
|
|
|
app_old.py
ADDED
@@ -0,0 +1,226 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Streamlit demo to visualize auto-annotated Foley segments from movie clips."""
|
2 |
+
import os
|
3 |
+
from os.path import join, exists, dirname, abspath, basename
|
4 |
+
import json
|
5 |
+
from glob import glob
|
6 |
+
|
7 |
+
from tqdm import tqdm
|
8 |
+
import numpy as np
|
9 |
+
import pandas as pd
|
10 |
+
import streamlit as st
|
11 |
+
from moviepy.video.io.VideoFileClip import VideoFileClip
|
12 |
+
|
13 |
+
import warnings
|
14 |
+
warnings.simplefilter(action='ignore')
|
15 |
+
|
16 |
+
|
17 |
+
curr_filepath = abspath(__file__)
|
18 |
+
repo_path = dirname(dirname(curr_filepath))
|
19 |
+
|
20 |
+
|
21 |
+
def load_json(path: str) -> dict:
    """Load and return the JSON document stored at ``path``.

    The file is opened in binary mode and handed to ``json.load``, which
    decodes the bytes itself.

    Args:
        path: Location of the JSON file.

    Returns:
        The parsed JSON content.
    """
    with open(path, 'rb') as f:
        return json.load(f)
|
26 |
+
|
27 |
+
|
28 |
+
def tqdm_iterator(items, desc=None, bar_format=None, **kwargs):
    """Wrap ``items`` in a compact tqdm progress bar.

    Args:
        items: Iterable to wrap.
        desc: Optional description shown in front of the bar.
        bar_format: Optional tqdm bar format string. When ``None``, the
            compact 10-character-wide format is used.
        **kwargs: Forwarded unchanged to ``tqdm`` (e.g. ``total``).

    Returns:
        The ``tqdm`` object wrapping ``items``.
    """
    if bar_format is None:
        # Previously this format was always used, even when the caller
        # passed their own ``bar_format``.
        bar_format = '{l_bar}{bar:10}{r_bar}{bar:-10b}'

    # Forget bars tqdm is still tracking; presumably so leftovers from
    # earlier runs do not shift the position of this one.
    tqdm._instances.clear()
    return tqdm(items, desc=desc, bar_format=bar_format, **kwargs)
|
38 |
+
|
39 |
+
|
40 |
+
def get_data_root_from_hostname(hostname=None):
    """Return the dataset root directory configured for a host.

    Args:
        hostname: Host name to look up. Defaults to the name reported by
            ``socket.gethostname()``.

    Returns:
        The dataset root of the first known prefix that ``hostname`` starts
        with, or the string ``"NA"`` when no prefix matches.
    """
    import socket

    data_root_lib = {
        "diva": "/ssd/pbagad/datasets/",
        "node": "/var/scratch/pbagad/datasets/",
        "fs4": "/var/scratch/pbagad/datasets/",
    }
    if hostname is None:
        hostname = socket.gethostname()

    # Match on prefix rather than on the first four characters: a fixed
    # four-character slice can never equal the three-character key "fs4"
    # unless the host name is exactly "fs4".
    for prefix, data_root in data_root_lib.items():
        if hostname.startswith(prefix):
            return data_root
    return "NA"
|
53 |
+
|
54 |
+
|
55 |
+
def load_clips_df(df_path, data_dir, verbose=True, use_local=False):
    """Load the clips CSV and attach video, audio and annotation paths.

    Adds the columns ``video_path``, ``audio_path`` and ``annot_path``, each
    built from the ``videoid`` column and ``data_dir``.

    Args:
        df_path: Path to the clips CSV; it must contain a ``videoid`` column.
        data_dir: Directory holding the ``pytube_videos``, ``pytube_audio``
            and ``annotations`` sub-directories.
        verbose: When true, print the columns and the number of clips left
            after each step.
        use_local: When true, drop rows whose video, audio or annotation
            file does not exist on disk.

    Returns:
        The DataFrame with the three path columns added (and filtered when
        ``use_local`` is set).

    Raises:
        AssertionError: If ``df_path`` does not exist.
    """
    assert exists(df_path), f"File {df_path} does not exist"
    df = pd.read_csv(df_path)
    if verbose:
        # Column listing is diagnostic output, so it honours ``verbose`` too.
        print(df.columns)
        print("Number of clips:", len(df))

    # (column to add, sub-directory of data_dir, file extension, log label)
    resources = (
        ("video_path", "pytube_videos", ".mp4", "videos"),
        ("audio_path", "pytube_audio", ".wav", "audio"),
        ("annot_path", "annotations", ".json", "annotations"),
    )
    for column, subdir, ext, label in resources:
        df[column] = df["videoid"].apply(
            lambda x, subdir=subdir, ext=ext: join(data_dir, subdir, f"{x}{ext}")
        )
        if use_local:
            # Keep only rows whose file is present; copy so the next column
            # assignment does not write into a slice of the previous frame.
            df = df[df[column].apply(exists)].copy()
        if verbose:
            print(f"Number of clips (with {label} available):", len(df))
    return df
|
78 |
+
|
79 |
+
|
80 |
+
def summarize_classification_probs(silence, probs):
    """Format a silence score and class probabilities as one caption line.

    Args:
        silence: Value printed after ``"Silence: "``.
        probs: Iterable of ``(label, probability)`` pairs; each label is
            capitalized in the output.

    Returns:
        The entries joined with ``" | "``, silence first.
    """
    summary = [f"Silence: {silence}"]
    summary.extend(f"{label.capitalize()}: {prob}" for label, prob in probs)
    return " | ".join(summary)
|
84 |
+
|
85 |
+
|
86 |
+
def cut_video_in_segments(video_path, segments):
    """Write every ``(start, end)`` segment of a video to its own mp4 file.

    Clips are written to ``~/tmp/clip_<i>.mp4``, where ``<i>`` is the
    position of the segment in ``segments``.

    Args:
        video_path: Path of the source video.
        segments: Sequence of ``(start, end)`` pairs passed to ``subclip``.

    Returns:
        The list of written clip paths, in the order of ``segments``.
    """
    video = VideoFileClip(video_path)
    try:
        tmp_dir = os.path.join(os.path.expanduser("~"), "tmp")
        # Make sure the output directory exists before anything is written.
        os.makedirs(tmp_dir, exist_ok=True)
        clip_paths = [f"{tmp_dir}/clip_{i}.mp4" for i in range(len(segments))]
        iterator = tqdm_iterator(
            zip(segments, clip_paths), total=len(segments), desc="Preparing clips",
        )
        for (x, y), f in iterator:
            video.subclip(x, y).write_videofile(f, logger=None, verbose=False)
    finally:
        # Release the reader held by the source clip, even on failure.
        video.close()
    return clip_paths
|
98 |
+
|
99 |
+
|
100 |
+
def process_sample(row):
    """Cut the kept segments of one clip and build a caption for each.

    Args:
        row: Mapping with the keys ``"video_path"`` and ``"annot_filtered"``
            (path of the annotation JSON).
            NOTE(review): ``load_clips_df`` creates ``"annot_path"``, not
            ``"annot_filtered"`` - confirm which column callers provide.

    Returns:
        A tuple ``(clip_paths, labels, segments, durations)`` restricted to
        the entries whose ``keep_status`` flag is truthy.
    """
    video_path = row["video_path"]

    annot = load_json(row["annot_filtered"])
    keep_status = annot["keep_status"]

    # Keep only the entries flagged in keep_status; pairing each value with
    # its flag avoids a list-membership scan per element.
    keys = ["non_speech_segments", "silence_prob", "audiomae_on_audioset", "duration"]
    kept = {
        k: [x for x, flag in zip(annot[k], keep_status) if flag]
        for k in keys
    }

    labels = [
        summarize_classification_probs(silence, probs)
        for silence, probs in zip(kept["silence_prob"], kept["audiomae_on_audioset"])
    ]
    clip_paths = cut_video_in_segments(video_path, kept["non_speech_segments"])
    return clip_paths, labels, kept["non_speech_segments"], kept["duration"]
|
120 |
+
|
121 |
+
|
122 |
+
def make_grid(cols,rows):
    """Build a grid of Streamlit columns, indexed as ``grid[i][j]``.

    NOTE: the parameter names are swapped relative to what they control.
    ``cols`` is the number of stacked containers (one per grid row) and
    ``rows`` is the value passed to ``st.columns`` inside each container.

    Args:
        cols: Number of containers to create.
        rows: Argument forwarded to ``st.columns`` for every container.

    Returns:
        A list with one ``st.columns(rows)`` result per container.
    """
    grid = []
    for _ in range(cols):
        with st.container():
            grid.append(st.columns(rows))
    return grid
|
128 |
+
|
129 |
+
|
130 |
+
if __name__ == "__main__":

    # Streamlit app: show up to nine kept Foley segments of one randomly
    # chosen clip as embedded YouTube players, followed by dataset stats.
    st.set_page_config(layout="wide")
    st.title("Foley Segments from Condensed Movies Dataset 🎬")

    st.markdown(
        "> **Note**: This demo shows cut out segments from clips in the [Condensed Movies](https://www.robots.ox.ac.uk/~vgg/data/condensed-movies/) dataset. "
        "The segments are adjudged to have Foley sounds based on AudioMAE predictions on AudioSet classes. "
        "The segments with duration not in [2s, 30s] or those with silence or high probability of speech/music sounds are filtered out. "
        "However, segments can still be noisy. Furthermore, even if a clip has "
        "Foley, there can still be background music/score which we have not removed yet."
    )
    st.markdown(
        """> <span style="color:red">Warning</span>: Currently, each clip can be played only once. Replaying starts the clip from beginning of the video.""",
        unsafe_allow_html=True
    )
    st.markdown(
        "**Instructions**: Click the **Reload** button to see segments from a new clip. "
        "Reloading the page is not necessary."
    )

    csv_path = "./clips.csv"
    annot_paths = glob(join(".", "annotations_*", "*_filtered.json"))
    per_video_width = 360
    per_video_height = 240
    print("Total number of clips: {}".format(len(annot_paths)))

    # Load everything once per session; later reruns reuse session state.
    if "data" not in st.session_state:
        # Video ids come from the annotation file names.
        video_ids = [basename(x).split("_filtered.json")[0] for x in annot_paths]

        # Load annotation data
        data = [load_json(p) for p in annot_paths]
        num_foley_per_clip = [sum(d["keep_status"]) for d in data]
        num_foley_segments = np.sum(num_foley_per_clip)

        # Drop clips without any kept segment from BOTH lists, so that
        # data[i] and video_ids[i] keep describing the same clip.
        kept_clips = [i for i, n in enumerate(num_foley_per_clip) if n > 0]
        data = [data[i] for i in kept_clips]
        video_ids = [video_ids[i] for i in kept_clips]

        # Look titles up by video id instead of relying on CSV row order,
        # which is unrelated to the order in which glob returns the files.
        df = pd.read_csv(csv_path)
        title_by_id = dict(zip(df["videoid"], df["title"]))
        titles = [title_by_id.get(v, "Unknown") for v in video_ids]

        # store variables
        st.session_state.titles = titles
        st.session_state.video_ids = video_ids
        st.session_state.data = data
        st.session_state.num_foley_segments = num_foley_segments

    if not st.session_state.data:
        # Without this guard np.random.randint(0, 0) would raise below.
        st.warning("No clips with Foley segments were found.")
        st.stop()

    # Clicking the button makes Streamlit rerun the script, which draws a
    # fresh random index; the return value is not needed.
    st.button("Reload")
    index = np.random.randint(0, len(st.session_state.data))

    # Gather data
    annot = st.session_state.data[index]
    video_id = st.session_state.video_ids[index]

    # Filter into local lists. ``annot`` lives in session state and is
    # reused on later reruns, so it must not be modified: deleting
    # "keep_status" from it would raise KeyError the next time this clip is
    # drawn.
    keep_status = annot["keep_status"]
    keys = ["non_speech_segments", "silence_prob", "audiomae_on_audioset", "duration"]
    kept = {
        k: [x for x, flag in zip(annot[k], keep_status) if flag]
        for k in keys
    }
    labels = [
        summarize_classification_probs(silence, probs)
        for silence, probs in zip(kept["silence_prob"], kept["audiomae_on_audioset"])
    ]
    segments, durations = kept["non_speech_segments"], kept["duration"]
    movie = st.session_state.titles[index]

    st.markdown(f"Showing Foley segments from a clip in movie: **{movie}**")

    # Create a grid of videos
    grid = make_grid(3, 3)

    # Add videos to the grid (at most nine fit)
    for idx in range(0, min(len(segments), 9)):
        i, j = idx // 3, idx % 3

        start, end = segments[idx]
        duration = durations[idx]

        grid[i][j].caption(f"Segment duration: {duration}")
        url = f"https://www.youtube.com/embed/{video_id}?start={int(start)}&end={int(end)}"
        html_code = f"""
        <iframe height="{per_video_height}" width="{per_video_width}" src="{url}" frameborder="0" allowfullscreen></iframe>
        """
        grid[i][j].markdown(html_code, unsafe_allow_html=True)
        grid[i][j].caption(f"{labels[idx]}")

    st.markdown("##### Some stats")
    st.write(f"Total number of unique clips: {len(st.session_state.data)}")
    st.write("Total number of foley segments: {}".format(st.session_state.num_foley_segments))
|