bpiyush committed
Commit 72d6257 · 1 Parent(s): a152d7a

First commit
Files changed (3)
  1. app.py +221 -0
  2. clips.csv +0 -0
  3. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,221 @@
+ """Streamlit demo to visualize auto-annotated Foley segments from movie clips."""
+ import os
+ from os.path import join, exists, dirname, abspath
+ import json
+
+ from tqdm import tqdm
+ import numpy as np
+ import pandas as pd
+ import streamlit as st
+ from moviepy.video.io.VideoFileClip import VideoFileClip
+
+ import warnings
+ warnings.simplefilter(action='ignore')
+
+
+ curr_filepath = abspath(__file__)
+ repo_path = dirname(dirname(curr_filepath))
+
+
+ def load_json(path: str) -> dict:
+     """Helper to load a JSON file."""
+     with open(path, 'rb') as f:
+         data = json.load(f)
+     return data
+
+
+ def tqdm_iterator(items, desc=None, bar_format=None, **kwargs):
+     """Wraps an iterable in a tqdm progress bar with a compact default format."""
+     tqdm._instances.clear()
+     iterator = tqdm(
+         items,
+         desc=desc,
+         bar_format=bar_format or '{l_bar}{bar:10}{r_bar}{bar:-10b}',
+         **kwargs,
+     )
+     return iterator
+
+
+ def get_data_root_from_hostname():
+     """Picks the dataset root based on the first 4 characters of the hostname."""
+     import socket
+
+     data_root_lib = {
+         "diva": "/ssd/pbagad/datasets/",
+         "node": "/var/scratch/pbagad/datasets/",
+         "fs4": "/var/scratch/pbagad/datasets/",
+     }
+     hostname = socket.gethostname()
+     hostname = hostname[:4]
+
+     # unknown hosts fall back to "NA"
+     data_root = data_root_lib.get(hostname, "NA")
+     return data_root
+
+
+ def load_clips_df(df_path, data_dir, verbose=True):
+     """Loads the clips CSV and keeps only rows whose video, audio and annotations exist."""
+     assert exists(df_path), f"File {df_path} does not exist"
+     df = pd.read_csv(df_path)
+     if verbose:
+         print("Number of clips:", len(df))
+
+     # filter out clips that are not downloaded
+     df["video_path"] = df["videoid"].apply(lambda x: join(data_dir, "pytube_videos", f"{x}.mp4"))
+     df = df[df["video_path"].apply(exists)]
+     if verbose:
+         print("Number of clips (with videos available):", len(df))
+
+     df["audio_path"] = df["videoid"].apply(lambda x: join(data_dir, "pytube_audio", f"{x}.wav"))
+     df = df[df["audio_path"].apply(exists)]
+     if verbose:
+         print("Number of clips (with audio available):", len(df))
+
+     df["annot_path"] = df["videoid"].apply(lambda x: join(data_dir, "annotations", f"{x}.json"))
+     df = df[df["annot_path"].apply(exists)]
+     if verbose:
+         print("Number of clips (with annotations available):", len(df))
+     return df
+
+
+ def summarize_classification_probs(silence, probs):
+     """Formats the silence probability and the (label, prob) pairs as one caption string."""
+     summary = [f"Silence: {silence}"]
+     summary += [f"{l.capitalize()}: {p}" for (l, p) in probs]
+     return " | ".join(summary)
+
+
+ def cut_video_in_segments(video_path, segments):
+     """Cuts the given (start, end) segments out of the video and writes them to ~/tmp."""
+     video = VideoFileClip(video_path)
+     tmp_dir = os.path.join(os.path.expanduser("~"), "tmp")
+     os.makedirs(tmp_dir, exist_ok=True)
+     clip_paths = [f"{tmp_dir}/clip_{i}.mp4" for i in range(len(segments))]
+     iterator = tqdm_iterator(
+         zip(segments, clip_paths), total=len(segments), desc="Preparing clips",
+     )
+     # write_videofile returns None, so loop instead of collecting the results
+     for (x, y), f in iterator:
+         video.subclip(x, y).write_videofile(f, logger=None, verbose=False)
+     return clip_paths
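+
+
+ # NOTE: a sketch of the assumed filtered-annotation JSON (field names are taken
+ # from the accesses in process_sample below; the value types are an assumption):
+ # {
+ #     "keep_status": [true, false, ...],           # one flag per segment
+ #     "non_speech_segments": [[start, end], ...],  # in seconds
+ #     "silence_prob": [...],                       # one value per segment
+ #     "audiomae_on_audioset": [[(label, prob), ...], ...],
+ #     "duration": [...],                           # segment durations
+ # }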
+ def process_sample(row):
+     """Loads the filtered annotation for a row, keeps the flagged segments,
+     and cuts the local video into those segments."""
+     video_path = row["video_path"]
+
+     annot = load_json(row["annot_filtered"])
+     seg_indices = [i for i, flag in enumerate(annot["keep_status"]) if flag]
+
+     # keep only the segments flagged in keep_status
+     keys = ["non_speech_segments", "silence_prob", "audiomae_on_audioset", "duration"]
+     for k in keys:
+         annot[k] = [x for i, x in enumerate(annot[k]) if i in seg_indices]
+     del annot["keep_status"]
+
+     labels = [
+         summarize_classification_probs(
+             annot["silence_prob"][i], annot["audiomae_on_audioset"][i]
+         ) for i in range(len(annot["non_speech_segments"]))
+     ]
+     clip_paths = cut_video_in_segments(video_path, annot["non_speech_segments"])
+     return clip_paths, labels, annot["non_speech_segments"], annot["duration"]
+
+
+ def make_grid(n_rows, n_cols):
+     """Builds a grid of st.columns: n_rows containers with n_cols columns each."""
+     grid = [0] * n_rows
+     for i in range(n_rows):
+         with st.container():
+             grid[i] = st.columns(n_cols)
+     return grid
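+
+ # e.g. make_grid(3, 3) returns a 3x3 grid of cells; grid[i][j] addresses the
+ # cell in row i, column j and exposes the usual st.* element methods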
+
+
+ if __name__ == "__main__":
+
+     # Streamlit app code
+     st.set_page_config(layout="wide")
+     st.title("Foley Segments from Condensed Movies Dataset 🎬")
+
+     st.markdown(
+         "> **Note**: This demo shows segments cut out of clips in the [Condensed Movies](https://www.robots.ox.ac.uk/~vgg/data/condensed-movies/) dataset. "\
+         "The segments are judged to contain Foley sounds based on AudioMAE predictions over AudioSet classes. "\
+         "Segments whose duration falls outside [2s, 30s], or that are mostly silent or have a high probability of speech/music, are filtered out. "\
+         "However, segments can still be noisy. Furthermore, even if a clip has "\
+         "Foley, there can still be background music/score which we have not removed yet."
+     )
+     st.markdown(
+         """> <span style="color:red">Warning</span>: Currently, each clip can be played only once. Replaying starts the clip from the beginning of the video.""",
+         unsafe_allow_html=True
+     )
+     st.markdown(
+         "**Instructions**: Click the **Reload** button to see segments from a new clip. "\
+         "Reloading the page is not necessary."
+     )
+
+     use_local = False
+     data_root = get_data_root_from_hostname()
+     data_dir = join(data_root, "CondensedMovies")
+     video_dir = join(data_dir, "pytube_videos")
+
+     annot_dir = join(repo_path, "external/CondensedMovies/data/metadata/")
+
+     # Load the clips table once per session and cache it in session_state
+     if "subdf" not in st.session_state:
+         df = load_clips_df(join(".", "clips.csv"), data_dir, verbose=True)
+         df["annot_filtered"] = df["annot_path"].apply(lambda x: x.replace(".json", "_filtered.json"))
+         df = df[df["annot_filtered"].apply(exists)]
+         df["num_foley_segments"] = df["annot_filtered"].apply(lambda f: sum(load_json(f)["keep_status"]))
+         subdf = df[df["num_foley_segments"] > 0]
+         st.session_state.subdf = subdf
+         st.session_state.num_foley = subdf["num_foley_segments"].sum()
+         print(f"Loaded subdf with {len(subdf)} rows")
+
+     # Streamlit reruns the script on every interaction, so clicking Reload
+     # simply triggers a rerun and a fresh random clip index is drawn
+     st.button("Reload")
+     index = np.random.randint(0, len(st.session_state.subdf))
+
+     row = st.session_state.subdf.iloc[index].to_dict()
+     if use_local:
+         clip_paths, labels, segments, durations = process_sample(row)
+     else:
+         # mirrors process_sample, but streams from YouTube instead of local files
+         annot = load_json(row["annot_filtered"])
+         seg_indices = [i for i, flag in enumerate(annot["keep_status"]) if flag]
+         keys = ["non_speech_segments", "silence_prob", "audiomae_on_audioset", "duration"]
+         for k in keys:
+             annot[k] = [x for i, x in enumerate(annot[k]) if i in seg_indices]
+         del annot["keep_status"]
+         labels = [
+             summarize_classification_probs(
+                 annot["silence_prob"][i], annot["audiomae_on_audioset"][i]
+             ) for i in range(len(annot["non_speech_segments"]))
+         ]
+         segments, durations = annot["non_speech_segments"], annot["duration"]
+         clip_paths = [f"https://www.youtube.com/watch?v={row['videoid']}"] * len(segments)
+
+     # Make a grid of videos and captions in streamlit
+     videos = clip_paths
+     video_id = row["videoid"]
+     movie = row["title"]
+     st.markdown(f"Showing Foley segments from a clip in movie: **{movie}**")
+
+     # Create a grid of videos
+     grid = make_grid(3, 3)
+
+     # Add videos to the grid (at most 9 segments)
+     for idx in range(min(len(videos), 9)):
+         i, j = idx // 3, idx % 3
+
+         start, end = segments[idx]
+         duration = durations[idx]
+
+         grid[i][j].caption(f"Segment duration: {duration}")
+         if not use_local:
+             # embed the YouTube player restricted to [start, end] of the clip
+             url = f"https://www.youtube.com/embed/{video_id}?start={int(start)}&end={int(end)}"
+             html_code = f"""
+             <iframe height="320" width="420" src="{url}" frameborder="0" allowfullscreen></iframe>
+             """
+             grid[i][j].markdown(html_code, unsafe_allow_html=True)
+         else:
+             grid[i][j].video(videos[idx])
+         grid[i][j].caption(f"{labels[idx]}")
+
+     st.markdown("##### Some stats")
+     st.write(f"Total number of unique clips: {len(st.session_state.subdf)}")
+     st.write(f"Total number of Foley segments: {st.session_state.num_foley}")
clips.csv ADDED
The diff for this file is too large to render. See raw diff
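Note: app.py assumes clips.csv contains at least a `videoid` column (used to locate the videos, audio, and annotations on disk and to build YouTube URLs) and a `title` column (the movie title shown in the app).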
 
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ numpy
+ pandas
+ streamlit
+ moviepy
+ tqdm
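
Note: the moviepy calls in app.py (`VideoFileClip.subclip` and the `verbose` argument to `write_videofile`) follow the moviepy 1.x API; if a newer moviepy breaks them, pinning the version (e.g. `moviepy<2`) is a reasonable fix.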