Spaces:

bpiyush
/

FoleySegments

Build error

App Files Community

bpiyush committed on Feb 5, 2023

Commit

b190ca2

1 Parent(s): ebe4290

New app loaded

Browse files

Files changed (2) hide show

app.py +49 -30
app_old.py +226 -0

app.py CHANGED Viewed

@@ -136,7 +136,7 @@ if __name__ == "__main__":
     st.markdown(
         "> **Note**: This demo shows cut out segments from clips in the [Condensed Movies](https://www.robots.ox.ac.uk/~vgg/data/condensed-movies/) dataset. "\
         "The segments are adjudged to have Foley sounds based on AudioMAE predictions on AudioSet classes. "\
-        "The segments with duration not in [2s, 30s] or those with silence or high probability of speech/music sounds are filtered out. "\
         "However, segments can still be noisy. Furthermore, even if a clip has "\
         "Foley, there can still be background music/score which we have not removed yet."
     )
@@ -150,34 +150,50 @@ if __name__ == "__main__":
     )
     csv_path = "./clips.csv"
-    ann_dirs = glob(join(".", "annotations_", "*"))
-    annot_paths = glob(join(".", "annotations_*", "*_filtered.json"))
     per_video_width = 360
     per_video_height = 240
     print("Total number of clips: {}".format(len(annot_paths)))
     if "data" not in st.session_state:
-        # store video ids
-        video_ids = [basename(x).split("_filtered.json")[0] for x in annot_paths]
-        # load annotation data
         data = [load_json(p) for p in annot_paths]
-        num_foley_per_clip = [sum(d["keep_status"]) for d in data]
-        num_foley_segments = np.sum(num_foley_per_clip)
-        data = [d for d, n in zip(data, num_foley_per_clip) if n > 0]
         # get movie titles
         df = pd.read_csv(csv_path)
         titles = df["title"].values
         # store variables
         st.session_state.titles = titles
         st.session_state.video_ids = video_ids
         st.session_state.data = data
         st.session_state.num_foley_segments = num_foley_segments
     reload_button = st.button("Reload")
     index = np.random.randint(0, len(st.session_state.data))
     if reload_button:
@@ -186,21 +202,27 @@ if __name__ == "__main__":
     # Gather data
     annot = st.session_state.data[index]
     video_id = st.session_state.video_ids[index]
-    seg_indices = [i for i, flag in enumerate(annot["keep_status"]) if flag]
-    keys = ["non_speech_segments", "silence_prob", "audiomae_on_audioset", "duration"]
-    for k in keys:
-        annot[k] = [x for i, x in enumerate(annot[k]) if i in seg_indices]
-    del annot["keep_status"]
-    labels = [
-        summarize_classification_probs(
-            annot["silence_prob"][i], annot["audiomae_on_audioset"][i]
-        ) for i in range(len(annot["non_speech_segments"]))
-    ]
-    segments, durations = annot["non_speech_segments"], annot["duration"]
-    movie = st.session_state.titles[index]
-    st.markdown(f"Showing Foley segments from a clip in movie: **{movie}**")
     # Create a grid of videos
     grid = make_grid(3, 3)
@@ -220,7 +242,4 @@ if __name__ == "__main__":
         grid[i][j].markdown(html_code, unsafe_allow_html=True)
         grid[i][j].caption(f"{labels[idx]}")
-    st.markdown("##### Some stats")
-    st.write(f"Total number of unique clips: {len(st.session_state.data)}")
-    st.write("Total number of foley segments: {}".format(st.session_state.num_foley_segments))

     st.markdown(
         "> **Note**: This demo shows cut out segments from clips in the [Condensed Movies](https://www.robots.ox.ac.uk/~vgg/data/condensed-movies/) dataset. "\
         "The segments are adjudged to have Foley sounds based on AudioMAE predictions on AudioSet classes. "\
+        "The segments are cut into fixed 5s long chunks. Those with silence or high probability of speech/music sounds are filtered out. "\
         "However, segments can still be noisy. Furthermore, even if a clip has "\
         "Foley, there can still be background music/score which we have not removed yet."
     )
     )
     csv_path = "./clips.csv"
+    ann_dirname = "annotations-v2.0"
+    ext = ".json"
+    ann_dirs = glob(join(".", f"{ann_dirname}_", "*"))
+    annot_paths = glob(join(".", f"{ann_dirname}_*", f"*{ext}"))
     per_video_width = 360
     per_video_height = 240
     print("Total number of clips: {}".format(len(annot_paths)))
     if "data" not in st.session_state:
+        # Store video ids
+        video_ids = [basename(x).split(ext)[0] for x in annot_paths]
+        # Load annotation data
         data = [load_json(p) for p in annot_paths]
+        # Filter those examples with no foley segments
+        indices = [i for i, d in enumerate(data) if len(d["segments"]) > 0]
+        data = [d for i, d in enumerate(data) if i in indices]
+        video_ids = [video_ids[i] for i in indices]
+        # Compute the total number of Foley segments
+        num_foley_segments = np.sum([len(d["segments"]) for d in data])
         # get movie titles
         df = pd.read_csv(csv_path)
+        df = df[df["videoid"].isin(video_ids)]
+        # reorder rows to match video_ids
+        df = df.set_index("videoid").loc[video_ids].reset_index()
         titles = df["title"].values
         # store variables
         st.session_state.titles = titles
         st.session_state.video_ids = video_ids
         st.session_state.data = data
         st.session_state.num_foley_segments = num_foley_segments
+    st.markdown("##### Some stats")
+    st.write(f"Total number of unique clips: {len(st.session_state.data)}")
+    st.write("Total number of foley segments: {}".format(st.session_state.num_foley_segments))
+    st.markdown("---")
     reload_button = st.button("Reload")
     index = np.random.randint(0, len(st.session_state.data))
     if reload_button:
     # Gather data
     annot = st.session_state.data[index]
     video_id = st.session_state.video_ids[index]
+    title = st.session_state.titles[index]
+    segments = annot["segments"]
+    durations = [np.round(y - x, 2) for [x, y] in annot["segments"]]
+    labels = annot["labels"]
+    # seg_indices = [i for i, flag in enumerate(annot["keep_status"]) if flag]
+    # keys = ["non_speech_segments", "silence_prob", "audiomae_on_audioset", "duration"]
+    # for k in keys:
+    #     annot[k] = [x for i, x in enumerate(annot[k]) if i in seg_indices]
+    # del annot["keep_status"]
+    # labels = [
+    #     summarize_classification_probs(
+    #         annot["silence_prob"][i], annot["audiomae_on_audioset"][i]
+    #     ) for i in range(len(annot["non_speech_segments"]))
+    # ]
+    # segments, durations = annot["non_speech_segments"], annot["duration"]
+    # movie = st.session_state.titles[index]
+    st.markdown(f"Showing Foley segments from a clip in movie: **{title}**")
     # Create a grid of videos
     grid = make_grid(3, 3)
         grid[i][j].markdown(html_code, unsafe_allow_html=True)
         grid[i][j].caption(f"{labels[idx]}")

app_old.py ADDED Viewed

@@ -0,0 +1,226 @@

+"""Streamlit demo to visualize auto-annotated Foley segments from movie clips."""
+import os
+from os.path import join, exists, dirname, abspath, basename
+import json
+from glob import glob
+from tqdm import tqdm
+import numpy as np
+import pandas as pd
+import streamlit as st
+from moviepy.video.io.VideoFileClip import VideoFileClip
+import warnings
+warnings.simplefilter(action='ignore')
+curr_filepath = abspath(__file__)
+repo_path = dirname(dirname(curr_filepath))
+def load_json(path: str) -> dict:
+    """Helper to load json file"""
+    with open(path, 'rb') as f:
+        data = json.load(f)
+    return data
+def tqdm_iterator(items, desc=None, bar_format=None, **kwargs):
+    tqdm._instances.clear()
+    iterator = tqdm(
+        items,
+        desc=desc,
+        bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}',
+        **kwargs,
+    )
+    return iterator
+def get_data_root_from_hostname():
+    import socket
+    data_root_lib = {
+        "diva": "/ssd/pbagad/datasets/",
+        "node": "/var/scratch/pbagad/datasets/",
+        "fs4": "/var/scratch/pbagad/datasets/",
+    }
+    hostname = socket.gethostname()
+    hostname = hostname[:4]
+    data_root = data_root_lib.get(hostname, "NA")
+    return data_root
+def load_clips_df(df_path, data_dir, verbose=True, use_local=False):
+    assert exists(df_path), f"File {df_path} does not exist"
+    df = pd.read_csv(df_path)
+    print(df.columns)
+    if verbose:
+        print("Number of clips:", len(df))
+    # filter out clips that are not downloaded
+    df["video_path"] = df["videoid"].apply(lambda x: join(data_dir, "pytube_videos", f"{x}.mp4"))
+    if use_local:
+        df = df[df["video_path"].apply(exists)]
+    if verbose:
+        print("Number of clips (with videos available):", len(df))
+    df["audio_path"] = df["videoid"].apply(lambda x: join(data_dir, "pytube_audio", f"{x}.wav"))
+    if use_local:
+        df = df[df["audio_path"].apply(exists)]
+    if verbose:
+        print("Number of clips (with audio available):", len(df))
+    df["annot_path"] = df["videoid"].apply(lambda x: join(data_dir, "annotations", f"{x}.json"))
+    if use_local:
+        df = df[df["annot_path"].apply(exists)]
+    if verbose:
+        print("Number of clips (with annotations available):", len(df))
+    return df
+def summarize_classification_probs(silence, probs):
+    summary = [f"Silence: {silence}"]
+    summary += [f"{l.capitalize()}: {p}" for (l, p) in probs]
+    return " | ".join(summary)
+def cut_video_in_segments(video_path, segments):
+    video = VideoFileClip(video_path)
+    tmp_dir = os.path.join(os.path.expanduser("~"), "tmp")
+    clip_paths = [f"{tmp_dir}/clip_{i}.mp4" for i in range(len(segments))]
+    iterator = tqdm_iterator(
+        zip(segments, clip_paths), total=len(segments), desc="Preparing clips",
+    )
+    clips = [
+        video.subclip(x, y).write_videofile(f, logger=None, verbose=False) \
+            for (x, y), f in iterator
+    ]
+    return clip_paths
+def process_sample(row):
+    video_path = row["video_path"]
+    audio_path = row["audio_path"]
+    annot = load_json(row["annot_filtered"])
+    seg_indices = [i for i, flag in enumerate(annot["keep_status"]) if flag]
+    keys = ["non_speech_segments", "silence_prob", "audiomae_on_audioset", "duration"]
+    for k in keys:
+        annot[k] = [x for i, x in enumerate(annot[k]) if i in seg_indices]
+    del annot["keep_status"]
+    labels = [
+        summarize_classification_probs(
+            annot["silence_prob"][i], annot["audiomae_on_audioset"][i]
+        ) for i in range(len(annot["non_speech_segments"]))
+    ]
+    clip_paths = cut_video_in_segments(video_path, annot["non_speech_segments"])
+    return clip_paths, labels, annot["non_speech_segments"], annot["duration"]
+def make_grid(cols,rows):
+    grid = [0]*cols
+    for i in range(cols):
+        with st.container():
+            grid[i] = st.columns(rows)
+    return grid
+if __name__ == "__main__":
+    # Streamlit app code
+    st.set_page_config(layout="wide")
+    st.title("Foley Segments from Condensed Movies Dataset 🎬")
+    st.markdown(
+        "> **Note**: This demo shows cut out segments from clips in the [Condensed Movies](https://www.robots.ox.ac.uk/~vgg/data/condensed-movies/) dataset. "\
+        "The segments are adjudged to have Foley sounds based on AudioMAE predictions on AudioSet classes. "\
+        "The segments with duration not in [2s, 30s] or those with silence or high probability of speech/music sounds are filtered out. "\
+        "However, segments can still be noisy. Furthermore, even if a clip has "\
+        "Foley, there can still be background music/score which we have not removed yet."
+    )
+    st.markdown(
+        """> <span style="color:red">Warning</span>: Currently, each clip can be played only once. Replaying starts the clip from beginning of the video.""",
+        unsafe_allow_html=True
+    )
+    st.markdown(
+        "**Instructions**: Click the **Reload** button to see segments from a new clip. "\
+        "Reloading the page is not necessary."
+    )
+    csv_path = "./clips.csv"
+    ann_dirs = glob(join(".", "annotations_", "*"))
+    annot_paths = glob(join(".", "annotations_*", "*_filtered.json"))
+    per_video_width = 360
+    per_video_height = 240
+    print("Total number of clips: {}".format(len(annot_paths)))
+    if "data" not in st.session_state:
+        # store video ids
+        video_ids = [basename(x).split("_filtered.json")[0] for x in annot_paths]
+        # load annotation data
+        data = [load_json(p) for p in annot_paths]
+        num_foley_per_clip = [sum(d["keep_status"]) for d in data]
+        num_foley_segments = np.sum(num_foley_per_clip)
+        data = [d for d, n in zip(data, num_foley_per_clip) if n > 0]
+        # get movie titles
+        df = pd.read_csv(csv_path)
+        titles = df["title"].values
+        # store variables
+        st.session_state.titles = titles
+        st.session_state.video_ids = video_ids
+        st.session_state.data = data
+        st.session_state.num_foley_segments = num_foley_segments
+    reload_button = st.button("Reload")
+    index = np.random.randint(0, len(st.session_state.data))
+    if reload_button:
+        index = np.random.randint(0, len(st.session_state.data))
+    # Gather data
+    annot = st.session_state.data[index]
+    video_id = st.session_state.video_ids[index]
+    seg_indices = [i for i, flag in enumerate(annot["keep_status"]) if flag]
+    keys = ["non_speech_segments", "silence_prob", "audiomae_on_audioset", "duration"]
+    for k in keys:
+        annot[k] = [x for i, x in enumerate(annot[k]) if i in seg_indices]
+    del annot["keep_status"]
+    labels = [
+        summarize_classification_probs(
+            annot["silence_prob"][i], annot["audiomae_on_audioset"][i]
+        ) for i in range(len(annot["non_speech_segments"]))
+    ]
+    segments, durations = annot["non_speech_segments"], annot["duration"]
+    movie = st.session_state.titles[index]
+    st.markdown(f"Showing Foley segments from a clip in movie: **{movie}**")
+    # Create a grid of videos
+    grid = make_grid(3, 3)
+    # Add videos to the grid
+    for idx in range(0, min(len(segments), 9)):
+        i, j = idx // 3, idx % 3
+        start, end = segments[idx]
+        duration = durations[idx]
+        grid[i][j].caption(f"Segment duration: {duration}")
+        url = f"https://www.youtube.com/embed/{video_id}?start={int(start)}&end={int(end)}"
+        html_code = f"""
+        <iframe height="{per_video_height}" width="{per_video_width}" src="{url}" frameborder="0" allowfullscreen></iframe>
+        """
+        grid[i][j].markdown(html_code, unsafe_allow_html=True)
+        grid[i][j].caption(f"{labels[idx]}")
+    st.markdown("##### Some stats")
+    st.write(f"Total number of unique clips: {len(st.session_state.data)}")
+    st.write("Total number of foley segments: {}".format(st.session_state.num_foley_segments))