DenseAV

Runtime error

App Files Files Community

mhamilton723 committed on Jun 11, 2024

Commit

c5d5ef0

verified ·

1 Parent(s): 523ffdf

Update app.py

Browse files

Files changed (1) hide show

app.py +141 -98

app.py CHANGED Viewed

@@ -1,122 +1,165 @@
-import matplotlib.pyplot as plt
 import torch
 import torchvision.transforms as T
 from PIL import Image
-import gradio as gr
-from featup.util import norm, unnorm, pca, remove_axes
-from pytorch_lightning import seed_everything
-import os
-import requests
-import os
-import csv
-def plot_feats(image, lr, hr):
-    assert len(image.shape) == len(lr.shape) == len(hr.shape) == 3
-    seed_everything(0)
-    [lr_feats_pca, hr_feats_pca], _ = pca([lr.unsqueeze(0), hr.unsqueeze(0)], dim=9)
-    fig, ax = plt.subplots(3, 3, figsize=(15, 15))
-    ax[0, 0].imshow(image.permute(1, 2, 0).detach().cpu())
-    ax[1, 0].imshow(image.permute(1, 2, 0).detach().cpu())
-    ax[2, 0].imshow(image.permute(1, 2, 0).detach().cpu())
-    ax[0, 0].set_title("Image", fontsize=22)
-    ax[0, 1].set_title("Original", fontsize=22)
-    ax[0, 2].set_title("Upsampled Features", fontsize=22)
-    ax[0, 1].imshow(lr_feats_pca[0, :3].permute(1, 2, 0).detach().cpu())
-    ax[0, 0].set_ylabel("PCA Components 1-3", fontsize=22)
-    ax[0, 2].imshow(hr_feats_pca[0, :3].permute(1, 2, 0).detach().cpu())
-    ax[1, 1].imshow(lr_feats_pca[0, 3:6].permute(1, 2, 0).detach().cpu())
-    ax[1, 0].set_ylabel("PCA Components 4-6", fontsize=22)
-    ax[1, 2].imshow(hr_feats_pca[0, 3:6].permute(1, 2, 0).detach().cpu())
-    ax[2, 1].imshow(lr_feats_pca[0, 6:9].permute(1, 2, 0).detach().cpu())
-    ax[2, 0].set_ylabel("PCA Components 7-9", fontsize=22)
-    ax[2, 2].imshow(hr_feats_pca[0, 6:9].permute(1, 2, 0).detach().cpu())
-    remove_axes(ax)
-    plt.tight_layout()
-    plt.close(fig)  # Close plt to avoid additional empty plots
-    return fig
-if __name__ == "__main__":
-    def download_image(url, save_path):
         response = requests.get(url)
         with open(save_path, 'wb') as file:
             file.write(response.content)
-    base_url = "https://marhamilresearch4.blob.core.windows.net/feature-upsampling-public/sample_images/"
-    sample_images_urls = {
-        "skate.jpg": base_url + "skate.jpg",
-        "car.jpg": base_url + "car.jpg",
-        "plant.png": base_url + "plant.png",
-    }
-    sample_images_dir = "/tmp/sample_images"
-    # Ensure the directory for sample images exists
-    os.makedirs(sample_images_dir, exist_ok=True)
-    # Download each sample image
-    for filename, url in sample_images_urls.items():
-        save_path = os.path.join(sample_images_dir, filename)
-        # Download the image if it doesn't already exist
         if not os.path.exists(save_path):
             print(f"Downloading {filename}...")
-            download_image(url, save_path)
         else:
             print(f"{filename} already exists. Skipping download.")
-    os.environ['TORCH_HOME'] = '/tmp/.cache'
-    os.environ['GRADIO_EXAMPLES_CACHE'] = '/tmp/gradio_cache'
     csv.field_size_limit(100000000)
-    options = ['dino16', 'vit', 'dinov2', 'clip', 'resnet50']
-    image_input = gr.Image(label="Choose an image to featurize",
-                           height=480,
-                           type="pil",
-                           image_mode='RGB',
-                           sources=['upload', 'webcam', 'clipboard']
-                           )
-    model_option = gr.Radio(options, value="dino16", label='Choose a backbone to upsample')
-    models = {o: torch.hub.load("mhamilton723/FeatUp", o) for o in options}
-    def upsample_features(image, model_option):
-        # Image preprocessing
-        input_size = 224
-        transform = T.Compose([
-            T.Resize(input_size),
-            T.CenterCrop((input_size, input_size)),
-            T.ToTensor(),
-            norm
-        ])
-        image_tensor = transform(image).unsqueeze(0).cuda()
-        # Load the selected model
-        upsampler = models[model_option].cuda()
-        hr_feats = upsampler(image_tensor)
-        lr_feats = upsampler.model(image_tensor)
-        upsampler.cpu()
-        return plot_feats(unnorm(image_tensor)[0], lr_feats[0], hr_feats[0])
-    demo = gr.Interface(fn=upsample_features,
-                        inputs=[image_input, model_option],
-                        outputs="plot",
-                        title="Feature Upsampling Demo",
-                        description="This demo allows you to upsample features of an image using selected models.",
-                        examples=[
-                            ["/tmp/sample_images/skate.jpg", "dino16"],
-                            ["/tmp/sample_images/car.jpg", "dinov2"],
-                            ["/tmp/sample_images/plant.png", "dino16"],
-                        ]
-                        )
     demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)

+import csv
+import os
+import tempfile
+import gradio as gr
+import requests
 import torch
+import torchvision
 import torchvision.transforms as T
 from PIL import Image
+from featup.util import norm
+from torchaudio.functional import resample
+from denseav.plotting import plot_attention_video, plot_2head_attention_video, plot_feature_video
+from denseav.shared import norm, crop_to_divisor, blur_dim
+from os.path import join
+if __name__ == "__main__":
+    # Point the torch hub cache and the Gradio examples cache at /tmp
+    # (presumably because /tmp is writable in the hosting environment -- confirm).
+    os.environ['TORCH_HOME'] = '/tmp/.cache'
+    os.environ['GRADIO_EXAMPLES_CACHE'] = '/tmp/gradio_cache'
+    # Directory the sample videos are downloaded into. The name has to be
+    # sample_videos_dir: the os.makedirs / os.path.join / join calls further
+    # down read that name, and binding it as sample_images_dir left it
+    # undefined, which raised NameError at startup.
+    sample_videos_dir = "/tmp/samples"
+    # sample_videos_dir = "samples"
+    def download_video(url, save_path):
+        """Fetch ``url`` with ``requests.get`` and write the response body to ``save_path``.
+
+        The destination is opened in binary write mode, so an existing file at
+        ``save_path`` is overwritten.
+
+        NOTE(review): the HTTP status is not checked and no timeout is set, so
+        the body of an error response would be saved as if it were the video.
+        """
         response = requests.get(url)
         with open(save_path, 'wb') as file:
             file.write(response.content)
+    # Common URL prefix of the hosted sample videos
+    base_url = "https://marhamilresearch4.blob.core.windows.net/denseav-public/samples/"
+    # Maps each local file name to the URL it is downloaded from
+    sample_videos_urls = {
+        "puppies.mp4": base_url + "puppies.mp4",
+    }
+    # Ensure the directory for sample videos exists
+    os.makedirs(sample_videos_dir, exist_ok=True)
+    # Download each sample video
+    for filename, url in sample_videos_urls.items():
+        save_path = os.path.join(sample_videos_dir, filename)
+        # Download the video if it doesn't already exist
         if not os.path.exists(save_path):
             print(f"Downloading {filename}...")
+            download_video(url, save_path)
         else:
             print(f"{filename} already exists. Skipping download.")
+    # Set the csv module's maximum field size to 100,000,000.
+    # NOTE(review): nothing in the visible code parses CSV itself; presumably a
+    # dependency needs the larger limit -- confirm before removing.
     csv.field_size_limit(100000000)
+    # Model names offered in the UI; each name is also passed to torch.hub.load below
+    options = ['language', "sound", "sound_and_language"]
+    # Resize targets: load_size for the frames fed to the model, plot_size for the
+    # frames used in the rendered videos (both are read inside process_video)
+    load_size = 224
+    plot_size = 224
+    # Input widgets; they are created here and placed into the layout later via .render()
+    video_input = gr.Video(label="Choose a video to featurize", height=480)
+    model_option = gr.Radio(options, value="language", label='Choose a model')
+    # Output players. process_video returns four video paths -- presumably one per
+    # player, in this order (TODO confirm against the event wiring).
+    video_output1 = gr.Video(label="Audio Video Attention", height=480)
+    video_output2 = gr.Video(label="Multi-Head Audio Video Attention (Only Availible for sound_and_language)",
+                             height=480)
+    video_output3 = gr.Video(label="Visual Features", height=480)
+    video_output4 = gr.Video(label="Audio Features", height=480)
+    # Load every model once at startup, keyed by its option name
+    models = {o: torch.hub.load("mhamilton723/DenseAV", o) for o in options}
+    def process_video(video, model_option):
+        """Run the selected model on a video and render the visualisation videos.
+
+        Args:
+            video: Input video, passed as-is to ``torchvision.io.read_video``.
+            model_option: Key into ``models`` selecting which loaded model to run.
+
+        Returns:
+            A 4-tuple of ``.mp4`` paths: the output of ``plot_attention_video``,
+            the output of ``plot_2head_attention_video`` (``None`` unless
+            ``model_option == "sound_and_language"``), and the two output paths
+            handed to ``plot_feature_video``.
+        """
+        # NOTE(review): the model is moved to the GPU here and never moved back,
+        # so every model that has been selected at least once stays on the GPU.
+        model = models[model_option].cuda()
+        # Only read up to the 10 second mark of the video
+        original_frames, audio, info = torchvision.io.read_video(video, end_pts=10, pts_unit='sec')
+        # Resample the audio to 16 kHz when the file uses a different rate
+        sample_rate = 16000
+        if info["audio_fps"] != sample_rate:
+            audio = resample(audio, info["audio_fps"], sample_rate)
+        # Keep only the first row of the audio tensor and restore the leading
+        # dimension (presumably this selects the first channel -- TODO confirm)
+        audio = audio[0].unsqueeze(0)
+        # Per-frame preprocessing for the model: resize, crop_to_divisor(x, 8),
+        # convert to float32 divided by 255, then apply norm
+        img_transform = T.Compose([
+            T.Resize(load_size, Image.BILINEAR),
+            lambda x: crop_to_divisor(x, 8),
+            lambda x: x.to(torch.float32) / 255,
+            norm])
+        # Each frame has its last axis moved to the front before the transform
+        # (presumably HWC -> CHW); the results are concatenated along a new leading axis
+        frames = torch.cat([img_transform(f.permute(2, 0, 1)).unsqueeze(0) for f in original_frames], axis=0)
+        # Same preprocessing without norm, applied to the whole frame stack at once;
+        # these frames are only handed to the plotting helpers
+        plotting_img_transform = T.Compose([
+            T.Resize(plot_size, Image.BILINEAR),
+            lambda x: crop_to_divisor(x, 8),
+            lambda x: x.to(torch.float32) / 255])
+        frames_to_plot = plotting_img_transform(original_frames.permute(0, 3, 1, 2))
+        # Inference only: no gradients are needed
+        with torch.no_grad():
+            # Inputs go to the GPU for the forward passes; the resulting feature
+            # dicts are moved back to the CPU right away
+            audio_feats = model.forward_audio({"audio": audio.cuda()})
+            audio_feats = {k: v.cpu() for k, v in audio_feats.items()}
+            image_feats = model.forward_image({"frames": frames.unsqueeze(0).cuda()}, max_batch_size=2)
+            image_feats = {k: v.cpu() for k, v in image_feats.items()}
+            # Pairwise similarities from the merged feature dicts, averaged over
+            # the second-to-last dimension
+            sim_by_head = model.sim_agg.get_pairwise_sims(
+                {**image_feats, **audio_feats},
+                raw=False,
+                agg_sim=False,
+                agg_heads=False
+            ).mean(dim=-2).cpu()
+            # Blur along the last dimension with a window of 3
+            sim_by_head = blur_dim(sim_by_head, window=3, dim=-1)
+            # Debug output of the similarity tensor's shape
+            print(sim_by_head.shape)
+        # NOTE(review): tempfile.mktemp is deprecated in the Python docs because the
+        # returned name can be claimed by another process before it is used;
+        # consider tempfile.mkstemp / NamedTemporaryFile(delete=False) once it is
+        # confirmed that the plotting helpers accept an already existing file.
+        temp_video_path_1 = tempfile.mktemp(suffix='.mp4')
+        plot_attention_video(
+            sim_by_head,
+            frames_to_plot,
+            audio,
+            info["video_fps"],
+            sample_rate,
+            temp_video_path_1)
+        # The two-head attention video is only produced for the sound_and_language model
+        if model_option == "sound_and_language":
+            temp_video_path_2 = tempfile.mktemp(suffix='.mp4')
+            plot_2head_attention_video(
+                sim_by_head,
+                frames_to_plot,
+                audio,
+                info["video_fps"],
+                sample_rate,
+                temp_video_path_2)
+        else:
+            temp_video_path_2 = None
+        temp_video_path_3 = tempfile.mktemp(suffix='.mp4')
+        temp_video_path_4 = tempfile.mktemp(suffix='.mp4')
+        # The feature dict values were already moved to the CPU above, so the
+        # .cpu() calls below do not move any data
+        plot_feature_video(
+            image_feats["image_feats"].cpu(),
+            audio_feats['audio_feats'].cpu(),
+            frames_to_plot,
+            audio,
+            info["video_fps"],
+            sample_rate,
+            temp_video_path_3,
+            temp_video_path_4,
+        )
+        return temp_video_path_1, temp_video_path_2, temp_video_path_3, temp_video_path_4
+    # Lay out the pre-built components and wire them to process_video.
+    with gr.Blocks() as demo:
+        with gr.Column():
+            video_input.render()
+            model_option.render()
+            # A Blocks app only runs a function through an event listener; without
+            # this button and its click handler process_video was never called.
+            submit_button = gr.Button("Submit")
+            with gr.Row():
+                video_output1.render()
+                video_output2.render()
+            with gr.Row():
+                video_output3.render()
+                video_output4.render()
+            # Assigning demo.examples has no effect on a Blocks app; gr.Examples is
+            # the component that registers clickable examples for the inputs.
+            # Caching is disabled so startup does not run the models on the example.
+            gr.Examples(
+                examples=[
+                    [join(sample_videos_dir, "puppies.mp4"), "language"],
+                ],
+                inputs=[video_input, model_option],
+                cache_examples=False,
+            )
+        # Event listeners have to be registered inside the Blocks context
+        submit_button.click(
+            fn=process_video,
+            inputs=[video_input, model_option],
+            outputs=[video_output1, video_output2, video_output3, video_output4],
+        )
+    # demo.launch(server_name="0.0.0.0", server_port=6006, debug=True)
     demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)