First update
- app.py +306 -80
- bridgetower_custom.py +183 -0
- requirements.txt +5 -1
app.py
CHANGED
@@ -1,90 +1,316 @@
import cv2
import gradio as gr
from PIL import Image
from transformers import BridgeTowerForImageAndTextRetrieval, BridgeTowerProcessor

            break

-        if frame_count % (fps * sample_rate) == 0:
-            frame = Image.fromarray(frame)
-            score = process_frame(frame, text)
-            # print(f"{frame_count} {scores}")
-
-            if float(score[text]) > min_score:
-                if clip_started:
-                    end_time = frame_count / fps
-                else:
-                    clip_started = True
-                    start_time = frame_count / fps
-                    end_time = start_time
-                    start_score = score[text]
-                clip_images.append(frame)
-            elif clip_started:
-                clip_started = False
-                end_time = frame_count / fps
-                clips.append((start_score, start_time, end_time))
-        frame_count += 1
-    return clip_images, clips
-
-
-# Inputs
-video = gr.Video(label="Video")
-text = gr.Text(label="Text query")
-sample_rate = gr.Number(value=5, label="Sample rate (1 frame every 'n' seconds)")
-min_score = gr.Number(value=3, label="Minimum score")
-
-# Output
-gallery = gr.Gallery(label="Images")
-clips = gr.Text(label="Clips (score, start time, end time)")

description = "This Space lets you run semantic search on a video."

-description
+# In[]:
+import sys
+import os
import cv2
import gradio as gr
from PIL import Image
+import numpy as np
+
+from torch.nn.utils.rnn import pad_sequence
from transformers import BridgeTowerForImageAndTextRetrieval, BridgeTowerProcessor

+from bridgetower_custom import BridgeTowerTextFeatureExtractor, BridgeTowerForITC
+
+import pickle
+from tqdm import tqdm
+from PIL import Image
+
+import torch
+import re
+import urllib.parse
+import faiss
+
+import webvtt
+import json
+
+from pytube import YouTube
+from youtube_transcript_api import YouTubeTranscriptApi
+from youtube_transcript_api.formatters import WebVTTFormatter
+
+device = 'cpu'
+model_name = 'BridgeTower/bridgetower-large-itm-mlm-itc'
+model = BridgeTowerForITC.from_pretrained(model_name).to(device)
+text_model = BridgeTowerTextFeatureExtractor.from_pretrained(model_name).to(device)
+
+processor = BridgeTowerProcessor.from_pretrained(model_name)
+
+
+def download_video(video_url, path='/tmp/'):
+
+    yt = YouTube(video_url)
+    yt = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first()
+    if not os.path.exists(path):
+        os.makedirs(path)
+    filepath = os.path.join(path, yt.default_filename)
+    if not os.path.exists(filepath):
+        print('Downloading video from YouTube...')
+        yt.download(path)
+    return filepath
+
+
+# Get transcript in webvtt
+def get_transcript_vtt(video_id, path='/tmp'):
+    filepath = os.path.join(path, 'test_vm.vtt')
+    if os.path.exists(filepath):
+        return filepath
+
+    transcript = YouTubeTranscriptApi.get_transcript(video_id)
+    formatter = WebVTTFormatter()
+    webvtt_formatted = formatter.format_transcript(transcript)
+
+    with open(filepath, 'w', encoding='utf-8') as webvtt_file:
+        webvtt_file.write(webvtt_formatted)
+
+    return filepath
+
+# https://stackoverflow.com/a/57781047
+# Resizes an image and maintains aspect ratio
+def maintain_aspect_ratio_resize(image, width=None, height=None, inter=cv2.INTER_AREA):
+    # Grab the image size and initialize dimensions
+    dim = None
+    (h, w) = image.shape[:2]
+
+    # Return original image if no need to resize
+    if width is None and height is None:
+        return image
+
+    # We are resizing height if width is none
+    if width is None:
+        # Calculate the ratio of the height and construct the dimensions
+        r = height / float(h)
+        dim = (int(w * r), height)
+    # We are resizing width if height is none
+    else:
+        # Calculate the ratio of the width and construct the dimensions
+        r = width / float(w)
+        dim = (width, int(h * r))
+
+    # Return the resized image
+    return cv2.resize(image, dim, interpolation=inter)
+
+def time_to_frame(time, fps):
+    '''
+    convert time in seconds into frame number
+    '''
+    return time * fps - 1
+
+def str2time(strtime):
+    strtime = strtime.strip('"')
+    hrs, mins, seconds = [float(c) for c in strtime.split(':')]
+
+    total_seconds = hrs * 60**2 + mins * 60 + seconds
+
+    return total_seconds
+
+def collate_fn(batch_list):
+    batch = {}
+    batch['input_ids'] = pad_sequence([encoding['input_ids'].squeeze(0) for encoding in batch_list], batch_first=True)
+    batch['attention_mask'] = pad_sequence([encoding['attention_mask'].squeeze(0) for encoding in batch_list], batch_first=True)
+    batch['pixel_values'] = torch.cat([encoding['pixel_values'] for encoding in batch_list], dim=0)
+    batch['pixel_mask'] = torch.cat([encoding['pixel_mask'] for encoding in batch_list], dim=0)
+    return batch
+
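collate_fn pads the tokenized captions to a common length and stacks the image tensors so several frame-caption pairs can go through the model in one call. A minimal sketch of the expected shapes, assuming processor is the BridgeTowerProcessor loaded above and img is any PIL image (the caption strings are made up):

enc_a = processor(img, "a short caption", return_tensors="pt")
enc_b = processor(img, "a much longer caption about the same frame", return_tensors="pt")
batch = collate_fn([enc_a, enc_b])
print(batch['input_ids'].shape)     # (2, max_len), the shorter caption zero-padded on the right
print(batch['pixel_values'].shape)  # (2, 3, H, W)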
+def extract_images_and_embeds(video_id, video_path, subtitles, output, expanded=False, batch_size=2):
+    if os.path.exists(os.path.join(output, 'embeddings.pkl')):
+        return
+
+    os.makedirs(output, exist_ok=True)
+    os.makedirs(os.path.join(output, 'frames'), exist_ok=True)
+    os.makedirs(os.path.join(output, 'frames_thumb'), exist_ok=True)
+
+    count = 0
+
+    vidcap = cv2.VideoCapture(video_path)
+
+    # Get the frames per second
+    fps = vidcap.get(cv2.CAP_PROP_FPS)
+
+    # Get the total number of frames in the video.
+    frame_count = vidcap.get(cv2.CAP_PROP_FRAME_COUNT)
+
+    print(fps, frame_count)
+
+    frame_number = 0
+
+    count = 0
+    anno = []
+
+    embeddings = []
+    batch_list = []
+
+    for idx, caption in enumerate(webvtt.read(subtitles)):
+        st_time = str2time(caption.start)
+        ed_time = str2time(caption.end)
+
+        mid_time = (ed_time + st_time) / 2
+        text = caption.text.replace('\n', ' ')
+
+        if expanded:
+            raise NotImplementedError
+
+        frame_no = time_to_frame(mid_time, fps)
+
+        print('Read a new frame: ', idx, mid_time, frame_no, text)
+        vidcap.set(1, frame_no)  # jump to the frame at the midpoint of the caption
+        success, image = vidcap.read()
+        if success:
+            img_fname = f'{video_id}_{idx:06d}'
+            img_fpath = os.path.join(output, 'frames', img_fname + '.jpg')
+            image = maintain_aspect_ratio_resize(image, height=350)  # resize frame, preserving aspect ratio
+            cv2.imwrite(img_fpath, image)  # save frame as JPEG file
+
+            count += 1
+            anno.append({
+                'image_id': idx,
+                'img_fname': img_fname,
+                'caption': text,
+                'time': mid_time,
+                'frame_no': frame_no
+            })
+
        else:
            break
+
+        encoding = processor(image, text, return_tensors="pt").to(device)
+        encoding['text'] = text
+        encoding['image_filepath'] = img_fpath
+        encoding['start_time'] = caption.start
+
+        batch_list.append(encoding)
+
+        if len(batch_list) == batch_size:
+            batch = collate_fn(batch_list)
+            with torch.no_grad():
+                outputs = model(**batch, output_hidden_states=True)
+
+            for i in range(batch_size):
+                embeddings.append({
+                    'embeddings': outputs.logits[i, 2, :].detach().cpu().numpy(),
+                    'text': batch_list[i]['text'],
+                    'image_filepath': batch_list[i]['image_filepath'],
+                    'start_time': batch_list[i]['start_time'],
+                })
+            batch_list = []
+
+    if batch_list:
+        batch = collate_fn(batch_list)
+        with torch.no_grad():
+            outputs = model(**batch, output_hidden_states=True)
+
+        for i in range(len(batch_list)):
+            embeddings.append({
+                'embeddings': outputs.logits[i, 2, :].detach().cpu().numpy(),
+                'text': batch_list[i]['text'],
+                'image_filepath': batch_list[i]['image_filepath'],
+                'start_time': batch_list[i]['start_time'],
+            })
+
+    with open(os.path.join(output, 'annotations.json'), 'w') as fh:
+        json.dump(anno, fh)
+
+    with open(os.path.join(output, 'embeddings.pkl'), 'wb') as fh:
+        pickle.dump(embeddings, fh)
+
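Each entry written to embeddings.pkl pairs one subtitle segment with its frame and its joint embedding: outputs.logits has shape (batch, 3, contrastive_hidden_size), stacking the text, image and cross-modal vectors, and index 2 selects the cross-modal one. A quick sanity check of the pickle afterwards (the cache path is illustrative):

import pickle

with open('cache/VIDEO_ID/embeddings.pkl', 'rb') as fh:  # hypothetical cache path
    embs = pickle.load(fh)

print(len(embs))                    # one entry per subtitle caption
print(embs[0]['embeddings'].shape)  # (contrastive_hidden_size,), L2-normalized
print(embs[0]['text'], embs[0]['start_time'], embs[0]['image_filepath'])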
+def run_query(video_id, text_query, path='/tmp'):
+
+    embeddings_filepath = os.path.join(path, 'embeddings.pkl')
+    faiss_filepath = os.path.join(path, 'faiss_index.pkl')
+
+    embeddings = pickle.load(open(embeddings_filepath, 'rb'))
+
+    if os.path.exists(faiss_filepath):
+        faiss_index = pickle.load(open(faiss_filepath, 'rb'))
+    else:
+        embs = [emb['embeddings'] for emb in embeddings]
+        vectors = np.stack(embs, axis=0)
+        num_vectors, vector_dim = vectors.shape
+        faiss_index = faiss.IndexFlatIP(vector_dim)
+        faiss_index.add(vectors)
+        pickle.dump(faiss_index, open(faiss_filepath, 'wb'))
+
+    print('Processing query')
+    encoding = processor.tokenizer(text_query, return_tensors="pt").to(device)
+    with torch.no_grad():
+        outputs = text_model(**encoding)
+    emb_query = outputs.cpu().numpy()
+    print('Running FAISS search')
+    _, I = faiss_index.search(emb_query, 6)
+
+    clip_images = [embeddings[idx]['image_filepath'] for idx in I[0]]
+    transcripts = [f"({embeddings[idx]['start_time']}) {embeddings[idx]['text']}" for idx in I[0]]
+    return clip_images, transcripts
+
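Since both the stored frame embeddings and the query embedding are L2-normalized, the inner-product search of IndexFlatIP ranks frames by cosine similarity. Once extract_images_and_embeds has filled the cache directory, run_query can also be called outside the UI; a sketch with a placeholder video id and query:

frames, transcripts = run_query('VIDEO_ID', 'a dog catching a frisbee', path='./cache/VIDEO_ID')
for frame_path, caption in zip(frames, transcripts):
    print(caption, '->', frame_path)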
+
+def get_video_id_from_url(video_url):
+    """
+    Examples:
+    - http://youtu.be/SA2iWivDJiE
+    - http://www.youtube.com/watch?v=_oPAwA_Udwc&feature=feedu
+    - http://www.youtube.com/embed/SA2iWivDJiE
+    - http://www.youtube.com/v/SA2iWivDJiE?version=3&hl=en_US
+    """
+    url = urllib.parse.urlparse(video_url)
+    if url.hostname == 'youtu.be':
+        return url.path[1:]
+    if url.hostname in ('www.youtube.com', 'youtube.com'):
+        if url.path == '/watch':
+            p = urllib.parse.parse_qs(url.query)
+            return p['v'][0]
+        if url.path[:7] == '/embed/':
+            return url.path.split('/')[2]
+        if url.path[:3] == '/v/':
+            return url.path.split('/')[2]
+
+    return None
+
+
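The URL forms listed in the docstring all reduce to the bare video id; anything else falls through to None:

get_video_id_from_url('http://youtu.be/SA2iWivDJiE')                               # 'SA2iWivDJiE'
get_video_id_from_url('http://www.youtube.com/watch?v=_oPAwA_Udwc&feature=feedu')  # '_oPAwA_Udwc'
get_video_id_from_url('http://www.youtube.com/embed/SA2iWivDJiE')                  # 'SA2iWivDJiE'
get_video_id_from_url('https://example.com/some-video')                            # None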
+def process(video_url, text_query):
+    tmp_dir = os.path.join(os.getcwd(), 'cache')
+    video_id = get_video_id_from_url(video_url)
+    output_dir = os.path.join(tmp_dir, video_id)
+    video_file = download_video(video_url, path=output_dir)
+    subtitles = get_transcript_vtt(video_id, path=output_dir)
+    extract_images_and_embeds(video_id=video_id,
+                              video_path=video_file,
+                              subtitles=subtitles,
+                              output=output_dir,
+                              expanded=False,
+                              batch_size=8,
+                              )
+    frame_paths, transcripts = run_query(video_id, text_query, path=output_dir)
+    return video_file, [(image, caption) for image, caption in zip(frame_paths, transcripts)]

description = "This Space lets you run semantic search on a video."

+with gr.Blocks() as demo:
+    gr.Markdown(description)
+    with gr.Row():
+        with gr.Column():
+            video_url = gr.Text(label="Youtube url")
+            text_query = gr.Text(label="Text query")
+            btn = gr.Button("Run query")
+        video_player = gr.Video(label="Video")
+
+    with gr.Row():
+        gallery = gr.Gallery(label="Images").style(grid=6)
+
+    gr.Examples(
+        examples=[
+            ['https://www.youtube.com/watch?v=CvjoXdC-WkM', 'wedding'],
+            ['https://www.youtube.com/watch?v=fWs2dWcNGu0', 'cheesecake on floor'],
+            ['https://www.youtube.com/watch?v=rmPpNsx4yAk', 'cat woman'],
+            ['https://www.youtube.com/watch?v=KCFYf4TJdN0', 'sandwich'],
+        ],
+        inputs=[video_url, text_query],
+    )
+
+    btn.click(fn=process,
+              inputs=[video_url, text_query],
+              outputs=[video_player, gallery],
+              )
+
+demo.launch(share=True, server_port=25566)
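The Blocks UI is only a thin front end over process(); the pipeline can also be driven directly, for example from a notebook, once the module is loaded. The URL/query pair below is the first example wired into the demo:

video_file, results = process('https://www.youtube.com/watch?v=CvjoXdC-WkM', 'wedding')
print(video_file)
for frame_path, caption in results:
    print(caption, frame_path)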
bridgetower_custom.py
ADDED
@@ -0,0 +1,183 @@
+from collections import OrderedDict
+from typing import List, Optional, Tuple, Union
+
+import torch
+from torch import nn
+import torch.nn.functional as F
+
+from torchvision import transforms
+from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
+
+from transformers.modeling_outputs import SequenceClassifierOutput
+
+from transformers import BridgeTowerPreTrainedModel, BridgeTowerModel
+from transformers.models.bridgetower.modeling_bridgetower import BridgeTowerTextModel
+
+class LayerNorm(nn.LayerNorm):
+    """Subclass torch's LayerNorm to handle fp16."""
+
+    def forward(self, x: torch.Tensor):
+        orig_type = x.dtype
+        ret = super().forward(x.type(torch.float32))
+        return ret.type(orig_type)
+
+class BridgeTowerImageFeatureExtractor(nn.Module):
+    def __init__(
+        self,
+        patch_size=14,
+        width=1024,
+        resolution_after=294,
+        ckpt_path=None,
+    ):
+        super().__init__()
+
+        self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False)
+
+        scale = width ** -0.5
+        self.class_embedding = nn.Parameter(scale * torch.randn(width))
+        self.positional_embedding = nn.Parameter(scale * torch.randn((resolution_after // patch_size) ** 2 + 1, width))
+        self.ln_pre = LayerNorm(width)
+
+        if ckpt_path is not None:
+            sd = torch.load(ckpt_path)
+            if 'state_dict' in sd:
+                sd = sd["state_dict"]
+            print(f'Loading feature extractor checkpoint from {ckpt_path}')
+            self.load_state_dict(sd)
+
+    def forward(self, x: torch.Tensor):
+        x = self.conv1(x)  # shape = [*, width, grid, grid]
+        x = x.reshape(x.shape[0], x.shape[1], -1)  # shape = [*, width, grid ** 2]
+        x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]
+        t = self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device)
+        x = torch.cat([t, x], dim=1)  # shape = [*, grid ** 2 + 1, width]
+        x = x + self.positional_embedding.to(x.dtype)
+        x = self.ln_pre(x)
+        x = x.permute(1, 0, 2)  # NLD -> LND
+        return x
+
+
+class BridgeTowerITCHead(nn.Module):
+    def __init__(self, hidden_size, embed_size):
+        super().__init__()
+        self.fc = nn.Linear(hidden_size, embed_size)
+
+    def forward(self, x):
+        x = self.fc(x)
+        return x
+
+
+class _BridgeTowerTextModelWrapper(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.text_model = BridgeTowerTextModel(config)
+
+    def forward(self, **kwargs):
+        return self.text_model(**kwargs)
+
+
+class BridgeTowerTextFeatureExtractor(BridgeTowerPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.bridgetower = _BridgeTowerTextModelWrapper(config.text_config)
+        self.itc_text_head = BridgeTowerITCHead(config.hidden_size, config.contrastive_hidden_size)
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        labels: Optional[torch.LongTensor] = None,
+    ):
+
+        outputs = self.bridgetower(input_ids=input_ids, attention_mask=attention_mask)
+        final_hidden_cls = outputs.last_hidden_state[:, 0, :]
+        final_hidden_cls = F.normalize(self.itc_text_head(final_hidden_cls), dim=-1, p=2)
+
+        return final_hidden_cls
+
+
+class BridgeTowerForITC(BridgeTowerPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.bridgetower = BridgeTowerModel(config)
+
+        self.itc_text_head = BridgeTowerITCHead(config.hidden_size, config.contrastive_hidden_size)
+        self.itc_image_head = BridgeTowerITCHead(config.hidden_size, config.contrastive_hidden_size)
+        self.itc_cross_modal_head = BridgeTowerITCHead(config.hidden_size * 2, config.contrastive_hidden_size)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        pixel_mask: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        image_embeds: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        labels: Optional[torch.LongTensor] = None,
+    ) -> Union[SequenceClassifierOutput, Tuple[torch.FloatTensor]]:
+
+        assert output_hidden_states, 'output_hidden_states should be set to True for BridgeTowerForITC'
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.bridgetower(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            pixel_values=pixel_values,
+            pixel_mask=pixel_mask,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            image_embeds=image_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        pooler_output = outputs.pooler_output if return_dict else outputs[2]
+
+        hidden_states_txt, hidden_states_img, hidden_states_cross_modal = outputs.hidden_states
+
+        final_hidden_txt = hidden_states_txt[-1]
+        final_hidden_img = hidden_states_img[-1]
+
+        image_embeds_with_ln = self.bridgetower.vision_model.visual.forward_post(final_hidden_img)
+        image_token_type_embeddings = self.bridgetower.token_type_embeddings(
+            torch.full((1,), 1, dtype=torch.long, device=self.bridgetower.token_type_embeddings.weight.device)
+        ).expand_as(image_embeds_with_ln)
+
+        final_hidden_img = (
+            self.bridgetower.cross_modal_image_transform(image_embeds_with_ln)
+            + image_token_type_embeddings
+        )
+
+        final_hidden_txt = F.normalize(self.itc_text_head(final_hidden_txt[:, 0, :]), dim=-1, p=2)
+        final_hidden_img = F.normalize(self.itc_image_head(final_hidden_img[:, 0, :]), dim=-1, p=2)
+        final_hidden_cross = F.normalize(self.itc_cross_modal_head(pooler_output), dim=-1, p=2)
+
+        logits = torch.stack([final_hidden_txt, final_hidden_img, final_hidden_cross], dim=-2)
+
+        if not return_dict:
+            return tuple(logits)
+
+        return SequenceClassifierOutput(
+            loss=None,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
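A minimal sketch of how these classes are used, mirroring what app.py does (the frame path and captions are placeholders):

import torch
from PIL import Image
from transformers import BridgeTowerProcessor
from bridgetower_custom import BridgeTowerForITC, BridgeTowerTextFeatureExtractor

name = 'BridgeTower/bridgetower-large-itm-mlm-itc'
processor = BridgeTowerProcessor.from_pretrained(name)
model = BridgeTowerForITC.from_pretrained(name)
text_model = BridgeTowerTextFeatureExtractor.from_pretrained(name)

image = Image.open('frame.jpg')  # placeholder frame extracted from a video
enc = processor(image, 'a person on stage', return_tensors='pt')
with torch.no_grad():
    out = model(**enc, output_hidden_states=True)
cross_emb = out.logits[:, 2, :]  # (1, contrastive_hidden_size) cross-modal embedding

with torch.no_grad():
    query_emb = text_model(**processor.tokenizer('a person on stage', return_tensors='pt'))

# Both vectors are L2-normalized, so the dot product is a cosine similarity.
score = (cross_emb @ query_emb.T).item()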
requirements.txt
CHANGED
@@ -1,4 +1,8 @@
git+https://github.com/huggingface/transformers
torch
requests
-Pillow
+Pillow
+youtube-transcript-api
+faiss-cpu
+webvtt-py
+pytube