{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "metadata": {} }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/root/miniconda3/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "InternVideo2Config {\n", " \"_attn_implementation_autoset\": true,\n", " \"architectures\": [\n", " \"InternVideo2_CLIP_small\"\n", " ],\n", " \"auto_map\": {\n", " \"AutoConfig\": \"config.InternVideo2Config\",\n", " \"AutoModel\": \"modeling_internvideo2encoder.InternVideo2_CLIP_small\"\n", " },\n", " \"auto_resume\": false,\n", " \"batch_size\": 64,\n", " \"batch_size_test\": 4,\n", " \"best_key\": [\n", " \"msrvtt_1k_test_match\",\n", " \"t2v_r1\"\n", " ],\n", " \"compile_model\": false,\n", " \"criterion\": {\n", " \"clip_loss_ratio\": [\n", " 1.0,\n", " 1.0\n", " ],\n", " \"distill_final_features\": true,\n", " \"loss_weight\": {\n", " \"mlm\": 1.0,\n", " \"mvm\": 0.0,\n", " \"uta\": 0.0,\n", " \"vtc\": 1.0,\n", " \"vtm\": 1.0\n", " },\n", " \"mlm_masking_prob\": 0.5,\n", " \"vtm_hard_neg\": true\n", " },\n", " \"debug\": false,\n", " \"deep_fusion\": false,\n", " \"deepspeed\": {\n", " \"enable\": true,\n", " \"stage\": 1\n", " },\n", " \"delete_ds_optim_states\": true,\n", " \"device\": \"cuda\",\n", " \"dist_url\": \"env://\",\n", " \"evaluate\": false,\n", " \"evaluation\": {\n", " \"eval_frame_ensemble\": \"concat\",\n", " \"eval_offload\": true,\n", " \"eval_x_only\": false,\n", " \"k_test\": 128\n", " },\n", " \"gradient_checkpointing\": true,\n", " \"inputs\": {\n", " \"batch_size\": {\n", " \"image\": 64,\n", " \"video\": 64\n", " },\n", " \"batch_size_test\": {\n", " \"image\": 4,\n", " \"video\": 4\n", " },\n", " \"image_res\": 224,\n", " \"max_txt_l\": {\n", " \"image\": 32,\n", 
" \"video\": 32\n", " },\n", " \"video_input\": {\n", " \"num_frames\": 8,\n", " \"num_frames_test\": 8,\n", " \"random_aug\": false,\n", " \"sample_type\": \"middle\",\n", " \"sample_type_test\": \"middle\"\n", " }\n", " },\n", " \"jump_evaluate\": false,\n", " \"log_freq\": 100,\n", " \"max_txt_l\": 32,\n", " \"mode\": \"pt\",\n", " \"model\": {\n", " \"embed_dim\": 1024,\n", " \"find_unused_parameters\": false,\n", " \"freeze_text\": true,\n", " \"freeze_vision\": true,\n", " \"load_vision_ckpt_from_internvideo2_stage2\": false,\n", " \"model_cls\": \"InternVideo2_CLIP_small\",\n", " \"multimodal\": {\n", " \"enable\": true\n", " },\n", " \"open_text_projection\": false,\n", " \"open_vision_clip_projector\": true,\n", " \"temp\": 0.01,\n", " \"temp_min\": 0.01,\n", " \"text_encoder\": {\n", " \"embed_dim\": 512,\n", " \"image_cfg\": {\n", " \"image_size\": 224,\n", " \"model_name\": \"vit_b16\"\n", " },\n", " \"text_cfg\": {\n", " \"causal_masking\": true,\n", " \"context_length\": 77,\n", " \"dim\": 512,\n", " \"ffn_multiplier_per_layer\": 4.0,\n", " \"model_name\": \"base\",\n", " \"n_heads_per_layer\": 8,\n", " \"n_transformer_layers\": 12,\n", " \"norm_layer\": \"layer_norm_fp32\",\n", " \"vocab_size\": 49408\n", " }\n", " },\n", " \"vision_encoder\": {\n", " \"align_dim\": 512,\n", " \"attn_pool_num_heads\": 16,\n", " \"checkpoint_num\": 0,\n", " \"clip_embed_dim\": 768,\n", " \"depth\": 24,\n", " \"drop_cls_token\": false,\n", " \"drop_path_rate\": 0.0,\n", " \"embed_dim\": 1024,\n", " \"fused_mlp_heuristic\": 1,\n", " \"head_drop_path_rate\": 0.0,\n", " \"img_size\": 224,\n", " \"in_chans\": 3,\n", " \"init_values\": 0.1,\n", " \"layerscale_no_force_fp32\": true,\n", " \"mlp_ratio\": 4,\n", " \"name\": \"internvideo2_1B\",\n", " \"num_frames\": 8,\n", " \"num_heads\": 16,\n", " \"patch_size\": 14,\n", " \"qk_normalization\": true,\n", " \"qkv_bias\": false,\n", " \"sep_pos_embed\": false,\n", " \"tubelet_size\": 1,\n", " \"use_checkpoint\": false,\n", " 
\"use_flash_attn\": false,\n", " \"use_fused_mlp\": false,\n", " \"use_fused_rmsnorm\": false\n", " }\n", " },\n", " \"model_type\": \"internvideo2\",\n", " \"num_frames\": 8,\n", " \"num_frames_test\": 8,\n", " \"num_workers\": 6,\n", " \"optimizer\": {\n", " \"different_lr\": {\n", " \"enable\": false,\n", " \"lr\": 0.001,\n", " \"module_names\": []\n", " },\n", " \"lr\": 5e-05,\n", " \"max_grad_norm\": 3.0,\n", " \"opt\": \"adamW\",\n", " \"opt_betas\": [\n", " 0.9,\n", " 0.98\n", " ],\n", " \"weight_decay\": 0.05\n", " },\n", " \"output_dir\": null,\n", " \"pretrained_path\": \"\",\n", " \"resume\": false,\n", " \"save_ckpt_iter\": null,\n", " \"save_latest\": true,\n", " \"scheduler\": {\n", " \"epochs\": 10,\n", " \"min_lr_multi\": 0.01,\n", " \"sched\": \"cosine\",\n", " \"warmup_epochs\": 1\n", " },\n", " \"seed\": 42,\n", " \"test_file\": {\n", " \"didemo_ret_test\": \"available_corpus[\\\"didemo_ret_test\\\"]\",\n", " \"msrvtt_1k_test\": \"available_corpus[\\\"msrvtt_1k_test\\\"]\"\n", " },\n", " \"test_types\": [\n", " \"msrvtt_1k_test\",\n", " \"didemo_ret_test\"\n", " ],\n", " \"text_enc\": \"bert_large\",\n", " \"tokenizer\": null,\n", " \"torch_dtype\": \"float32\",\n", " \"train_file\": \"available_corpus[\\\"pretrain_example_data_1B\\\"]\",\n", " \"transformers_version\": \"4.51.3\",\n", " \"use_bf16\": true,\n", " \"use_flash_sdp\": false,\n", " \"use_half_precision\": false,\n", " \"use_mem_efficient_sdp\": false,\n", " \"wandb\": {\n", " \"enable\": false,\n", " \"entity\": \"opengvlab\",\n", " \"project\": \"InternVideo2-Stage2\"\n", " }\n", "}\n", "\n" ] } ], "source": [ "from transformers import AutoConfig, AutoModel\n", "config = AutoConfig.from_pretrained(\"/fs-computility/video/heyinan/iv2hf/\", trust_remote_code=True)\n", "model = AutoModel.from_pretrained(\"/fs-computility/video/heyinan/iv2hf/\", trust_remote_code=True).to(config.device)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "metadata": {} }, "outputs": [], 
"source": [ "import os\n", "import random\n", "import io\n", "import av\n", "import cv2\n", "import decord\n", "import imageio\n", "from decord import VideoReader\n", "import torch\n", "import numpy as np\n", "import math\n", "import torch.nn.functional as F\n", "decord.bridge.set_bridge(\"torch\")\n", "\n", "\n", "def get_frame_indices(num_frames, vlen, sample='rand', fix_start=None, input_fps=1, max_num_frames=-1, start=None, end=None):\n", " start_frame, end_frame = 0, vlen\n", " if start is not None:\n", " start_frame = max(start_frame,int(start * input_fps))\n", " if end is not None:\n", " end_frame = min(end_frame,int(end * input_fps))\n", "\n", " # Ensure start_frame is less than end_frame\n", " if start_frame >= end_frame:\n", " raise ValueError(\"Start frame index must be less than end frame index\")\n", "\n", " # Calculate the length of the clip in frames\n", " clip_length = end_frame - start_frame\n", "\n", " if sample in [\"rand\", \"middle\"]: # uniform sampling\n", " acc_samples = min(num_frames, clip_length)\n", " # split the clip into `acc_samples` intervals, and sample from each interval.\n", " intervals = np.linspace(start=start_frame, stop=end_frame, num=acc_samples + 1).astype(int)\n", " ranges = []\n", " for idx, interv in enumerate(intervals[:-1]):\n", " ranges.append((interv, intervals[idx + 1] - 1))\n", " if sample == 'rand':\n", " try:\n", " frame_indices = [random.choice(range(x[0], x[1] + 1)) for x in ranges]\n", " except:\n", " frame_indices = np.random.permutation(clip_length)[:acc_samples] + start_frame\n", " frame_indices.sort()\n", " frame_indices = list(frame_indices)\n", " elif fix_start is not None:\n", " frame_indices = [x[0] + fix_start for x in ranges]\n", " elif sample == 'middle':\n", " frame_indices = [(x[0] + x[1]) // 2 for x in ranges]\n", " else:\n", " raise NotImplementedError\n", "\n", " if len(frame_indices) < num_frames: # padded with last frame\n", " padded_frame_indices = [frame_indices[-1]] * num_frames\n", " 
padded_frame_indices[:len(frame_indices)] = frame_indices\n", " frame_indices = padded_frame_indices\n", " elif \"fps\" in sample: # fps0.5, sequentially sample frames at 0.5 fps\n", " output_fps = float(sample[3:])\n", " duration = float(clip_length) / input_fps\n", " delta = 1 / output_fps # gap between frames, this is also the clip length each frame represents\n", " frame_seconds = np.arange(0 + delta / 2, duration + delta / 2, delta)\n", " frame_indices = np.around(frame_seconds * input_fps).astype(int) + start_frame\n", " frame_indices = [e for e in frame_indices if e < end_frame]\n", " if max_num_frames > 0 and len(frame_indices) > max_num_frames:\n", " frame_indices = frame_indices[:max_num_frames]\n", " # frame_indices = np.linspace(0 + delta / 2, duration + delta / 2, endpoint=False, num=max_num_frames)\n", " else:\n", " raise ValueError\n", " return frame_indices\n", "\n", "def read_frames_decord(\n", " video_path, num_frames, sample='middle', fix_start=None, \n", " max_num_frames=-1, client=None, trimmed30=False, start=None, end=None\n", " ):\n", " num_threads = 1 if video_path.endswith('.webm') else 0 # make ssv2 happy\n", "\n", " video_reader = VideoReader(video_path, num_threads=num_threads)\n", " vlen = len(video_reader)\n", " \n", " fps = video_reader.get_avg_fps()\n", " duration = vlen / float(fps)\n", "\n", " frame_indices = get_frame_indices(\n", " num_frames, vlen, sample=sample, fix_start=fix_start,\n", " input_fps=fps, max_num_frames=max_num_frames, start=start, end=end\n", " )\n", "\n", " frames = video_reader.get_batch(frame_indices) # (T, H, W, C), torch.uint8\n", " frames = frames.permute(0, 3, 1, 2) # (T, C, H, W), torch.uint8\n", " return frames, frame_indices, duration" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "metadata": {} }, "outputs": [], "source": [ "def get_text_feature(model, texts):\n", " text_input = model.tokenizer(texts).to(model.device)\n", " text_features = model.encode_text(text_input)\n", " return 
def get_similarity(video_feature, text_feature):
    """Return the cosine-similarity matrix between texts and videos.

    Both inputs are L2-normalised along their last dimension; the result is
    ``text_feature @ video_feature.T``, i.e. one row per text and one column
    per video.
    """
    video_feature = F.normalize(video_feature, dim=-1)
    text_feature = F.normalize(text_feature, dim=-1)
    sim_matrix = text_feature @ video_feature.T
    return sim_matrix


def get_top_videos(model, text_features, video_features, video_paths, texts, top_k=5):
    """Print and return the best-matching videos for each text query.

    Args:
        model: Unused; kept so existing call sites keep working.
        text_features: One feature row per entry of ``texts``.
        video_features: One feature row per entry of ``video_paths``.
        video_paths: Video identifiers, aligned with ``video_features`` rows.
        texts: Query strings, aligned with ``text_features`` rows.
        top_k: Maximum number of videos to report per query. Clamped to the
            number of videos so that fewer than ``top_k`` videos no longer
            makes ``torch.topk`` raise.

    Returns:
        Dict mapping each query text to a list of
        ``{"video", "prob", "rank"}`` dicts ordered by rank (1 = best).
        "prob" holds the raw cosine similarity, not a normalised
        probability; the key name is kept for compatibility. Duplicate
        query strings share one key, so the later one overwrites the earlier.
    """
    sim_matrix = get_similarity(video_features, text_features)

    # Never request more results than there are videos.
    k = max(0, min(top_k, sim_matrix.shape[1]))
    top_scores, top_indices = torch.topk(sim_matrix, k, dim=1)

    retrieval_infos = {}
    # Convert once to plain Python floats/ints so list indexing and the
    # returned records do not carry tensors.
    for i, (scores, indices) in enumerate(zip(top_scores.tolist(), top_indices.tolist())):
        text = texts[i]
        print("\n", text)
        retrieval_infos[text] = []
        for rank, (score, video_idx) in enumerate(zip(scores, indices), start=1):
            print("top", rank, ":", video_paths[video_idx], "~prob:", score)
            retrieval_infos[text].append({"video": video_paths[video_idx], "prob": score, "rank": rank})
    return retrieval_infos


if __name__=="__main__":
    # NOTE(review): only two of these paths carry the "xinhuashe_test_video/"
    # prefix -- confirm that the remaining files really live in the working
    # directory.
    demo_videos = ["video-scene-00030.mp4","video-scene-00031.mp4","xinhuashe_test_video/video-scene-00032.mp4","xinhuashe_test_video/video-scene-00033.mp4","video-scene-00034.mp4"]
    texts = ['a person talking', 'a logo', 'a building']

    per_video_features = []
    for video_path in demo_videos:
        # 8 frames per video, sampled with the default "middle" strategy.
        frames, frame_indices, video_duration = read_frames_decord(video_path,8)
        frames = model.transform(frames).unsqueeze(0).to(model.device)
        # Extract the video feature (inference only, no autograd graph).
        with torch.no_grad():
            video_feature = model.encode_vision(frames, test=True)
        per_video_features.append(video_feature)

    # Extract the text features, also without tracking gradients.
    with torch.no_grad():
        text_features = get_text_feature(model, texts)
    # Stack per-video features and match the text features' dtype so the
    # similarity matmul does not fail on mixed precision.
    video_features = torch.cat(per_video_features, dim=0).to(text_features.dtype).to(config.device)
    results = get_top_videos(model, text_features, video_features, demo_videos, texts)