{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "metadata": {}
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/root/miniconda3/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "InternVideo2Config {\n",
      "  \"_attn_implementation_autoset\": true,\n",
      "  \"architectures\": [\n",
      "    \"InternVideo2_CLIP_small\"\n",
      "  ],\n",
      "  \"auto_map\": {\n",
      "    \"AutoConfig\": \"config.InternVideo2Config\",\n",
      "    \"AutoModel\": \"modeling_internvideo2encoder.InternVideo2_CLIP_small\"\n",
      "  },\n",
      "  \"auto_resume\": false,\n",
      "  \"batch_size\": 64,\n",
      "  \"batch_size_test\": 4,\n",
      "  \"best_key\": [\n",
      "    \"msrvtt_1k_test_match\",\n",
      "    \"t2v_r1\"\n",
      "  ],\n",
      "  \"compile_model\": false,\n",
      "  \"criterion\": {\n",
      "    \"clip_loss_ratio\": [\n",
      "      1.0,\n",
      "      1.0\n",
      "    ],\n",
      "    \"distill_final_features\": true,\n",
      "    \"loss_weight\": {\n",
      "      \"mlm\": 1.0,\n",
      "      \"mvm\": 0.0,\n",
      "      \"uta\": 0.0,\n",
      "      \"vtc\": 1.0,\n",
      "      \"vtm\": 1.0\n",
      "    },\n",
      "    \"mlm_masking_prob\": 0.5,\n",
      "    \"vtm_hard_neg\": true\n",
      "  },\n",
      "  \"debug\": false,\n",
      "  \"deep_fusion\": false,\n",
      "  \"deepspeed\": {\n",
      "    \"enable\": true,\n",
      "    \"stage\": 1\n",
      "  },\n",
      "  \"delete_ds_optim_states\": true,\n",
      "  \"device\": \"cuda\",\n",
      "  \"dist_url\": \"env://\",\n",
      "  \"evaluate\": false,\n",
      "  \"evaluation\": {\n",
      "    \"eval_frame_ensemble\": \"concat\",\n",
      "    \"eval_offload\": true,\n",
      "    \"eval_x_only\": false,\n",
      "    \"k_test\": 128\n",
      "  },\n",
      "  \"gradient_checkpointing\": true,\n",
      "  \"inputs\": {\n",
      "    \"batch_size\": {\n",
      "      \"image\": 64,\n",
      "      \"video\": 64\n",
      "    },\n",
      "    \"batch_size_test\": {\n",
      "      \"image\": 4,\n",
      "      \"video\": 4\n",
      "    },\n",
      "    \"image_res\": 224,\n",
      "    \"max_txt_l\": {\n",
      "      \"image\": 32,\n",
      "      \"video\": 32\n",
      "    },\n",
      "    \"video_input\": {\n",
      "      \"num_frames\": 8,\n",
      "      \"num_frames_test\": 8,\n",
      "      \"random_aug\": false,\n",
      "      \"sample_type\": \"middle\",\n",
      "      \"sample_type_test\": \"middle\"\n",
      "    }\n",
      "  },\n",
      "  \"jump_evaluate\": false,\n",
      "  \"log_freq\": 100,\n",
      "  \"max_txt_l\": 32,\n",
      "  \"mode\": \"pt\",\n",
      "  \"model\": {\n",
      "    \"embed_dim\": 1024,\n",
      "    \"find_unused_parameters\": false,\n",
      "    \"freeze_text\": true,\n",
      "    \"freeze_vision\": true,\n",
      "    \"load_vision_ckpt_from_internvideo2_stage2\": false,\n",
      "    \"model_cls\": \"InternVideo2_CLIP_small\",\n",
      "    \"multimodal\": {\n",
      "      \"enable\": true\n",
      "    },\n",
      "    \"open_text_projection\": false,\n",
      "    \"open_vision_clip_projector\": true,\n",
      "    \"temp\": 0.01,\n",
      "    \"temp_min\": 0.01,\n",
      "    \"text_encoder\": {\n",
      "      \"embed_dim\": 512,\n",
      "      \"image_cfg\": {\n",
      "        \"image_size\": 224,\n",
      "        \"model_name\": \"vit_b16\"\n",
      "      },\n",
      "      \"text_cfg\": {\n",
      "        \"causal_masking\": true,\n",
      "        \"context_length\": 77,\n",
      "        \"dim\": 512,\n",
      "        \"ffn_multiplier_per_layer\": 4.0,\n",
      "        \"model_name\": \"base\",\n",
      "        \"n_heads_per_layer\": 8,\n",
      "        \"n_transformer_layers\": 12,\n",
      "        \"norm_layer\": \"layer_norm_fp32\",\n",
      "        \"vocab_size\": 49408\n",
      "      }\n",
      "    },\n",
      "    \"vision_encoder\": {\n",
      "      \"align_dim\": 512,\n",
      "      \"attn_pool_num_heads\": 16,\n",
      "      \"checkpoint_num\": 0,\n",
      "      \"clip_embed_dim\": 768,\n",
      "      \"depth\": 24,\n",
      "      \"drop_cls_token\": false,\n",
      "      \"drop_path_rate\": 0.0,\n",
      "      \"embed_dim\": 1024,\n",
      "      \"fused_mlp_heuristic\": 1,\n",
      "      \"head_drop_path_rate\": 0.0,\n",
      "      \"img_size\": 224,\n",
      "      \"in_chans\": 3,\n",
      "      \"init_values\": 0.1,\n",
      "      \"layerscale_no_force_fp32\": true,\n",
      "      \"mlp_ratio\": 4,\n",
      "      \"name\": \"internvideo2_1B\",\n",
      "      \"num_frames\": 8,\n",
      "      \"num_heads\": 16,\n",
      "      \"patch_size\": 14,\n",
      "      \"qk_normalization\": true,\n",
      "      \"qkv_bias\": false,\n",
      "      \"sep_pos_embed\": false,\n",
      "      \"tubelet_size\": 1,\n",
      "      \"use_checkpoint\": false,\n",
      "      \"use_flash_attn\": false,\n",
      "      \"use_fused_mlp\": false,\n",
      "      \"use_fused_rmsnorm\": false\n",
      "    }\n",
      "  },\n",
      "  \"model_type\": \"internvideo2\",\n",
      "  \"num_frames\": 8,\n",
      "  \"num_frames_test\": 8,\n",
      "  \"num_workers\": 6,\n",
      "  \"optimizer\": {\n",
      "    \"different_lr\": {\n",
      "      \"enable\": false,\n",
      "      \"lr\": 0.001,\n",
      "      \"module_names\": []\n",
      "    },\n",
      "    \"lr\": 5e-05,\n",
      "    \"max_grad_norm\": 3.0,\n",
      "    \"opt\": \"adamW\",\n",
      "    \"opt_betas\": [\n",
      "      0.9,\n",
      "      0.98\n",
      "    ],\n",
      "    \"weight_decay\": 0.05\n",
      "  },\n",
      "  \"output_dir\": null,\n",
      "  \"pretrained_path\": \"\",\n",
      "  \"resume\": false,\n",
      "  \"save_ckpt_iter\": null,\n",
      "  \"save_latest\": true,\n",
      "  \"scheduler\": {\n",
      "    \"epochs\": 10,\n",
      "    \"min_lr_multi\": 0.01,\n",
      "    \"sched\": \"cosine\",\n",
      "    \"warmup_epochs\": 1\n",
      "  },\n",
      "  \"seed\": 42,\n",
      "  \"test_file\": {\n",
      "    \"didemo_ret_test\": \"available_corpus[\\\"didemo_ret_test\\\"]\",\n",
      "    \"msrvtt_1k_test\": \"available_corpus[\\\"msrvtt_1k_test\\\"]\"\n",
      "  },\n",
      "  \"test_types\": [\n",
      "    \"msrvtt_1k_test\",\n",
      "    \"didemo_ret_test\"\n",
      "  ],\n",
      "  \"text_enc\": \"bert_large\",\n",
      "  \"tokenizer\": null,\n",
      "  \"torch_dtype\": \"float32\",\n",
      "  \"train_file\": \"available_corpus[\\\"pretrain_example_data_1B\\\"]\",\n",
      "  \"transformers_version\": \"4.51.3\",\n",
      "  \"use_bf16\": true,\n",
      "  \"use_flash_sdp\": false,\n",
      "  \"use_half_precision\": false,\n",
      "  \"use_mem_efficient_sdp\": false,\n",
      "  \"wandb\": {\n",
      "    \"enable\": false,\n",
      "    \"entity\": \"opengvlab\",\n",
      "    \"project\": \"InternVideo2-Stage2\"\n",
      "  }\n",
      "}\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from transformers import AutoConfig, AutoModel\n",
    "config = AutoConfig.from_pretrained(\"/fs-computility/video/heyinan/iv2hf/\", trust_remote_code=True)\n",
    "model = AutoModel.from_pretrained(\"/fs-computility/video/heyinan/iv2hf/\", trust_remote_code=True).to(config.device)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "metadata": {}
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import random\n",
    "import io\n",
    "import av\n",
    "import cv2\n",
    "import decord\n",
    "import imageio\n",
    "from decord import VideoReader\n",
    "import torch\n",
    "import numpy as np\n",
    "import math\n",
    "import torch.nn.functional as F\n",
    "decord.bridge.set_bridge(\"torch\")\n",
    "\n",
    "\n",
    "def get_frame_indices(num_frames, vlen, sample='rand', fix_start=None, input_fps=1, max_num_frames=-1, start=None, end=None):\n",
    "    start_frame, end_frame = 0, vlen\n",
    "    if start is not None:\n",
    "        start_frame = max(start_frame,int(start * input_fps))\n",
    "    if end is not None:\n",
    "        end_frame = min(end_frame,int(end * input_fps))\n",
    "\n",
    "    # Ensure start_frame is less than end_frame\n",
    "    if start_frame >= end_frame:\n",
    "        raise ValueError(\"Start frame index must be less than end frame index\")\n",
    "\n",
    "    # Calculate the length of the clip in frames\n",
    "    clip_length = end_frame - start_frame\n",
    "\n",
    "    if sample in [\"rand\", \"middle\"]:  # uniform sampling\n",
    "        acc_samples = min(num_frames, clip_length)\n",
    "        # split the clip into `acc_samples` intervals, and sample from each interval.\n",
    "        intervals = np.linspace(start=start_frame, stop=end_frame, num=acc_samples + 1).astype(int)\n",
    "        ranges = []\n",
    "        for idx, interv in enumerate(intervals[:-1]):\n",
    "            ranges.append((interv, intervals[idx + 1] - 1))\n",
    "        if sample == 'rand':\n",
    "            try:\n",
    "                frame_indices = [random.choice(range(x[0], x[1] + 1)) for x in ranges]\n",
    "            except:\n",
    "                frame_indices = np.random.permutation(clip_length)[:acc_samples] + start_frame\n",
    "                frame_indices.sort()\n",
    "                frame_indices = list(frame_indices)\n",
    "        elif fix_start is not None:\n",
    "            frame_indices = [x[0] + fix_start for x in ranges]\n",
    "        elif sample == 'middle':\n",
    "            frame_indices = [(x[0] + x[1]) // 2 for x in ranges]\n",
    "        else:\n",
    "            raise NotImplementedError\n",
    "\n",
    "        if len(frame_indices) < num_frames:  # padded with last frame\n",
    "            padded_frame_indices = [frame_indices[-1]] * num_frames\n",
    "            padded_frame_indices[:len(frame_indices)] = frame_indices\n",
    "            frame_indices = padded_frame_indices\n",
    "    elif \"fps\" in sample:  # fps0.5, sequentially sample frames at 0.5 fps\n",
    "        output_fps = float(sample[3:])\n",
    "        duration = float(clip_length) / input_fps\n",
    "        delta = 1 / output_fps  # gap between frames, this is also the clip length each frame represents\n",
    "        frame_seconds = np.arange(0 + delta / 2, duration + delta / 2, delta)\n",
    "        frame_indices = np.around(frame_seconds * input_fps).astype(int) + start_frame\n",
    "        frame_indices = [e for e in frame_indices if e < end_frame]\n",
    "        if max_num_frames > 0 and len(frame_indices) > max_num_frames:\n",
    "            frame_indices = frame_indices[:max_num_frames]\n",
    "            # frame_indices = np.linspace(0 + delta / 2, duration + delta / 2, endpoint=False, num=max_num_frames)\n",
    "    else:\n",
    "        raise ValueError\n",
    "    return frame_indices\n",
    "\n",
    "def read_frames_decord(\n",
    "        video_path, num_frames, sample='middle', fix_start=None, \n",
    "        max_num_frames=-1, client=None, trimmed30=False, start=None, end=None\n",
    "    ):\n",
    "    num_threads = 1 if video_path.endswith('.webm') else 0 # make ssv2 happy\n",
    "\n",
    "    video_reader = VideoReader(video_path, num_threads=num_threads)\n",
    "    vlen = len(video_reader)\n",
    " \n",
    "    fps = video_reader.get_avg_fps()\n",
    "    duration = vlen / float(fps)\n",
    "\n",
    "    frame_indices = get_frame_indices(\n",
    "        num_frames, vlen, sample=sample, fix_start=fix_start,\n",
    "        input_fps=fps, max_num_frames=max_num_frames, start=start, end=end\n",
    "    )\n",
    "\n",
    "    frames = video_reader.get_batch(frame_indices)  # (T, H, W, C), torch.uint8\n",
    "    frames = frames.permute(0, 3, 1, 2)  # (T, C, H, W), torch.uint8\n",
    "    return frames, frame_indices, duration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "metadata": {}
   },
   "outputs": [],
   "source": [
    "def get_text_feature(model, texts):\n",
    "    text_input = model.tokenizer(texts).to(model.device)\n",
    "    text_features = model.encode_text(text_input)\n",
    "    return text_features\n",
    "    \n",
    "def get_similarity(video_feature, text_feature):\n",
    "    video_feature = F.normalize(video_feature, dim=-1)\n",
    "    text_feature = F.normalize(text_feature, dim=-1)\n",
    "    sim_matrix = text_feature @ video_feature.T\n",
    "    return sim_matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "metadata": {}
   },
   "outputs": [],
   "source": [
    "def get_top_videos(model, text_features, video_features, video_paths, texts):\n",
    "    # text_features = get_text_feature(texts)\n",
    "\n",
    "    video_features = F.normalize(video_features, dim=-1)\n",
    "    text_features = F.normalize(text_features, dim=-1)\n",
    "\n",
    "    # print(text_features.shape, video_features.shape)\n",
    "    sim_matrix = text_features @ video_features.T\n",
    "    # print(sim_matrix.shape)\n",
    "\n",
    "    top_k = 5\n",
    "    sim_matrix_top_k = torch.topk(sim_matrix, top_k, dim=1)[1]\n",
    "    softmax_sim_matrix = F.softmax(sim_matrix, dim=1)\n",
    "\n",
    "    retrieval_infos = {}\n",
    "    for i in range(len(sim_matrix_top_k)):\n",
    "        print(\"\\n\",texts[i])\n",
    "        retrieval_infos[texts[i]] = []\n",
    "        for j in range(top_k):\n",
    "            print(\"top\", j+1, \":\", video_paths[sim_matrix_top_k[i][j]], \"~prob:\", sim_matrix[i][sim_matrix_top_k[i][j]].item())\n",
    "            retrieval_infos[texts[i]].append({\"video\":  video_paths[sim_matrix_top_k[i][j]], \"prob\": sim_matrix[i][sim_matrix_top_k[i][j]].item(), \"rank\": j+1})\n",
    "    return retrieval_infos"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "metadata": {}
   },
   "outputs": [],
   "source": [
    "if __name__==\"__main__\":\n",
    "    video_features = []\n",
    "    demo_videos = [\"video-scene-00030.mp4\",\"video-scene-00031.mp4\",\"xinhuashe_test_video/video-scene-00032.mp4\",\"xinhuashe_test_video/video-scene-00033.mp4\",\"video-scene-00034.mp4\"]\n",
    "    texts = ['a person talking', 'a logo', 'a building']\n",
    "    for video_path in demo_videos:\n",
    "        frames, frame_indices, video_duration = read_frames_decord(video_path,8)\n",
    "        frames = model.transform(frames).unsqueeze(0).to(model.device)\n",
    "        # 获得视频特征\n",
    "        with torch.no_grad():\n",
    "            video_feature = model.encode_vision(frames, test=True)\n",
    "            video_features.append(video_feature)\n",
    "    \n",
    "    # # 获得文本特征\n",
    "    text_features = get_text_feature(model, texts)\n",
    "    video_features = torch.cat(video_features, dim=0).to(text_features.dtype).to(config.device)\n",
    "    results = get_top_videos(model, text_features, video_features, demo_videos, texts)\n",
    "\n",
    "\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}