AskYoutube committed on
Commit
e3b47db
·
1 Parent(s): f251ff3

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +10 -5
README.md CHANGED
@@ -10,21 +10,26 @@ VideoCLIP uses a Video Q-Former to aggregate frame-level embeddings temporally i
10
  # Usage
11
 
12
  ```
13
- # Load model
14
  import video_clip
15
  eval_config = 'eval_configs/video_clip.yaml'
16
  model, vis_processor = video_clip.load_model(eval_config)
17
 
18
- # Compute video embeddings
 
 
19
  video_embs = video_clip.get_all_video_embeddings(videos, model, vis_processor)
20
 
21
- # Compute Video-Text similarity
 
22
  v2t_sim = video_clip.compute_sim(model, texts, video_embs)
23
 
24
- # Compute Text-Video similarity
 
25
  t2v_sim = v2t_sim.T
26
 
27
- # Compute Video-Video distance
 
28
  v2v_dists = video_clip.compute_dist_videoq(model, video_embs[0], video_embs)
29
  ```
30
 
 
10
  # Usage
11
 
12
  ```
13
+ # Load model.
14
  import video_clip
15
  eval_config = 'eval_configs/video_clip.yaml'
16
  model, vis_processor = video_clip.load_model(eval_config)
17
 
18
+ # Compute video embeddings.
19
+ # video_embs: float tensor of shape [num_videos, clip_dim_size, query_tokens] containing VideoCLIP embeddings.
20
+ # In this model, clip_dim_size=1024 and query_tokens=32.
21
  video_embs = video_clip.get_all_video_embeddings(videos, model, vis_processor)
22
 
23
+ # Compute Video-Text similarity.
24
+ # v2t_sim: float matrix of size [num_videos, num_texts] indicating similarity.
25
  v2t_sim = video_clip.compute_sim(model, texts, video_embs)
26
 
27
+ # Compute Text-Video similarity.
28
+ # t2v_sim: float matrix of size [num_texts, num_videos] indicating similarity.
29
  t2v_sim = v2t_sim.T
30
 
31
+ # Compute Video-Video distance.
32
+ # v2v_dists: float row vector of shape [1, num_videos] giving the distance from the query video embedding (video_embs[0]) to each video embedding.
33
  v2v_dists = video_clip.compute_dist_videoq(model, video_embs[0], video_embs)
34
  ```
35