sitammeur committed on
Commit
bfaf6f1
·
verified ·
1 Parent(s): 2faf7d0

Upload 7 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ videos/sample_video_1.mp4 filter=lfs diff=lfs merge=lfs -text
37
+ videos/sample_video_3.mp4 filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Importing the requirements
2
+ # import warnings
3
+
4
+ # warnings.filterwarnings("ignore")
5
+
6
+ import gradio as gr
7
+ from src.model import describe_video
8
+
9
+
10
# Video and text inputs for the interface.
# NOTE: gr.Video does not define a `type` option, so none is passed here;
# the callback's docstring expects the video as a file path.
video = gr.Video(label="Video")
query = gr.Textbox(label="Query", placeholder="Type your query here")

# Output for the interface
response = gr.Textbox(label="Response", show_label=True, show_copy_button=True)

# Examples for the interface: each entry is [video path, question]
examples = [
    [
        "./videos/sample_video_1.mp4",
        "Here are some frames of a video. Describe this video in detail",
    ],
    [
        "./videos/sample_video_2.mp4",
        "Which are the animals in this video, and how many are there?",
    ],
    ["./videos/sample_video_3.mp4", "What is happening in this video?"],
]

# Title, description, and article for the interface.
# The article footer links to the paper and model page of the model that the
# description names (MiniCPM-V-2_6), so the page credits the right model.
title = "Video Understanding & Question Answering"
description = "This Gradio demo uses the MiniCPM-V-2_6 model for video understanding tasks. Upload a video and type a question to get a detailed description or specific information from the video."
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2408.01800' target='_blank'>MiniCPM-V: A GPT-4V Level MLLM on Your Phone</a> | <a href='https://huggingface.co/openbmb/MiniCPM-V-2_6' target='_blank'>Model Page</a></p>"


# Build and launch the interface
interface = gr.Interface(
    fn=describe_video,
    inputs=[video, query],
    outputs=response,
    examples=examples,
    title=title,
    description=description,
    article=article,
    theme="Soft",
    allow_flagging="never",
)
interface.launch(debug=False)
src/__init__.py ADDED
File without changes
src/model.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Importing the requirements
2
+ import torch
3
+ from transformers import AutoModel, AutoTokenizer
4
+ import spaces
5
+ from src.utils import encode_video
6
+
7
+
8
# Target device used for inference
device = "cuda"

# Instantiate the MiniCPM-V-2_6 checkpoint in bfloat16 with SDPA attention
# and move it onto the target device in one expression
model = AutoModel.from_pretrained(
    "openbmb/MiniCPM-V-2_6",
    trust_remote_code=True,
    attn_implementation="sdpa",
    torch_dtype=torch.bfloat16,
).to(device=device)

# Tokenizer that belongs to the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(
    "openbmb/MiniCPM-V-2_6",
    trust_remote_code=True,
)

# Put the model in evaluation mode
model.eval()
23
+
24
+
25
@spaces.GPU()
def describe_video(video, question):
    """
    Describes a video by generating an answer to a given question.

    Args:
        - video (str): The path to the video file.
        - question (str): The question to be answered about the video.

    Returns:
        str: The generated answer to the question.
    """
    # Encode the video frames
    frames = encode_video(video)

    # Message format for the model: the sampled frames followed by the question
    msgs = [{"role": "user", "content": frames + [question]}]

    # Set decode params for video
    params = {
        "use_image_id": False,
        "max_slice_nums": 1,  # Use 1 if CUDA OOM and video resolution > 448*448
    }

    # Generate the answer; with stream=True the model yields text chunks
    # instead of returning a finished string
    chunks = model.chat(
        image=None,
        msgs=msgs,
        tokenizer=tokenizer,
        sampling=True,
        temperature=0.7,
        stream=True,
        system_prompt="You are an AI assistant specialized in visual content analysis. Given a video and a related question, analyze the video thoroughly and provide a precise and informative answer based on the visible content. Ensure your response is clear, accurate, and directly addresses the question.",
        **params,
    )

    # Collect the streamed chunks so the caller receives the complete answer
    # as a plain string rather than a generator object
    return "".join(chunks)
src/utils.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Importing the requirements
2
+ from PIL import Image
3
+ from decord import VideoReader, cpu
4
+
5
+
6
# Maximum number of frames to use
MAX_NUM_FRAMES = 64  # If CUDA OOM, set a smaller number


def _uniform_sample(items, n):
    """
    Uniformly samples elements from a list.

    The list is split into ``n`` equal-width segments and the element at the
    midpoint of each segment is picked.

    Args:
        - items (list): The input list.
        - n (int): The number of elements to sample.

    Returns:
        list: A list of sampled elements.
    """
    gap = len(items) / n
    return [items[int(i * gap + gap / 2)] for i in range(n)]


def encode_video(video_path):
    """
    Encodes a video file into a list of frames.

    Roughly one frame per second of video is selected; if that still yields
    more than MAX_NUM_FRAMES frames, they are uniformly thinned to that limit.

    Args:
        video_path (str): The path to the video file.

    Returns:
        list: A list of frames, where each frame is represented as an Image object.
    """
    # Read the video file
    vr = VideoReader(video_path, ctx=cpu(0))

    # Stride of about one second of video. Clamp to at least 1 because the
    # rounded average FPS is 0 for very low frame rates, and range() rejects
    # a step of zero.
    sample_fps = max(1, round(vr.get_avg_fps() / 1))
    frame_idx = list(range(0, len(vr), sample_fps))

    # Uniformly sample frames if the number of frames is too large
    if len(frame_idx) > MAX_NUM_FRAMES:
        frame_idx = _uniform_sample(frame_idx, MAX_NUM_FRAMES)

    # Extract the selected frames and convert each one to a PIL image
    frames = vr.get_batch(frame_idx).asnumpy()
    return [Image.fromarray(frame.astype("uint8")) for frame in frames]
videos/sample_video_1.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b277543103dc6b706cdc2b5007085e8eef0c6a9bdc39633e2af31828d7bd98e4
3
+ size 2511799
videos/sample_video_2.mp4 ADDED
Binary file (826 kB). View file
 
videos/sample_video_3.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e242b33923dd63ffb2fda6d6853f7ec8ad17207e6221b5467a540159fa1e5c06
3
+ size 2104032