sitammeur committed
Commit 775cb17 · verified · 1 parent: 9fd9702

Update src/model.py

Files changed (1)
  1. src/model.py +63 -62
src/model.py CHANGED
@@ -1,62 +1,63 @@
- # Importing the requirements
- import torch
- from transformers import AutoModel, AutoTokenizer
- import spaces
- from src.utils import encode_video
-
-
- # Device for the model
- device = "cuda"
-
- # Load the model and tokenizer
- model = AutoModel.from_pretrained(
-     "openbmb/MiniCPM-V-2_6",
-     trust_remote_code=True,
-     attn_implementation="sdpa",
-     torch_dtype=torch.bfloat16,
- )
- model = model.to(device=device)
- tokenizer = AutoTokenizer.from_pretrained(
-     "openbmb/MiniCPM-V-2_6", trust_remote_code=True
- )
- model.eval()
-
-
- @spaces.GPU()
- def describe_video(video, question):
-     """
-     Describes a video by generating an answer to a given question.
-
-     Args:
-         - video (str): The path to the video file.
-         - question (str): The question to be answered about the video.
-
-     Returns:
-         str: The generated answer to the question.
-     """
-     # Encode the video frames
-     frames = encode_video(video)
-
-     # Message format for the model
-     msgs = [{"role": "user", "content": frames + [question]}]
-
-     # Set decode params for video
-     params = {
-         "use_image_id": False,
-         "max_slice_nums": 1,  # Use 1 if CUDA OOM and video resolution > 448*448
-     }
-
-     # Generate the answer
-     answer = model.chat(
-         image=None,
-         msgs=msgs,
-         tokenizer=tokenizer,
-         sampling=True,
-         temperature=0.7,
-         stream=True,
-         system_prompt="You are an AI assistant specialized in visual content analysis. Given a video and a related question, analyze the video thoroughly and provide a precise and informative answer based on the visible content. Ensure your response is clear, accurate, and directly addresses the question.",
-         **params
-     )
-
-     # Return the answer
-     return answer
+ # Importing the requirements
+ import torch
+ from transformers import AutoModel, AutoTokenizer
+ import spaces
+ from src.utils import encode_video
+
+
+ # Device for the model
+ device = "cuda"
+
+ # Load the model and tokenizer
+ model = AutoModel.from_pretrained(
+     "openbmb/MiniCPM-V-2_6",
+     trust_remote_code=True,
+     attn_implementation="sdpa",
+     torch_dtype=torch.bfloat16,
+ )
+ model = model.to(device=device)
+ tokenizer = AutoTokenizer.from_pretrained(
+     "openbmb/MiniCPM-V-2_6", trust_remote_code=True
+ )
+ model.eval()
+
+
+ @spaces.GPU()
+ def describe_video(video, question):
+     """
+     Describes a video by generating an answer to a given question.
+
+     Args:
+         - video (str): The path to the video file.
+         - question (str): The question to be answered about the video.
+
+     Returns:
+         str: The generated answer to the question.
+     """
+     # Encode the video frames
+     frames = encode_video(video)
+     frames = list(frames)  # Convert generator or any iterable to list
+
+     # Message format for the model
+     msgs = [{"role": "user", "content": frames + [question]}]
+
+     # Set decode params for video
+     params = {
+         "use_image_id": False,
+         "max_slice_nums": 1,  # Use 1 if CUDA OOM and video resolution > 448*448
+     }
+
+     # Generate the answer
+     answer = model.chat(
+         image=None,
+         msgs=msgs,
+         tokenizer=tokenizer,
+         sampling=True,
+         temperature=0.7,
+         stream=True,
+         system_prompt="You are an AI assistant specialized in visual content analysis. Given a video and a related question, analyze the video thoroughly and provide a precise and informative answer based on the visible content. Ensure your response is clear, accurate, and directly addresses the question.",
+         **params
+     )
+
+     # Return the answer
+     return answer
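
For context, the one functional change in this diff is the added frames = list(frames) line. msgs is built with frames + [question], which requires frames to support list concatenation; if encode_video returns a generator (an assumption here, since its implementation in src/utils.py is not shown), the + operator raises a TypeError before any frame is consumed. A minimal sketch of the failure mode, using a hypothetical stand-in for encode_video:

def fake_encode_video(path):
    # Hypothetical stand-in for src.utils.encode_video, assumed to yield frames lazily
    yield from ("frame-0", "frame-1", "frame-2")

frames = fake_encode_video("clip.mp4")
# frames + ["What is happening?"]  # TypeError: generators don't support +
frames = list(frames)  # the committed fix: materialize the iterable first
msgs = [{"role": "user", "content": frames + ["What is happening?"]}]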
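
Separately, because model.chat is called with stream=True, the value returned by describe_video is produced incrementally (a generator of text chunks in the MiniCPM-V examples) rather than the single str the docstring suggests. A hypothetical usage sketch for draining the stream, with a placeholder path and question:

response = ""
for chunk in describe_video("video.mp4", "Describe the main action."):
    response += chunk
    print(chunk, end="", flush=True)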