Spaces:

sitammeur
/

VidiQA

Running on Zero

App Files Files Community

sitammeur committed on Aug 19, 2024

Commit

775cb17

verified ·

1 Parent(s): 9fd9702

Update src/model.py

Browse files

Files changed (1) hide show

src/model.py +63 -62

src/model.py CHANGED Viewed

@@ -1,62 +1,63 @@
-# Importing the requirements
-import torch
-from transformers import AutoModel, AutoTokenizer
-import spaces
-from src.utils import encode_video
-# Device for the model
-device = "cuda"
-# Load the model and tokenizer
-model = AutoModel.from_pretrained(
-    "openbmb/MiniCPM-V-2_6",
-    trust_remote_code=True,
-    attn_implementation="sdpa",
-    torch_dtype=torch.bfloat16,
-)
-model = model.to(device=device)
-tokenizer = AutoTokenizer.from_pretrained(
-    "openbmb/MiniCPM-V-2_6", trust_remote_code=True
-)
-model.eval()
-@spaces.GPU()
-def describe_video(video, question):
-    """
-    Describes a video by generating an answer to a given question.
-    Args:
-        - video (str): The path to the video file.
-        - question (str): The question to be answered about the video.
-    Returns:
-        str: The generated answer to the question.
-    """
-    # Encode the video frames
-    frames = encode_video(video)
-    # Message format for the model
-    msgs = [{"role": "user", "content": frames + [question]}]
-    # Set decode params for video
-    params = {
-        "use_image_id": False,
-        "max_slice_nums": 1,  # Use 1 if CUDA OOM and video resolution > 448*448
-    }
-    # Generate the answer
-    answer = model.chat(
-        image=None,
-        msgs=msgs,
-        tokenizer=tokenizer,
-        sampling=True,
-        temperature=0.7,
-        stream=True,
-        system_prompt="You are an AI assistant specialized in visual content analysis. Given a video and a related question, analyze the video thoroughly and provide a precise and informative answer based on the visible content. Ensure your response is clear, accurate, and directly addresses the question.",
-        **params
-    )
-    # Return the answer
-    return answer

+# Importing the requirements
+import torch
+from transformers import AutoModel, AutoTokenizer
+import spaces
+from src.utils import encode_video
+# Device for the model (hard-coded to CUDA; this module defines no CPU fallback)
+device = "cuda"
+# Load the model and tokenizer once at import time; both come from the same "openbmb/MiniCPM-V-2_6" checkpoint
+model = AutoModel.from_pretrained(
+    "openbmb/MiniCPM-V-2_6",
+    trust_remote_code=True,
+    attn_implementation="sdpa",
+    torch_dtype=torch.bfloat16,
+)
+model = model.to(device=device)
+tokenizer = AutoTokenizer.from_pretrained(
+    "openbmb/MiniCPM-V-2_6", trust_remote_code=True
+)
+model.eval()
+@spaces.GPU()
+def describe_video(video, question):
+    """
+    Answers a question about a video using the module-level model and tokenizer.
+    Args:
+        - video (str): The path to the video file; passed as-is to encode_video.
+        - question (str): The question to be answered; appended after the frames in the user message.
+    Returns:
+        The value returned by model.chat. NOTE(review): stream=True is passed, so this may be a stream of text chunks rather than a str - confirm.
+    """
+    # Encode the video frames (encode_video is imported from src.utils; its return type is not visible here)
+    frames = encode_video(video)
+    frames = list(frames)  # Convert generator or any iterable to list so it can be concatenated with [question] below
+    # Message format for the model: one user turn whose content is the frames followed by the question
+    msgs = [{"role": "user", "content": frames + [question]}]
+    # Set decode params for video; forwarded to model.chat through **params
+    params = {
+        "use_image_id": False,
+        "max_slice_nums": 1,  # Use 1 if CUDA OOM and video resolution > 448*448
+    }
+    # Generate the answer (sampling at temperature 0.7, streaming requested)
+    answer = model.chat(
+        image=None,
+        msgs=msgs,
+        tokenizer=tokenizer,
+        sampling=True,
+        temperature=0.7,
+        stream=True,
+        system_prompt="You are an AI assistant specialized in visual content analysis. Given a video and a related question, analyze the video thoroughly and provide a precise and informative answer based on the visible content. Ensure your response is clear, accurate, and directly addresses the question.",
+        **params
+    )
+    # Return the model.chat result unchanged
+    return answer