# Based on https://github.com/haotian-liu/LLaVA.
import os
import json
import math
import torch
import argparse
from tqdm import tqdm
from decord import VideoReader, cpu
from llama_vstream.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from llama_vstream.conversation import conv_templates, SeparatorStyle
from llama_vstream.model.builder import load_pretrained_model
from llama_vstream.utils import disable_torch_init
from llama_vstream.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria
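
# Note: this script assumes the llama_vstream package (from the LLaMA-VStream
# codebase, itself derived from LLaVA) and decord, which provides efficient
# random-access video frame decoding.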

def split_list(lst, n):
    """Split a list into n (roughly) equal-sized chunks."""
    chunk_size = math.ceil(len(lst) / n)  # ceiling division, so no element is dropped
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]


def get_chunk(lst, n, k):
    """Return the k-th of n chunks of lst."""
    chunks = split_list(lst, n)
    return chunks[k]
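
# Example: split_list(list(range(10)), 3) -> [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]
# (chunk_size = ceil(10/3) = 4). get_chunk(lst, n, k) then selects shard k, so n
# worker processes can each evaluate a disjoint slice of the dataset.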

def parse_args():
    """Parse command-line arguments."""
    parser = argparse.ArgumentParser()
    # Define the command-line arguments
    parser.add_argument('--video_dir', help='Directory containing video files.', required=True)
    parser.add_argument('--gt_file', help='Path to the ground truth file containing questions.', required=True)
    parser.add_argument('--output_dir', help='Directory to save the model results JSON.', required=True)
    parser.add_argument('--output_name', help='Name of the file for storing results JSON.', required=True)
    parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
    parser.add_argument("--model-base", type=str, default=None)
    parser.add_argument("--conv-mode", type=str, default=None)
    parser.add_argument("--num-chunks", type=int, default=1)
    parser.add_argument("--chunk-idx", type=int, default=0)
    parser.add_argument("--model-max-length", type=int, default=None)
    return parser.parse_args()
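
# Note: argparse normalizes dashes, so "--model-path" is read back as
# args.model_path; the mixed --snake_case/--dash-case flag names above mirror
# the upstream LLaVA evaluation scripts this file is based on.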

def load_video(video_path):
    """Decode a video, sampling roughly one frame per second."""
    vr = VideoReader(video_path, ctx=cpu(0))
    total_frame_num = len(vr)
    fps = round(vr.get_avg_fps())
    frame_idx = [i for i in range(0, total_frame_num, fps)]
    sampled_frames = vr.get_batch(frame_idx).asnumpy()
    return sampled_frames
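
# decord's get_batch(...).asnumpy() yields a uint8 array of shape
# (num_frames, height, width, 3), which image_processor.preprocess below
# treats as a batch of images.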

def run_inference(args):
    """
    Run inference on the ActivityNet-QA dataset using the LLaMA-VStream model.

    Args:
        args: Command-line arguments.
    """
    # Initialize the model
    model_name = get_model_name_from_path(args.model_path)
    tokenizer, model, image_processor, context_len = load_pretrained_model(
        args.model_path, args.model_base, model_name, args.model_max_length)

    # Load the ground truth file containing questions and answers,
    # keeping only this worker's shard of the dataset
    with open(args.gt_file) as file:
        gt_questions = json.load(file)
    gt_questions = get_chunk(gt_questions, args.num_chunks, args.chunk_idx)

    # Create the output directory if it doesn't exist
    if not os.path.exists(args.output_dir):
        try:
            os.makedirs(args.output_dir)
        except Exception as e:
            print(f'mkdir failed: {e}')

    video_formats = ['.mp4', '.avi', '.mov', '.mkv']

    if args.num_chunks > 1:
        output_name = f"{args.num_chunks}_{args.chunk_idx}"
    else:
        output_name = args.output_name
    answers_file = os.path.join(args.output_dir, f"{output_name}.json")
    # Predictions are written as one JSON object per line (JSON Lines)
    ans_file = open(answers_file, "w")

    for sample in tqdm(gt_questions, desc=f"cuda:{args.chunk_idx} "):
        video_name = sample['video_id']
        question = sample['question']
        sample_id = sample['id']
        answer = sample['answer']
        sample_set = {'id': sample_id, 'question': question, 'answer': answer}

        # Locate the video file, trying each supported container format
        video_path = None
        for fmt in video_formats:
            temp_path = os.path.join(args.video_dir, f"{video_name}{fmt}")
            if os.path.exists(temp_path):
                video_path = temp_path
                break

        # Skip samples whose video is missing; without the reset above, a stale
        # video_path from a previous iteration could silently be reused
        if video_path is not None:
            video = load_video(video_path)
            video = image_processor.preprocess(video, return_tensors='pt')['pixel_values'].half().cuda()
            video = [video]

            # Prepend the image token(s) so the prompt matches the model's
            # multimodal input format
            qs = question
            if model.config.mm_use_im_start_end:
                qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
            else:
                qs = DEFAULT_IMAGE_TOKEN + '\n' + qs

            # Build the chat prompt from the selected conversation template
            conv = conv_templates[args.conv_mode].copy()
            conv.append_message(conv.roles[0], qs)
            conv.append_message(conv.roles[1], None)
            prompt = conv.get_prompt()

            input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
            # Stop generation at the template's separator for the assistant turn
            stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
            keywords = [stop_str]
            stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)

            with torch.inference_mode():
                output_ids = model.generate(
                    input_ids,
                    images=video,
                    do_sample=True,
                    temperature=0.002,
                    max_new_tokens=1024,
                    use_cache=True,
                    stopping_criteria=[stopping_criteria])

            # Strip the prompt tokens from the output before decoding
            input_token_len = input_ids.shape[1]
            n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
            if n_diff_input_output > 0:
                print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
            outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
            outputs = outputs.strip()
            if outputs.endswith(stop_str):
                outputs = outputs[:-len(stop_str)]
            outputs = outputs.strip()

            sample_set['pred'] = outputs
            ans_file.write(json.dumps(sample_set) + "\n")
            ans_file.flush()

    ans_file.close()


if __name__ == "__main__":
    args = parse_args()
    run_inference(args)
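
# Example launch (script name, paths, and model names are placeholders):
#   python run_inference.py \
#       --video_dir ./videos --gt_file ./test_qa.json \
#       --output_dir ./results --output_name preds \
#       --model-path <path-or-hub-id> --conv-mode <template-name>
# To shard across GPUs, start one process per device with
# --num-chunks <N> --chunk-idx <k> (k = 0..N-1); each writes "<N>_<k>.json".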