|
import argparse |
|
import csv |
|
import os |
|
|
|
import requests |
|
import tqdm |
|
|
|
from .utils import extract_frames, prompts, read_video_list |
|
|
|
|
|
def get_caption(frame, prompt, api_key): |
|
headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"} |
|
payload = { |
|
"model": "gpt-4-vision-preview", |
|
"messages": [ |
|
{ |
|
"role": "user", |
|
"content": [ |
|
{ |
|
"type": "text", |
|
"text": prompt, |
|
}, |
|
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame[0]}"}}, |
|
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame[1]}"}}, |
|
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame[2]}"}}, |
|
], |
|
} |
|
], |
|
"max_tokens": 300, |
|
} |
|
response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload, timeout=60) |
|
caption = response.json()["choices"][0]["message"]["content"] |
|
caption = caption.replace("\n", " ") |
|
return caption |
|
|
|
|
|
def main(args): |
|
|
|
|
|
|
|
videos = read_video_list(args.video_folder, args.output_file) |
|
f = open(args.output_file, "a") |
|
writer = csv.writer(f) |
|
|
|
|
|
|
|
|
|
for video in tqdm.tqdm(videos): |
|
video_path = os.path.join(args.video_folder, video) |
|
frame, length = extract_frames(video_path, base_64=True) |
|
if len(frame) < 3: |
|
continue |
|
|
|
prompt = prompts[args.prompt] |
|
caption = get_caption(frame, prompt, args.key) |
|
|
|
writer.writerow((video, caption, length)) |
|
f.close() |
|
|
|
|
|
if __name__ == "__main__": |
|
parser = argparse.ArgumentParser() |
|
parser.add_argument("video_folder", type=str) |
|
parser.add_argument("output_file", type=str) |
|
parser.add_argument("--prompt", type=str, default="three_frames") |
|
parser.add_argument("--key", type=str) |
|
args = parser.parse_args() |
|
|
|
main(args) |
|
|