import os
import shutil

import cv2
import gradio as gr
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM
from video_keyframe_detector.cli import keyframeDetection
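# Load TinyLLaVA-Phi-2-SigLIP-3.1B and its tokenizer once at startup.
# trust_remote_code is required because the chat interface ships with the model repo.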
hf_path = 'tinyllava/TinyLLaVA-Phi-2-SigLIP-3.1B'
model = AutoModelForCausalLM.from_pretrained(hf_path, trust_remote_code=True)
config = model.config
tokenizer = AutoTokenizer.from_pretrained(hf_path, use_fast=False, model_max_length=config.tokenizer_model_max_length, padding_side=config.tokenizer_padding_side)


def extract_keyframes(video_path, num_keyframes=12):
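    """Detect keyframes with video_keyframe_detector, then evenly sample
    `num_keyframes` of them into video_frames/<video_id>/frame_<n>.jpg."""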
    video_id = video_path.split('/')[-1].strip().split('.')[0]

    os.makedirs("temp", exist_ok=True)

    keyframeDetection(video_path, "temp", 0.2)
    video_frame_list = sorted(os.listdir(os.path.join("temp", "keyFrames")), key=lambda x: int(x.split('.')[0][8:]))
    os.makedirs(os.path.join("video_frames", video_id), exist_ok=True)
    selected_frame_idx_set = set(np.linspace(1, len(video_frame_list) - 1, num_keyframes).astype(int))
    cnt = 0
    for i in range(len(video_frame_list)):
        if i in selected_frame_idx_set:
            source_file = os.path.join("temp", "keyFrames", video_frame_list[i])
            target_file = os.path.join("video_frames", video_id, f"frame_{cnt}.jpg")
            shutil.copyfile(source_file, target_file)
            cnt += 1
    
    shutil.rmtree("temp", ignore_errors=True)


def concatenate_frames(video_path):
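    """Tile the 12 sampled frames into a 3x4 grid (three rows of four) and save
    it as a single image under concatenated_frames/<video_id>.jpg."""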
    os.makedirs("concatenated_frames", exist_ok=True)
    video_id = video_path.split('/')[-1].strip().split('.')[0]
    image_frame_dir = os.path.join("video_frames", video_id)
    image_frame_list = sorted(os.listdir(image_frame_dir), key=lambda x: int(x.split('.')[0].split('_')[1]))
    img_list = []
    for image_frame in image_frame_list:
        img_frame = cv2.imread(os.path.join(image_frame_dir, image_frame))
        img_list.append(img_frame)

    img_row1 = cv2.hconcat(img_list[:4])
    img_row2 = cv2.hconcat(img_list[4:8])
    img_row3 = cv2.hconcat(img_list[8:12])

    img_v = cv2.vconcat([img_row1, img_row2, img_row3])
    cv2.imwrite(os.path.join("concatenated_frames", f"{video_id}.jpg"), img_v)


def image_parser(args):
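    """Split args.image_file on args.sep into a list of image paths
    (helper not used by the Gradio pipeline)."""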
    out = args.image_file.split(args.sep)
    return out


def generate_video_caption(video_path):
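    """Ask TinyLLaVA to caption the concatenated keyframe grid for this video."""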
    video_id = video_path.split('/')[-1].strip().split('.')[0]

    image_file = os.path.join("concatenated_frames", f"{video_id}.jpg")
    prompt = "In a short sentence, describe the process in the video."
    output_text, generation_time = model.chat(prompt=prompt, image=image_file, tokenizer=tokenizer)

    return output_text


def clean_files_and_folders():
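    """Remove the intermediate frame and grid directories."""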
    shutil.rmtree("concatenated_frames")
    shutil.rmtree("video_frames")


def video_to_text(video_file):
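    """Full pipeline: extract keyframes, build the frame grid, caption it, then clean up."""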
    video_path = video_file.name

    extract_keyframes(video_path)
    concatenate_frames(video_path)
    video_caption = generate_video_caption(video_path)
    clean_files_and_folders()

    return video_caption

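# Minimal Gradio front end: a video file upload in, the generated caption out.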
iface = gr.Interface(
    fn=video_to_text,
    inputs=gr.File(file_types=["video"]),
    outputs="text",
    title="MAMA Video-Text Generation Pipeline",
    description="Upload a video and get a short description. Due to a limited budget, we can only run TinyLLaVA on CPUs, so please only try videos smaller than 1 MB. Thank you so much, and welcome to MAMA!"
)

iface.launch(share=True)