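# Gradio demo (Hugging Face Space) for eP-ALM video captioning on MSR-VTT,
# pairing an OPT-2.7B language model with a TimeSformer video encoder.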
import os
import sys

# Install the bundled TimeSformer package and make it importable
os.system('cd TimeSformer; pip install .; cd ..')
# Debug: inspect the working directory of the Space
os.system('ls -l')
os.system('pwd')
sys.path.append("/home/user/app/TimeSformer/")
import json

import torch
from torchvision import transforms
from transformers import AutoTokenizer
from PIL import Image
from ruamel.yaml import YAML
import gradio as gr

from models.epalm import ePALM

yaml = YAML(typ='safe')
use_cuda = torch.cuda.is_available()
device = torch.device('cuda') if use_cuda else torch.device('cpu')
device_type = 'cuda' if use_cuda else 'cpu'
## Load model
### Captioning
config = 'configs/video/ePALM_video_caption_msrvtt.yaml'
config = yaml.load(open(config, 'r'))

text_model = 'facebook/opt-2.7b'
vision_model_name = 'timesformer'
start_layer_idx = 19
end_layer_idx = 31
low_cpu = True
MODEL = ePALM(
    opt_model_name=text_model,
    vision_model_name=vision_model_name,
    use_vis_prefix=True,
    start_layer_idx=start_layer_idx,
    end_layer_idx=end_layer_idx,
    return_hidden_state_vision=True,
    config=config,
    low_cpu=low_cpu,
)
print("Model Built")
MODEL.to(device)

checkpoint_path = 'checkpoints/float32/ePALM_video_caption_msrvtt/checkpoint_best.pth'
checkpoint = torch.load(checkpoint_path, map_location='cpu')
state_dict = checkpoint['model']
msg = MODEL.load_state_dict(state_dict, strict=False)
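# Cast the weights to bfloat16; generation below runs under torch.autocast
# with dtype=torch.bfloat16, since float16 mixed precision is not supported
# on CPU (this Space runs on CPU only).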
MODEL.bfloat16()
## Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(text_model, use_fast=False)
eos_token = tokenizer.eos_token
pad_token = tokenizer.pad_token

special_answer_token = '</a>'
special_tokens_dict = {'additional_special_tokens': [special_answer_token]}
tokenizer.add_special_tokens(special_tokens_dict)
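# Note: the '</a>' answer token is only consumed by the non-captioning branch
# of inference() below; the captioning branch splits the output on '</s>'.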
image_size = 224
normalize = transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
type_transform = transforms.Lambda(lambda x: x.float().div(255.0))
test_transform = transforms.Compose([
    transforms.Resize((image_size, image_size), interpolation=Image.BICUBIC),
    type_transform,
    normalize,
])
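# The normalization constants above are the standard CLIP image mean/std.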
from dataset.video_utils import VIDEO_READER_FUNCS
video_reader = VIDEO_READER_FUNCS['decord']

def read_video(path, num_frames=16):
    # Sample num_frames frames from the video with the decord reader
    frames, frame_indices, video_duration = video_reader(
        path, num_frames, 'rand', max_num_frames=-1
    )
    video = test_transform(frames)
    # Swap the first two dimensions and add a batch dimension for the model
    return video.permute(1, 0, 2, 3).unsqueeze(0)
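# Rough sanity check (assuming the decord reader returns frames as (T, C, H, W),
# as in the eP-ALM repo): read_video('examples/videos/video7014.mp4') should give
# a tensor of shape (1, 3, 16, 224, 224), i.e. (batch, channels, frames, H, W).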
do_sample = False
num_beams = 5
max_length = 30

def inference(image, task_type, instruction):
    # Note: the instruction text is not used by the captioning branch
    if task_type == 'Video Captioning':
        text = ['']
        text_input = tokenizer(text, padding='longest', return_tensors="pt").to(device)
        model = MODEL
    else:
        raise NotImplementedError

    image = read_video(image)

    with torch.autocast(device_type=device_type, dtype=torch.bfloat16, enabled=True):
        out = model(image=image, text=text_input, mode='generate', return_dict=True,
                    max_length=max_length, do_sample=do_sample, num_beams=num_beams)

    if 'Captioning' in task_type:
        for i, o in enumerate(out):
            res = tokenizer.decode(o)
            response = res.split('</s>')[1].replace(pad_token, '').replace('</s>', '').replace(eos_token, '')  # skip_special_tokens=True
    else:
        for o in out:
            o_list = o.tolist()
            response = tokenizer.decode(o_list).split(special_answer_token)[1].replace(pad_token, '').replace('</s>', '').replace(eos_token, '')  # skip_special_tokens=True

    return response
inputs = [
    gr.Video(source="upload", type="filepath"),
    gr.Radio(choices=['Video Captioning'], type="value", value="Video Captioning", label="Task"),
    gr.Textbox(lines=1, label="Instruction"),
]
outputs = ['text']
examples = [
    ['examples/videos/video7014.mp4', 'Video Captioning', None],
    ['examples/videos/video7017.mp4', 'Video Captioning', None],
    ['examples/videos/video7019.mp4', 'Video Captioning', None],
    ['examples/videos/video7021.mp4', 'Video Captioning', None],
]
title = "eP-ALM for Video-Text tasks"
description = "Gradio Demo for eP-ALM. This demo uses the 2.7B OPT model. Because the model runs on CPU and float16 mixed precision is not supported on CPU, generation can take up to 2 minutes."
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2303.11403' target='_blank'>Paper</a> | <a href='https://github.com/mshukor/eP-ALM' target='_blank'>Github Repo</a></p>"
io = gr.Interface(fn=inference, inputs=inputs, outputs=outputs,
                  title=title, description=description, article=article,
                  examples=examples, cache_examples=False)
io.launch()