Spaces:

tpha4308
/

video-qa

Sleeping

video-qa / utils.py

Thao Pham

add utils function for transcribe

f33d0ed 3 months ago

4.67 kB

	import os
	from io import StringIO, BytesIO
	from typing import Iterator, TextIO, List, Dict, Any, Optional, Sequence, Union
	import base64
	import glob
	from tqdm import tqdm
	from pytubefix import YouTube, Stream
	import cv2
	import json
	import textwrap


	# Helper function that converts a time in seconds to the timestamp format used in .vtt or .srt files
	def format_timestamp(seconds: float, always_include_hours: bool = False, fractionalSeperator: str = '.'):
	    """Render a duration in seconds as a ``[HH:]MM:SS<sep>mmm`` timestamp string.

	    Args:
	        seconds: Non-negative duration in seconds.
	        always_include_hours: Emit the hour field even when it is zero.
	        fractionalSeperator: Text placed between the seconds field and the
	            three-digit millisecond field.

	    Returns:
	        The formatted timestamp. The hour field is omitted when it is zero
	        unless ``always_include_hours`` is set.
	    """
	    assert seconds >= 0, "non-negative timestamp expected"

	    # Work in whole milliseconds so the split below is pure integer arithmetic.
	    total_ms = round(seconds * 1000.0)
	    whole_hours, remainder = divmod(total_ms, 3_600_000)
	    whole_minutes, remainder = divmod(remainder, 60_000)
	    whole_seconds, millis = divmod(remainder, 1_000)

	    if always_include_hours or whole_hours > 0:
	        hour_prefix = f"{whole_hours:02d}:"
	    else:
	        hour_prefix = ""
	    return f"{hour_prefix}{whole_minutes:02d}:{whole_seconds:02d}{fractionalSeperator}{millis:03d}"


	def _processText(text: str, maxLineWidth=None):
	if (maxLineWidth is None or maxLineWidth < 0):
	return text

	lines = textwrap.wrap(text, width=maxLineWidth, tabsize=4)
	return '\n'.join(lines)

	# Helper function that writes transcripts generated by Whisper to a .vtt file
	def write_vtt(transcript: Iterator[dict], file: TextIO, maxLineWidth=None):
	    """Write transcript segments to ``file`` as a WebVTT document.

	    Args:
	        transcript: Iterable of segment dicts; each is read for its
	            ``'start'``, ``'end'`` and ``'text'`` keys.
	        file: Writable text stream that receives the output.
	        maxLineWidth: Forwarded to ``_processText`` to wrap the cue text.
	    """
	    # Header line followed by a blank line.
	    print("WEBVTT\n", file=file)

	    for cue in transcript:
	        wrapped = _processText(cue['text'], maxLineWidth)
	        # Replace any "-->" in the caption so it cannot be mistaken for the
	        # cue-timing arrow written on the line above it.
	        caption = wrapped.replace('-->', '->')
	        cue_start = format_timestamp(cue['start'])
	        cue_end = format_timestamp(cue['end'])

	        print(
	            f"{cue_start} --> {cue_end}\n"
	            f"{caption}\n",
	            file=file,
	            flush=True,
	        )

	# Taken from the course: https://www.deeplearning.ai/short-courses/multimodal-rag-chat-with-videos/
	def getSubs(segments: Iterator[dict], format: str, maxLineWidth: int=-1) -> str:
	    """Render transcript segments as subtitle text in the requested format.

	    Args:
	        segments: Iterable of segment dicts, passed through to ``write_vtt``.
	        format: Output format name; only ``'vtt'`` is handled.
	        maxLineWidth: Line-wrap width forwarded to ``write_vtt``.

	    Returns:
	        The complete subtitle document as a string.

	    Raises:
	        Exception: If ``format`` is anything other than ``'vtt'``.
	    """
	    if format != 'vtt':
	        raise Exception("Unknown format " + format)

	    # Collect the writer's output in memory and hand it back as one string.
	    buffer = StringIO()
	    write_vtt(segments, file=buffer, maxLineWidth=maxLineWidth)
	    return buffer.getvalue()

	def download_video(video_url, path='/tmp/'):
	    """Return a local path for a video, downloading it from YouTube if needed.

	    Behaviour, in order:
	      * If ``video_url`` does not start with ``'http'`` it is treated as a
	        file name inside ``path``; the joined path is returned without
	        checking that the file exists.
	      * If ``path`` already contains any ``.mp4`` file, the first one found
	        is returned and nothing is downloaded.
	      * Otherwise a progressive MP4 stream is downloaded into ``path``
	        (720p when available, else the highest available resolution).

	    Args:
	        video_url: A URL, or a file name relative to ``path``.
	        path: Directory used to look up and store video files.

	    Returns:
	        Path to the video file.

	    Raises:
	        RuntimeError: If the video offers no progressive MP4 stream.
	    """
	    print(f'Getting video information for {video_url}')
	    if not video_url.startswith('http'):
	        return os.path.join(path, video_url)

	    # NOTE(review): any existing .mp4 in `path` is reused, whether or not it
	    # belongs to `video_url` -- confirm this caching behaviour is intended.
	    existing = glob.glob(os.path.join(path, '*.mp4'))
	    if existing:
	        return existing[0]

	    def progress_callback(stream: Stream, data_chunk: bytes, bytes_remaining: int) -> None:
	        # `pbar` is bound in the enclosing scope before stream.download() runs.
	        pbar.update(len(data_chunk))

	    yt = YouTube(video_url, on_progress_callback=progress_callback)
	    stream = yt.streams.filter(progressive=True, file_extension='mp4', res='720p').desc().first()
	    if stream is None:
	        stream = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first()
	    if stream is None:
	        # Fail with a clear message instead of an AttributeError on None below.
	        raise RuntimeError(f'No progressive MP4 stream available for {video_url}')

	    # exist_ok avoids the check-then-create race of exists() + makedirs().
	    os.makedirs(path, exist_ok=True)

	    filepath = os.path.join(path, stream.default_filename)
	    if not os.path.exists(filepath):
	        print('Downloading video from YouTube...')
	        # The context manager closes the progress bar even if the download fails.
	        with tqdm(desc='Downloading video from YouTube', total=stream.filesize, unit="bytes") as pbar:
	            stream.download(path)
	    return filepath

	# A helper function that converts a time written as a string in `webvtt` format into a time in milliseconds
	def str2time(strtime):
	    """Convert a WebVTT-style timestamp string to milliseconds.

	    Accepts ``HH:MM:SS.mmm`` as well as the hour-less ``MM:SS.mmm`` form that
	    ``format_timestamp`` produces when the hour field is zero. Surrounding
	    double quotes are ignored.

	    Args:
	        strtime: Timestamp string, optionally wrapped in ``"`` characters.

	    Returns:
	        The timestamp as a float number of milliseconds.

	    Raises:
	        ValueError: If the string does not have two or three numeric,
	            colon-separated fields.
	    """
	    # Strip the character " if it exists.
	    strtime = strtime.strip('"')

	    fields = [float(c) for c in strtime.split(':')]
	    if len(fields) == 2:
	        # Hour-less form: treat the missing hour field as zero.
	        fields.insert(0, 0.0)
	    if len(fields) != 3:
	        raise ValueError(f"invalid timestamp: {strtime!r}")
	    hrs, mins, seconds = fields

	    # 3600 seconds per hour, 60 seconds per minute.
	    total_seconds = hrs * 3600 + mins * 60 + seconds
	    total_miliseconds = total_seconds * 1000
	    return total_miliseconds

	# Resizes an image while maintaining its aspect ratio
	def maintain_aspect_ratio_resize(image, width=None, height=None, inter=cv2.INTER_AREA):
	    """Resize ``image`` to a target width or height, keeping its aspect ratio.

	    Args:
	        image: Image whose ``shape[:2]`` is read as ``(height, width)``.
	        width: Target width. When given it takes precedence over ``height``.
	        height: Target height, used only when ``width`` is ``None``.
	        inter: Interpolation flag passed to ``cv2.resize``.

	    Returns:
	        The original image when neither ``width`` nor ``height`` is given,
	        otherwise the result of ``cv2.resize``.
	    """
	    src_h, src_w = image.shape[:2]

	    # Nothing requested: hand the input back untouched.
	    if width is None and height is None:
	        return image

	    if width is not None:
	        # Scale by the width ratio and derive the matching height.
	        scale = width / float(src_w)
	        target_size = (width, int(src_h * scale))
	    else:
	        # Only the height was given: scale by the height ratio instead.
	        scale = height / float(src_h)
	        target_size = (int(src_w * scale), height)

	    return cv2.resize(image, target_size, interpolation=inter)

	def load_json_file(file_path):
	    """Read the JSON file at ``file_path`` and return the parsed data.

	    Args:
	        file_path: Path of the JSON file to read.

	    Returns:
	        The object produced by ``json.load`` for the file's contents.
	    """
	    # Decode explicitly as UTF-8 so non-ASCII content does not depend on the
	    # platform's locale encoding.
	    with open(file_path, 'r', encoding='utf-8') as file:
	        return json.load(file)