Spaces:
Running
Running
import re | |
from youtube_transcript_api import YouTubeTranscriptApi | |
from youtube_transcript_api._errors import TranscriptsDisabled | |
MAX_SIZE = 50_000 | |
YT_REGEX = r'^((http)s?:\/\/)?((www\.)|(m\.))?youtube.com\/watch\?([^\?]*&)?v=.+$' # noqa | |
YT_REGEX_SHORT = r'^((http)s?:\/\/)?youtu.be\/([^\?=]+)(\?[^?]+)?$' | |
def _extract_video_id(url: str) -> str: | |
if not re.match(YT_REGEX, url): | |
if not re.match(YT_REGEX_SHORT, url): | |
return '' | |
ind = url.find('?') | |
ind = len(url) if ind == -1 else ind | |
return url[url.find('e/')+2:ind] | |
res = url.split('v=') | |
ind = res[1].find('&') | |
ind = len(res[1]) if ind == -1 else ind | |
return res[1][:ind] | |
def get_youtube_caption(url: str) -> str: | |
try: | |
video_id = _extract_video_id(url) | |
if not video_id: | |
return '' | |
res, size = [], 0 | |
for transcript in YouTubeTranscriptApi.get_transcript(video_id): | |
res.append(transcript['text'].replace('\n', ' ')) | |
size += len(transcript['text']) | |
if size >= MAX_SIZE: | |
return ' '.join(res)[:MAX_SIZE] | |
return ' '.join(res)[:MAX_SIZE] | |
except TranscriptsDisabled: | |
return 'no-cap' | |
except Exception: | |
return 'err' | |