File size: 5,129 Bytes
a45bd3f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 |
import argparse
from ._api import YouTubeTranscriptApi
from .formatters import FormatterLoader
class YouTubeTranscriptCli(object):
    """Command line front end that fetches (or lists) transcripts for video ids.

    The instance is built from a list of raw argument strings; :meth:`run`
    parses them, performs the requested work and returns the text to output.
    """

    def __init__(self, args):
        """Store the raw command line arguments.

        :param args: sequence of argument strings, later handed to
            ``argparse.ArgumentParser.parse_args``.
        """
        self._args = args

    def run(self):
        """Execute the command described by the stored arguments.

        Every video id is processed independently: an exception raised for one
        id is collected and reported in the output instead of aborting the
        remaining ids.

        :return: the output text; an empty string when both
            ``--exclude-generated`` and ``--exclude-manually-created`` are set.
        """
        parsed_args = self._parse_args()

        # Excluding both kinds of transcripts leaves nothing to retrieve.
        if parsed_args.exclude_manually_created and parsed_args.exclude_generated:
            return ''

        proxies = None
        if parsed_args.http_proxy != '' or parsed_args.https_proxy != '':
            proxies = {"http": parsed_args.http_proxy, "https": parsed_args.https_proxy}

        cookies = parsed_args.cookies

        transcripts = []
        exceptions = []

        for video_id in parsed_args.video_ids:
            try:
                transcripts.append(self._fetch_transcript(parsed_args, proxies, cookies, video_id))
            except Exception as exception:
                # Top-level CLI boundary: report the failure as text and carry
                # on with the next video id.
                exceptions.append(exception)

        return self._build_output(parsed_args, transcripts, exceptions)

    def _build_output(self, parsed_args, transcripts, exceptions):
        """Assemble the final output from collected errors and results.

        :param parsed_args: parsed arguments; ``list_transcripts`` is always
            read, ``format`` only when transcripts have to be formatted.
        :param transcripts: the values returned by :meth:`_fetch_transcript`.
        :param exceptions: exceptions collected while fetching.
        :return: all sections joined by a blank line, error messages first.
        """
        sections = [str(exception) for exception in exceptions]

        if transcripts:
            if parsed_args.list_transcripts:
                # In listing mode the entries are already rendered strings
                # (see _fetch_transcript), not fetched transcript data, so they
                # must not be run through a transcript formatter.
                sections.extend(transcripts)
            else:
                sections.append(
                    FormatterLoader().load(parsed_args.format).format_transcripts(transcripts)
                )

        return '\n\n'.join(sections)

    def _fetch_transcript(self, parsed_args, proxies, cookies, video_id):
        """Fetch the transcript, or the transcript listing, of a single video.

        :param parsed_args: parsed arguments selecting languages, exclusions,
            translation and listing mode.
        :param proxies: proxy mapping passed on to the API, or ``None``.
        :param cookies: value of ``--cookies`` passed on to the API, or ``None``.
        :param video_id: id of the video to process.
        :return: ``str(transcript_list)`` when ``--list-transcripts`` is set,
            otherwise the result of ``transcript.fetch()``.
        """
        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id, proxies=proxies, cookies=cookies)

        if parsed_args.list_transcripts:
            return str(transcript_list)

        if parsed_args.exclude_manually_created:
            transcript = transcript_list.find_generated_transcript(parsed_args.languages)
        elif parsed_args.exclude_generated:
            transcript = transcript_list.find_manually_created_transcript(parsed_args.languages)
        else:
            transcript = transcript_list.find_transcript(parsed_args.languages)

        if parsed_args.translate:
            transcript = transcript.translate(parsed_args.translate)

        return transcript.fetch()

    def _parse_args(self):
        """Build the argument parser and parse the stored arguments.

        :return: the parsed namespace, with backslashes removed from the
            video ids (see :meth:`_sanitize_video_ids`).
        """
        parser = argparse.ArgumentParser(
            description=(
                'This is an python API which allows you to get the transcripts/subtitles for a given YouTube video. '
                'It also works for automatically generated subtitles and it does not require a headless browser, like '
                'other selenium based solutions do!'
            )
        )
        parser.add_argument(
            '--list-transcripts',
            action='store_true',
            help='This will list the languages in which the given videos are available in.',
        )
        parser.add_argument('video_ids', nargs='+', type=str, help='List of YouTube video IDs.')
        parser.add_argument(
            '--languages',
            nargs='*',
            default=['en'],
            type=str,
            help=(
                'A list of language codes in a descending priority. For example, if this is set to "de en" it will '
                'first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails '
                'to do so. As I can\'t provide a complete list of all working language codes with full certainty, you '
                'may have to play around with the language codes a bit, to find the one which is working for you!'
            ),
        )
        parser.add_argument(
            '--exclude-generated',
            action='store_true',
            help='If this flag is set transcripts which have been generated by YouTube will not be retrieved.',
        )
        parser.add_argument(
            '--exclude-manually-created',
            action='store_true',
            help='If this flag is set transcripts which have been manually created will not be retrieved.',
        )
        parser.add_argument(
            '--format',
            type=str,
            default='pretty',
            choices=tuple(FormatterLoader.TYPES.keys()),
        )
        parser.add_argument(
            '--translate',
            default='',
            help=(
                'The language code for the language you want this transcript to be translated to. Use the '
                '--list-transcripts feature to find out which languages are translatable and which translation '
                'languages are available.'
            ),
        )
        parser.add_argument(
            '--http-proxy',
            default='',
            metavar='URL',
            help='Use the specified HTTP proxy.',
        )
        parser.add_argument(
            '--https-proxy',
            default='',
            metavar='URL',
            help='Use the specified HTTPS proxy.',
        )
        parser.add_argument(
            '--cookies',
            default=None,
            help='The cookie file that will be used for authorization with youtube.',
        )

        return self._sanitize_video_ids(parser.parse_args(self._args))

    def _sanitize_video_ids(self, args):
        """Remove every backslash from the given video ids.

        NOTE(review): presumably this undoes shell escaping of ids that start
        with a dash (e.g. ``\\-abc``) -- confirm against the user documentation.

        :param args: object with a ``video_ids`` list; it is modified in place.
        :return: the same ``args`` object.
        """
        args.video_ids = [video_id.replace('\\', '') for video_id in args.video_ids]
        return args
|