Joshua Lochner committed on Apr 17, 2022

Commit

3fd1e1e

1 Parent(s): 17feb06

Add requirements.txt and update model

Browse files

Files changed (20) hide show

added_tokens.json +1 -1
config.json +1 -1
pipeline.py +16 -12
pytorch_model.bin +1 -1
requirements.txt +1 -0
rng_state.pth +2 -2
scheduler.pt +1 -1
tokenizer.json +16 -2
tokenizer_config.json +1 -1
trainer_state.json +84 -513
training_args.bin +1 -1
youtube_transcript_api2/__init__.py +0 -16
youtube_transcript_api2/__main__.py +0 -15
youtube_transcript_api2/_api.py +0 -140
youtube_transcript_api2/_cli.py +0 -135
youtube_transcript_api2/_errors.py +0 -112
youtube_transcript_api2/_html_unescaping.py +0 -21
youtube_transcript_api2/_settings.py +0 -1
youtube_transcript_api2/_transcripts.py +0 -332
youtube_transcript_api2/formatters.py +0 -165

added_tokens.json CHANGED Viewed

@@ -1 +1 @@

- {"~~URL_TOKEN~~": ~~30523~~, "~~SHORT_HYPHENATED_TOKEN~~": ~~30527~~, "~~START_SPONSOR_TOKEN~~": ~~30534~~, "~~END_INTERACTION_TOKEN~~": ~~30539~~, "~~END_SELFPROMO_TOKEN~~": ~~30537~~, "~~[Laughter]~~": ~~30531~~, "~~BETWEEN_SEGMENTS_TOKEN~~": ~~30540~~, "~~NUMBER_PERCENTAGE_TOKEN~~": ~~30525~~, "~~NUMBER_TOKEN~~": ~~30526~~, "~~PROFANITY_TOKEN~~": ~~30532~~, "~~NO_SEGMENT_TOKEN~~": ~~30533~~, "~~HYPHENATED_URL_TOKEN~~": ~~30524~~, "~~START_SELFPROMO_TOKEN~~": ~~30536~~, "~~[Music]~~": ~~30529~~, "~~[Applause]~~": ~~30530~~, "~~START_INTERACTION_TOKEN~~": ~~30538~~, "~~LONG_WORD_TOKEN~~": ~~30528~~, "END_SPONSOR_TOKEN": 30535, "~~EXTRACT_SEGMENTS:~~ ": ~~30522~~}

+ {"END_SELFPROMO_TOKEN": 30537, "START_SPONSOR_TOKEN": 30534, "START_INTERACTION_TOKEN": 30538, "[Applause]": 30530, "NUMBER_PERCENTAGE_TOKEN": 30525, "END_INTERACTION_TOKEN": 30539, "EXTRACT_SEGMENTS: ": 30522, "START_SELFPROMO_TOKEN": 30536, "LONG_WORD_TOKEN": 30528, "HYPHENATED_URL_TOKEN": 30524, "[Laughter]": 30531, "[Music]": 30529, "PROFANITY_TOKEN": 30532, "SHORT_HYPHENATED_TOKEN": 30527, "NUMBER_TOKEN": 30526, "URL_TOKEN": 30523, "BETWEEN_SEGMENTS_TOKEN": 30540, "END_SPONSOR_TOKEN": 30535, "NO_SEGMENT_TOKEN": 30533}

config.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
-  "_name_or_path": "bert-based-uncased",
   "architectures": [
     "BertForSequenceClassification"
   ],

 {
+  "_name_or_path": "models/sponsorblock-classifier-v2/",
   "architectures": [
     "BertForSequenceClassification"
   ],

pipeline.py CHANGED Viewed

@@ -1,6 +1,11 @@
 import json
 from functools import lru_cache
-import youtube_transcript_api2
 import json
 import re
 import requests
@@ -165,7 +170,7 @@ def parse_transcript_json(json_data, granularity):
 def list_transcripts(video_id):
     try:
-        return youtube_transcript_api2.YouTubeTranscriptApi.list_transcripts(video_id)
     except json.decoder.JSONDecodeError:
         return None
@@ -198,14 +203,13 @@ def get_words(video_id, transcript_type='auto', fallback='manual', filter_words_
                 f'{ts._url}&fmt=json3').content
             if raw_transcript:
                 raw_transcript_json = json.loads(raw_transcript)
-    except (youtube_transcript_api2.TooManyRequests, youtube_transcript_api2.YouTubeRequestFailed):
         raise  # Cannot recover from these errors and do not mark as empty transcript
     except requests.exceptions.RequestException:  # Can recover
         return get_words(video_id, transcript_type, fallback, granularity)
-    except youtube_transcript_api2.CouldNotRetrieveTranscript:  # Retrying won't solve
         pass  # Mark as empty transcript
     except json.decoder.JSONDecodeError:
@@ -285,10 +289,11 @@ class PreTrainedPipeline():
         self.tokenizer2 = AutoTokenizer.from_pretrained(path)
         self.pipeline2 = SponsorBlockClassificationPipeline(
             model=self.model2, tokenizer=self.tokenizer2)
-    def __call__(self, inputs: str)-> List[Dict[str, Any]]:
-        if ' ' not in inputs and inputs.count(',') >= 2: # Automated call (compressed string)
             split_info = inputs.split(',', 1)
             times = np.reshape(np.array(split_info[1].split(',')), (-1, 2))
             data = []
@@ -304,15 +309,14 @@ class PreTrainedPipeline():
         return self.pipeline2(data)
 class SponsorBlockClassificationPipeline(TextClassificationPipeline):
     def __init__(self, model, tokenizer):
         super().__init__(model=model, tokenizer=tokenizer, return_all_scores=True)
     def preprocess(self, data, **tokenizer_kwargs):
-        if isinstance(data, str): # If string, assume this is what user wants to classify
-            text = data
-        else: # Otherwise, get data from transcript
             words = get_words(data['video_id'])
             segment_words = extract_segment(words, data['start'], data['end'])
             text = ' '.join(x['text'] for x in segment_words)

 import json
 from functools import lru_cache
+from youtube_transcript_api import (
+    YouTubeTranscriptApi,
+    TooManyRequests,
+    YouTubeRequestFailed,
+    CouldNotRetrieveTranscript
+)
 import json
 import re
 import requests
 def list_transcripts(video_id):
     try:
+        return YouTubeTranscriptApi.list_transcripts(video_id)
     except json.decoder.JSONDecodeError:
         return None
                 f'{ts._url}&fmt=json3').content
             if raw_transcript:
                 raw_transcript_json = json.loads(raw_transcript)
+    except (TooManyRequests, YouTubeRequestFailed):
         raise  # Cannot recover from these errors and do not mark as empty transcript
     except requests.exceptions.RequestException:  # Can recover
         return get_words(video_id, transcript_type, fallback, granularity)
+    except CouldNotRetrieveTranscript:  # Retrying won't solve
         pass  # Mark as empty transcript
     except json.decoder.JSONDecodeError:
         self.tokenizer2 = AutoTokenizer.from_pretrained(path)
         self.pipeline2 = SponsorBlockClassificationPipeline(
             model=self.model2, tokenizer=self.tokenizer2)
+    def __call__(self, inputs: str) -> List[Dict[str, Any]]:
+        # Automated call (compressed string)
+        if ' ' not in inputs and inputs.count(',') >= 2:
             split_info = inputs.split(',', 1)
             times = np.reshape(np.array(split_info[1].split(',')), (-1, 2))
             data = []
         return self.pipeline2(data)
 class SponsorBlockClassificationPipeline(TextClassificationPipeline):
     def __init__(self, model, tokenizer):
         super().__init__(model=model, tokenizer=tokenizer, return_all_scores=True)
     def preprocess(self, data, **tokenizer_kwargs):
+        if isinstance(data, str):  # If string, assume this is what user wants to classify
+            text = data
+        else:  # Otherwise, get data from transcript
             words = get_words(data['video_id'])
             segment_words = extract_segment(words, data['start'], data['end'])
             text = ' '.join(x['text'] for x in segment_words)

pytorch_model.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:59a2338b4e0278a49bf75ae1a771de425811ffdbba69108a4b92526497cd0dae
 size 438084653

 version https://git-lfs.github.com/spec/v1
+oid sha256:012162a219c071b5208c4179f56b8e6263bad1b532f7fc4fc13f0311de5f1729
 size 438084653

requirements.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ youtube_transcript_api

rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:156876b214e338523f93bae5052b71d5b3872e79cd8fe551742463c8af0b6821
-size 14439

 version https://git-lfs.github.com/spec/v1
+oid sha256:a203bf3393a6882ed6527945f185bfa78a49e0b8709ba172146c735dcfc1aa6c
+size 14567

scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:54284b17a87ffcefec12bdfce42ddab7e6d490e5c99efdd0a5cf1220abf9fd71
 size 623

 version https://git-lfs.github.com/spec/v1
+oid sha256:28e1eb9d44a2bee663e6731a6c6d574d4d710f519e640406cfff15e0f37ab9f9
 size 623

tokenizer.json CHANGED Viewed

@@ -1,7 +1,21 @@
 {
   "version": "1.0",
-  "truncation": null,
-  "padding": null,
   "added_tokens": [
     {
       "id": 0,

 {
   "version": "1.0",
+  "truncation": {
+    "direction": "Right",
+    "max_length": 512,
+    "strategy": "LongestFirst",
+    "stride": 0
+  },
+  "padding": {
+    "strategy": {
+      "Fixed": 512
+    },
+    "direction": "Right",
+    "pad_to_multiple_of": null,
+    "pad_id": 0,
+    "pad_type_id": 0,
+    "pad_token": "[PAD]"
+  },
   "added_tokens": [
     {
       "id": 0,

tokenizer_config.json CHANGED Viewed

	@@ -1 +1 @@
1	- {"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "models/classifier-~~85000~~", "tokenizer_class": "BertTokenizer"}


1	+ {"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "models/sponsorblock-classifier-v2/", "tokenizer_class": "BertTokenizer"}

trainer_state.json CHANGED Viewed

@@ -1,658 +1,229 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 3.2687460617517328,
-  "global_step": 415000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
       "epoch": 0.04,
-      "learning_rate": 1.921235034656585e-05,
-      "loss": 0.3334,
       "step": 5000
     },
     {
       "epoch": 0.08,
-      "learning_rate": 1.8424700693131696e-05,
-      "loss": 0.3387,
       "step": 10000
     },
     {
       "epoch": 0.12,
-      "learning_rate": 1.7637051039697544e-05,
-      "loss": 0.3327,
       "step": 15000
     },
     {
       "epoch": 0.16,
-      "learning_rate": 1.684940138626339e-05,
-      "loss": 0.3492,
       "step": 20000
     },
     {
       "epoch": 0.2,
-      "learning_rate": 1.606175173282924e-05,
-      "loss": 0.3349,
       "step": 25000
     },
     {
       "epoch": 0.2,
-      "eval_accuracy": 0.9155995845794678,
-      "eval_loss": 0.389266699552536,
-      "eval_runtime": 551.1932,
-      "eval_samples_per_second": 51.289,
-      "eval_steps_per_second": 12.823,
       "step": 25000
     },
     {
       "epoch": 0.24,
-      "learning_rate": 1.5274102079395087e-05,
-      "loss": 0.3279,
       "step": 30000
     },
     {
       "epoch": 0.28,
-      "learning_rate": 1.4486452425960932e-05,
-      "loss": 0.3301,
       "step": 35000
     },
     {
       "epoch": 0.32,
-      "learning_rate": 1.369880277252678e-05,
-      "loss": 0.3243,
       "step": 40000
     },
     {
       "epoch": 0.35,
-      "learning_rate": 1.2911153119092628e-05,
-      "loss": 0.293,
       "step": 45000
     },
     {
       "epoch": 0.39,
-      "learning_rate": 1.2123503465658477e-05,
-      "loss": 0.3053,
       "step": 50000
     },
     {
       "epoch": 0.39,
-      "eval_accuracy": 0.9235231876373291,
-      "eval_loss": 0.3810465931892395,
-      "eval_runtime": 542.4272,
-      "eval_samples_per_second": 52.118,
-      "eval_steps_per_second": 13.03,
       "step": 50000
     },
     {
       "epoch": 0.43,
-      "learning_rate": 1.1335853812224324e-05,
-      "loss": 0.3126,
       "step": 55000
     },
     {
       "epoch": 0.47,
-      "learning_rate": 1.0548204158790173e-05,
-      "loss": 0.3072,
       "step": 60000
     },
     {
       "epoch": 0.51,
-      "learning_rate": 9.760554505356018e-06,
-      "loss": 0.2957,
       "step": 65000
     },
     {
       "epoch": 0.55,
-      "learning_rate": 8.972904851921865e-06,
-      "loss": 0.2968,
       "step": 70000
     },
     {
       "epoch": 0.59,
-      "learning_rate": 8.185255198487714e-06,
-      "loss": 0.2882,
       "step": 75000
     },
     {
       "epoch": 0.59,
-      "eval_accuracy": 0.9224973320960999,
-      "eval_loss": 0.37537074089050293,
-      "eval_runtime": 521.9317,
-      "eval_samples_per_second": 54.164,
-      "eval_steps_per_second": 13.542,
       "step": 75000
     },
     {
       "epoch": 0.63,
-      "learning_rate": 7.3976055450535615e-06,
-      "loss": 0.2754,
       "step": 80000
     },
     {
       "epoch": 0.67,
-      "learning_rate": 6.6099558916194085e-06,
-      "loss": 0.2607,
       "step": 85000
     },
     {
       "epoch": 0.71,
-      "learning_rate": 5.8223062381852555e-06,
-      "loss": 0.2818,
       "step": 90000
     },
     {
       "epoch": 0.75,
-      "learning_rate": 5.034656584751103e-06,
-      "loss": 0.2736,
       "step": 95000
     },
     {
       "epoch": 0.79,
-      "learning_rate": 4.24700693131695e-06,
-      "loss": 0.2644,
       "step": 100000
     },
     {
       "epoch": 0.79,
-      "eval_accuracy": 0.9297842383384705,
-      "eval_loss": 0.3645715117454529,
-      "eval_runtime": 521.9055,
-      "eval_samples_per_second": 54.167,
-      "eval_steps_per_second": 13.543,
       "step": 100000
     },
     {
       "epoch": 0.83,
-      "learning_rate": 3.459357277882798e-06,
-      "loss": 0.2552,
       "step": 105000
     },
     {
       "epoch": 0.87,
-      "learning_rate": 2.6717076244486457e-06,
-      "loss": 0.266,
       "step": 110000
     },
     {
       "epoch": 0.91,
-      "learning_rate": 1.884057971014493e-06,
-      "loss": 0.2684,
       "step": 115000
     },
     {
       "epoch": 0.95,
-      "learning_rate": 1.0964083175803404e-06,
-      "loss": 0.2501,
       "step": 120000
     },
     {
       "epoch": 0.98,
-      "learning_rate": 3.087586641461878e-07,
-      "loss": 0.273,
       "step": 125000
     },
     {
       "epoch": 0.98,
-      "eval_accuracy": 0.9299964904785156,
-      "eval_loss": 0.3369257152080536,
-      "eval_runtime": 522.7551,
-      "eval_samples_per_second": 54.079,
-      "eval_steps_per_second": 13.521,
       "step": 125000
     },
     {
       "epoch": 1.02,
-      "learning_rate": 1.7952110901071204e-05,
-      "loss": 0.2834,
       "step": 130000
     },
     {
       "epoch": 1.06,
-      "learning_rate": 1.787334593572779e-05,
-      "loss": 0.3047,
       "step": 135000
     },
     {
       "epoch": 1.1,
-      "learning_rate": 1.7794580970384373e-05,
-      "loss": 0.2963,
       "step": 140000
-    },
-    {
-      "epoch": 1.14,
-      "learning_rate": 1.771581600504096e-05,
-      "loss": 0.3031,
-      "step": 145000
-    },
-    {
-      "epoch": 1.18,
-      "learning_rate": 1.7637051039697544e-05,
-      "loss": 0.3033,
-      "step": 150000
-    },
-    {
-      "epoch": 1.18,
-      "eval_accuracy": 0.9257162809371948,
-      "eval_loss": 0.4006378650665283,
-      "eval_runtime": 519.4649,
-      "eval_samples_per_second": 54.421,
-      "eval_steps_per_second": 13.606,
-      "step": 150000
-    },
-    {
-      "epoch": 1.22,
-      "learning_rate": 1.755828607435413e-05,
-      "loss": 0.3024,
-      "step": 155000
-    },
-    {
-      "epoch": 1.26,
-      "learning_rate": 1.7479521109010713e-05,
-      "loss": 0.3135,
-      "step": 160000
-    },
-    {
-      "epoch": 1.3,
-      "learning_rate": 1.74007561436673e-05,
-      "loss": 0.3137,
-      "step": 165000
-    },
-    {
-      "epoch": 1.34,
-      "learning_rate": 1.732199117832388e-05,
-      "loss": 0.3227,
-      "step": 170000
-    },
-    {
-      "epoch": 1.38,
-      "learning_rate": 1.7243226212980467e-05,
-      "loss": 0.3246,
-      "step": 175000
-    },
-    {
-      "epoch": 1.38,
-      "eval_accuracy": 0.924018383026123,
-      "eval_loss": 0.3924681842327118,
-      "eval_runtime": 518.8244,
-      "eval_samples_per_second": 54.489,
-      "eval_steps_per_second": 13.623,
-      "step": 175000
-    },
-    {
-      "epoch": 1.42,
-      "learning_rate": 1.7164461247637053e-05,
-      "loss": 0.3281,
-      "step": 180000
-    },
-    {
-      "epoch": 1.46,
-      "learning_rate": 1.708569628229364e-05,
-      "loss": 0.3256,
-      "step": 185000
-    },
-    {
-      "epoch": 1.5,
-      "learning_rate": 1.700693131695022e-05,
-      "loss": 0.313,
-      "step": 190000
-    },
-    {
-      "epoch": 1.54,
-      "learning_rate": 1.6928166351606807e-05,
-      "loss": 0.3313,
-      "step": 195000
-    },
-    {
-      "epoch": 1.58,
-      "learning_rate": 1.684940138626339e-05,
-      "loss": 0.2953,
-      "step": 200000
-    },
-    {
-      "epoch": 1.58,
-      "eval_accuracy": 0.9212592840194702,
-      "eval_loss": 0.3895967900753021,
-      "eval_runtime": 526.2623,
-      "eval_samples_per_second": 53.718,
-      "eval_steps_per_second": 13.431,
-      "step": 200000
-    },
-    {
-      "epoch": 1.61,
-      "learning_rate": 1.6770636420919976e-05,
-      "loss": 0.3103,
-      "step": 205000
-    },
-    {
-      "epoch": 1.65,
-      "learning_rate": 1.669187145557656e-05,
-      "loss": 0.3089,
-      "step": 210000
-    },
-    {
-      "epoch": 1.69,
-      "learning_rate": 1.6613106490233147e-05,
-      "loss": 0.3095,
-      "step": 215000
-    },
-    {
-      "epoch": 1.73,
-      "learning_rate": 1.653434152488973e-05,
-      "loss": 0.3288,
-      "step": 220000
-    },
-    {
-      "epoch": 1.77,
-      "learning_rate": 1.6455576559546316e-05,
-      "loss": 0.3199,
-      "step": 225000
-    },
-    {
-      "epoch": 1.77,
-      "eval_accuracy": 0.9203749299049377,
-      "eval_loss": 0.3942428529262543,
-      "eval_runtime": 520.6801,
-      "eval_samples_per_second": 54.294,
-      "eval_steps_per_second": 13.575,
-      "step": 225000
-    },
-    {
-      "epoch": 1.81,
-      "learning_rate": 1.6376811594202898e-05,
-      "loss": 0.306,
-      "step": 230000
-    },
-    {
-      "epoch": 1.85,
-      "learning_rate": 1.6298046628859484e-05,
-      "loss": 0.3104,
-      "step": 235000
-    },
-    {
-      "epoch": 1.89,
-      "learning_rate": 1.621928166351607e-05,
-      "loss": 0.3139,
-      "step": 240000
-    },
-    {
-      "epoch": 1.93,
-      "learning_rate": 1.6140516698172656e-05,
-      "loss": 0.3179,
-      "step": 245000
-    },
-    {
-      "epoch": 1.97,
-      "learning_rate": 1.606175173282924e-05,
-      "loss": 0.3226,
-      "step": 250000
-    },
-    {
-      "epoch": 1.97,
-      "eval_accuracy": 0.9243367314338684,
-      "eval_loss": 0.4058537185192108,
-      "eval_runtime": 552.8946,
-      "eval_samples_per_second": 51.131,
-      "eval_steps_per_second": 12.784,
-      "step": 250000
-    },
-    {
-      "epoch": 2.01,
-      "learning_rate": 1.5982986767485824e-05,
-      "loss": 0.3167,
-      "step": 255000
-    },
-    {
-      "epoch": 2.05,
-      "learning_rate": 1.5904221802142407e-05,
-      "loss": 0.3034,
-      "step": 260000
-    },
-    {
-      "epoch": 2.09,
-      "learning_rate": 1.5825456836798993e-05,
-      "loss": 0.2976,
-      "step": 265000
-    },
-    {
-      "epoch": 2.13,
-      "learning_rate": 1.574669187145558e-05,
-      "loss": 0.3039,
-      "step": 270000
-    },
-    {
-      "epoch": 2.17,
-      "learning_rate": 1.5667926906112164e-05,
-      "loss": 0.2889,
-      "step": 275000
-    },
-    {
-      "epoch": 2.17,
-      "eval_accuracy": 0.9221436381340027,
-      "eval_loss": 0.39818692207336426,
-      "eval_runtime": 508.9512,
-      "eval_samples_per_second": 55.546,
-      "eval_steps_per_second": 13.887,
-      "step": 275000
-    },
-    {
-      "epoch": 2.21,
-      "learning_rate": 1.5589161940768747e-05,
-      "loss": 0.3079,
-      "step": 280000
-    },
-    {
-      "epoch": 2.24,
-      "learning_rate": 1.5510396975425333e-05,
-      "loss": 0.3148,
-      "step": 285000
-    },
-    {
-      "epoch": 2.28,
-      "learning_rate": 1.5431632010081915e-05,
-      "loss": 0.2829,
-      "step": 290000
-    },
-    {
-      "epoch": 2.32,
-      "learning_rate": 1.53528670447385e-05,
-      "loss": 0.2978,
-      "step": 295000
-    },
-    {
-      "epoch": 2.36,
-      "learning_rate": 1.5274102079395087e-05,
-      "loss": 0.2963,
-      "step": 300000
-    },
-    {
-      "epoch": 2.36,
-      "eval_accuracy": 0.922957181930542,
-      "eval_loss": 0.43214836716651917,
-      "eval_runtime": 528.9348,
-      "eval_samples_per_second": 53.447,
-      "eval_steps_per_second": 13.363,
-      "step": 300000
-    },
-    {
-      "epoch": 2.4,
-      "learning_rate": 1.5195337114051671e-05,
-      "loss": 0.3007,
-      "step": 305000
-    },
-    {
-      "epoch": 2.44,
-      "learning_rate": 1.5116572148708256e-05,
-      "loss": 0.2901,
-      "step": 310000
-    },
-    {
-      "epoch": 2.48,
-      "learning_rate": 1.5037807183364841e-05,
-      "loss": 0.2905,
-      "step": 315000
-    },
-    {
-      "epoch": 2.52,
-      "learning_rate": 1.4959042218021424e-05,
-      "loss": 0.3082,
-      "step": 320000
-    },
-    {
-      "epoch": 2.56,
-      "learning_rate": 1.488027725267801e-05,
-      "loss": 0.2899,
-      "step": 325000
-    },
-    {
-      "epoch": 2.56,
-      "eval_accuracy": 0.924124538898468,
-      "eval_loss": 0.40452826023101807,
-      "eval_runtime": 520.2943,
-      "eval_samples_per_second": 54.335,
-      "eval_steps_per_second": 13.585,
-      "step": 325000
-    },
-    {
-      "epoch": 2.6,
-      "learning_rate": 1.4801512287334594e-05,
-      "loss": 0.301,
-      "step": 330000
-    },
-    {
-      "epoch": 2.64,
-      "learning_rate": 1.472274732199118e-05,
-      "loss": 0.3014,
-      "step": 335000
-    },
-    {
-      "epoch": 2.68,
-      "learning_rate": 1.4643982356647764e-05,
-      "loss": 0.2974,
-      "step": 340000
-    },
-    {
-      "epoch": 2.72,
-      "learning_rate": 1.456521739130435e-05,
-      "loss": 0.3215,
-      "step": 345000
-    },
-    {
-      "epoch": 2.76,
-      "learning_rate": 1.4486452425960932e-05,
-      "loss": 0.2986,
-      "step": 350000
-    },
-    {
-      "epoch": 2.76,
-      "eval_accuracy": 0.9239122867584229,
-      "eval_loss": 0.4114295542240143,
-      "eval_runtime": 565.9722,
-      "eval_samples_per_second": 49.949,
-      "eval_steps_per_second": 12.488,
-      "step": 350000
-    },
-    {
-      "epoch": 2.8,
-      "learning_rate": 1.4407687460617518e-05,
-      "loss": 0.2881,
-      "step": 355000
-    },
-    {
-      "epoch": 2.84,
-      "learning_rate": 1.4328922495274103e-05,
-      "loss": 0.2937,
-      "step": 360000
-    },
-    {
-      "epoch": 2.87,
-      "learning_rate": 1.4250157529930688e-05,
-      "loss": 0.2987,
-      "step": 365000
-    },
-    {
-      "epoch": 2.91,
-      "learning_rate": 1.4171392564587273e-05,
-      "loss": 0.3355,
-      "step": 370000
-    },
-    {
-      "epoch": 2.95,
-      "learning_rate": 1.4092627599243858e-05,
-      "loss": 0.3086,
-      "step": 375000
-    },
-    {
-      "epoch": 2.95,
-      "eval_accuracy": 0.9162362813949585,
-      "eval_loss": 0.4699816107749939,
-      "eval_runtime": 523.819,
-      "eval_samples_per_second": 53.969,
-      "eval_steps_per_second": 13.493,
-      "step": 375000
-    },
-    {
-      "epoch": 2.99,
-      "learning_rate": 1.4013862633900441e-05,
-      "loss": 0.3187,
-      "step": 380000
-    },
-    {
-      "epoch": 3.03,
-      "learning_rate": 1.3935097668557027e-05,
-      "loss": 0.304,
-      "step": 385000
-    },
-    {
-      "epoch": 3.07,
-      "learning_rate": 1.3856332703213611e-05,
-      "loss": 0.2856,
-      "step": 390000
-    },
-    {
-      "epoch": 3.11,
-      "learning_rate": 1.3777567737870197e-05,
-      "loss": 0.2877,
-      "step": 395000
-    },
-    {
-      "epoch": 3.15,
-      "learning_rate": 1.369880277252678e-05,
-      "loss": 0.2896,
-      "step": 400000
-    },
-    {
-      "epoch": 3.15,
-      "eval_accuracy": 0.9198089838027954,
-      "eval_loss": 0.43677982687950134,
-      "eval_runtime": 526.7389,
-      "eval_samples_per_second": 53.67,
-      "eval_steps_per_second": 13.418,
-      "step": 400000
-    },
-    {
-      "epoch": 3.19,
-      "learning_rate": 1.3620037807183365e-05,
-      "loss": 0.2946,
-      "step": 405000
-    },
-    {
-      "epoch": 3.23,
-      "learning_rate": 1.354127284183995e-05,
-      "loss": 0.2899,
-      "step": 410000
-    },
-    {
-      "epoch": 3.27,
-      "learning_rate": 1.3462507876496535e-05,
-      "loss": 0.2907,
-      "step": 415000
     }
   ],
   "max_steps": 1269600,
   "num_train_epochs": 10,
-  "total_flos": 4.367706162646794e+17,
   "trial_name": null,
   "trial_params": null
 }

 {
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 1.1027095148078134,
+  "global_step": 140000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
       "epoch": 0.04,
+      "learning_rate": 1.9921235034656587e-06,
+      "loss": 0.2621,
       "step": 5000
     },
     {
       "epoch": 0.08,
+      "learning_rate": 1.984247006931317e-06,
+      "loss": 0.2548,
       "step": 10000
     },
     {
       "epoch": 0.12,
+      "learning_rate": 1.9763705103969753e-06,
+      "loss": 0.2451,
       "step": 15000
     },
     {
       "epoch": 0.16,
+      "learning_rate": 1.9684940138626337e-06,
+      "loss": 0.2479,
       "step": 20000
     },
     {
       "epoch": 0.2,
+      "learning_rate": 1.9606175173282924e-06,
+      "loss": 0.2477,
       "step": 25000
     },
     {
       "epoch": 0.2,
+      "eval_accuracy": 0.9227449297904968,
+      "eval_loss": 0.42459121346473694,
+      "eval_runtime": 532.8547,
+      "eval_samples_per_second": 53.054,
+      "eval_steps_per_second": 13.264,
       "step": 25000
     },
     {
       "epoch": 0.24,
+      "learning_rate": 1.9527410207939508e-06,
+      "loss": 0.253,
       "step": 30000
     },
     {
       "epoch": 0.28,
+      "learning_rate": 1.9448645242596095e-06,
+      "loss": 0.2466,
       "step": 35000
     },
     {
       "epoch": 0.32,
+      "learning_rate": 1.936988027725268e-06,
+      "loss": 0.25,
       "step": 40000
     },
     {
       "epoch": 0.35,
+      "learning_rate": 1.929111531190926e-06,
+      "loss": 0.2402,
       "step": 45000
     },
     {
       "epoch": 0.39,
+      "learning_rate": 1.9212350346565845e-06,
+      "loss": 0.2515,
       "step": 50000
     },
     {
       "epoch": 0.39,
+      "eval_accuracy": 0.926034688949585,
+      "eval_loss": 0.3926495909690857,
+      "eval_runtime": 506.3742,
+      "eval_samples_per_second": 55.828,
+      "eval_steps_per_second": 13.958,
       "step": 50000
     },
     {
       "epoch": 0.43,
+      "learning_rate": 1.9133585381222433e-06,
+      "loss": 0.2383,
       "step": 55000
     },
     {
       "epoch": 0.47,
+      "learning_rate": 1.9054820415879016e-06,
+      "loss": 0.2523,
       "step": 60000
     },
     {
       "epoch": 0.51,
+      "learning_rate": 1.8976055450535602e-06,
+      "loss": 0.2372,
       "step": 65000
     },
     {
       "epoch": 0.55,
+      "learning_rate": 1.8897290485192185e-06,
+      "loss": 0.2395,
       "step": 70000
     },
     {
       "epoch": 0.59,
+      "learning_rate": 1.881852551984877e-06,
+      "loss": 0.2376,
       "step": 75000
     },
     {
       "epoch": 0.59,
+      "eval_accuracy": 0.9263883829116821,
+      "eval_loss": 0.3989144265651703,
+      "eval_runtime": 505.8843,
+      "eval_samples_per_second": 55.882,
+      "eval_steps_per_second": 13.972,
       "step": 75000
     },
     {
       "epoch": 0.63,
+      "learning_rate": 1.8739760554505356e-06,
+      "loss": 0.2331,
       "step": 80000
     },
     {
       "epoch": 0.67,
+      "learning_rate": 1.8660995589161941e-06,
+      "loss": 0.2426,
       "step": 85000
     },
     {
       "epoch": 0.71,
+      "learning_rate": 1.8582230623818525e-06,
+      "loss": 0.2493,
       "step": 90000
     },
     {
       "epoch": 0.75,
+      "learning_rate": 1.850346565847511e-06,
+      "loss": 0.2379,
       "step": 95000
     },
     {
       "epoch": 0.79,
+      "learning_rate": 1.8424700693131694e-06,
+      "loss": 0.2428,
       "step": 100000
     },
     {
       "epoch": 0.79,
+      "eval_accuracy": 0.9267421364784241,
+      "eval_loss": 0.3985295295715332,
+      "eval_runtime": 549.265,
+      "eval_samples_per_second": 51.469,
+      "eval_steps_per_second": 12.868,
       "step": 100000
     },
     {
       "epoch": 0.83,
+      "learning_rate": 1.834593572778828e-06,
+      "loss": 0.2429,
       "step": 105000
     },
     {
       "epoch": 0.87,
+      "learning_rate": 1.8267170762444864e-06,
+      "loss": 0.238,
       "step": 110000
     },
     {
       "epoch": 0.91,
+      "learning_rate": 1.818840579710145e-06,
+      "loss": 0.2322,
       "step": 115000
     },
     {
       "epoch": 0.95,
+      "learning_rate": 1.8109640831758033e-06,
+      "loss": 0.2371,
       "step": 120000
     },
     {
       "epoch": 0.98,
+      "learning_rate": 1.8030875866414619e-06,
+      "loss": 0.2303,
       "step": 125000
     },
     {
       "epoch": 0.98,
+      "eval_accuracy": 0.9282631874084473,
+      "eval_loss": 0.40024659037590027,
+      "eval_runtime": 548.3799,
+      "eval_samples_per_second": 51.552,
+      "eval_steps_per_second": 12.889,
       "step": 125000
     },
     {
       "epoch": 1.02,
+      "learning_rate": 1.7952110901071202e-06,
+      "loss": 0.221,
       "step": 130000
     },
     {
       "epoch": 1.06,
+      "learning_rate": 1.7873345935727788e-06,
+      "loss": 0.2199,
       "step": 135000
     },
     {
       "epoch": 1.1,
+      "learning_rate": 1.779458097038437e-06,
+      "loss": 0.2097,
       "step": 140000
     }
   ],
   "max_steps": 1269600,
   "num_train_epochs": 10,
+  "total_flos": 1.473443106221998e+17,
   "trial_name": null,
   "trial_params": null
 }

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:644215cf0b9aba1d6b87848f34f5414c4363298f97f869acd603d3745f7e08d1
 size 2991

 version https://git-lfs.github.com/spec/v1
+oid sha256:03c6251eac46fa18822e93d9f2cc88f4b61044f93b32dd36da4e4f50a6315f9c
 size 2991

youtube_transcript_api2/__init__.py DELETED Viewed

@@ -1,16 +0,0 @@
-from ._api import YouTubeTranscriptApi
-from ._transcripts import TranscriptList, Transcript
-from ._errors import (
-    TranscriptsDisabled,
-    NoTranscriptFound,
-    CouldNotRetrieveTranscript,
-    VideoUnavailable,
-    TooManyRequests,
-    NotTranslatable,
-    TranslationLanguageNotAvailable,
-    NoTranscriptAvailable,
-    CookiePathInvalid,
-    CookiesInvalid,
-    FailedToCreateConsentCookie,
-    YouTubeRequestFailed,
-)

youtube_transcript_api2/__main__.py DELETED Viewed

@@ -1,15 +0,0 @@
-import sys
-import logging
-from ._cli import YouTubeTranscriptCli
-def main():
-    logging.basicConfig()
-    print(YouTubeTranscriptCli(sys.argv[1:]).run())
-if __name__ == '__main__':
-    main()

youtube_transcript_api2/_api.py DELETED Viewed

@@ -1,140 +0,0 @@
-import requests
-try: # pragma: no cover
-    import http.cookiejar as cookiejar
-    CookieLoadError = (FileNotFoundError, cookiejar.LoadError)
-except ImportError: # pragma: no cover
-    import cookielib as cookiejar
-    CookieLoadError = IOError
-from ._transcripts import TranscriptListFetcher
-from ._errors import (
-    CookiePathInvalid,
-    CookiesInvalid
-)
-class YouTubeTranscriptApi(object):
-    @classmethod
-    def list_transcripts(cls, video_id, proxies=None, cookies=None):
-        """
-        Retrieves the list of transcripts which are available for a given video. It returns a `TranscriptList` object
-        which is iterable and provides methods to filter the list of transcripts for specific languages. While iterating
-        over the `TranscriptList` the individual transcripts are represented by `Transcript` objects, which provide
-        metadata and can either be fetched by calling `transcript.fetch()` or translated by calling
-        `transcript.translate('en')`. Example::
-            # retrieve the available transcripts
-            transcript_list = YouTubeTranscriptApi.list_transcripts('video_id')
-            # iterate over all available transcripts
-            for transcript in transcript_list:
-                # the Transcript object provides metadata properties
-                print(
-                    transcript.video_id,
-                    transcript.language,
-                    transcript.language_code,
-                    # whether it has been manually created or generated by YouTube
-                    transcript.is_generated,
-                    # a list of languages the transcript can be translated to
-                    transcript.translation_languages,
-                )
-                # fetch the actual transcript data
-                print(transcript.fetch())
-                # translating the transcript will return another transcript object
-                print(transcript.translate('en').fetch())
-            # you can also directly filter for the language you are looking for, using the transcript list
-            transcript = transcript_list.find_transcript(['de', 'en'])
-            # or just filter for manually created transcripts
-            transcript = transcript_list.find_manually_created_transcript(['de', 'en'])
-            # or automatically generated ones
-            transcript = transcript_list.find_generated_transcript(['de', 'en'])
-        :param video_id: the youtube video id
-        :type video_id: str
-        :param proxies: a dictionary mapping of http and https proxies to be used for the network requests
-        :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
-        :param cookies: a string of the path to a text file containing youtube authorization cookies
-        :type cookies: str
-        :return: the list of available transcripts
-        :rtype TranscriptList:
-        """
-        with requests.Session() as http_client:
-            if cookies:
-                http_client.cookies = cls._load_cookies(cookies, video_id)
-            http_client.proxies = proxies if proxies else {}
-            return TranscriptListFetcher(http_client).fetch(video_id)
-    @classmethod
-    def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None, cookies=None):
-        """
-        Retrieves the transcripts for a list of videos.
-        :param video_ids: a list of youtube video ids
-        :type video_ids: list[str]
-        :param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en']
-        it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to
-        do so.
-        :type languages: list[str]
-        :param continue_after_error: if this is set the execution won't be stopped, if an error occurs while retrieving
-        one of the video transcripts
-        :type continue_after_error: bool
-        :param proxies: a dictionary mapping of http and https proxies to be used for the network requests
-        :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
-        :param cookies: a string of the path to a text file containing youtube authorization cookies
-        :type cookies: str
-        :return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of
-        video ids, which could not be retrieved
-        :rtype ({str: [{'text': str, 'start': float, 'duration': float}]}, [str]):
-        """
-        data = {}
-        unretrievable_videos = []
-        for video_id in video_ids:
-            try:
-                data[video_id] = cls.get_transcript(video_id, languages, proxies, cookies)
-            except Exception as exception:
-                if not continue_after_error:
-                    raise exception
-                unretrievable_videos.append(video_id)
-        return data, unretrievable_videos
-    @classmethod
-    def get_transcript(cls, video_id, languages=('en',), proxies=None, cookies=None):
-        """
-        Retrieves the transcript for a single video. This is just a shortcut for calling::
-            YouTubeTranscriptApi.list_transcripts(video_id, proxies, cookies).find_transcript(languages).fetch()
-        :param video_id: the youtube video id
-        :type video_id: str
-        :param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en']
-        it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to
-        do so.
-        :type languages: list[str]
-        :param proxies: a dictionary mapping of http and https proxies to be used for the network requests
-        :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
-        :param cookies: a string of the path to a text file containing youtube authorization cookies
-        :type cookies: str
-        :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
-        :rtype [{'text': str, 'start': float, 'duration': float}]:
-        """
-        return cls.list_transcripts(video_id, proxies, cookies).find_transcript(languages).fetch()
-    @classmethod
-    def _load_cookies(cls, cookies, video_id):
-        try:
-            cookie_jar = cookiejar.MozillaCookieJar()
-            cookie_jar.load(cookies)
-            if not cookie_jar:
-                raise CookiesInvalid(video_id)
-            return cookie_jar
-        except CookieLoadError:
-            raise CookiePathInvalid(video_id)

youtube_transcript_api2/_cli.py DELETED Viewed

@@ -1,135 +0,0 @@
-import argparse
-from ._api import YouTubeTranscriptApi
-from .formatters import FormatterLoader
-class YouTubeTranscriptCli(object):
-    def __init__(self, args):
-        self._args = args
-    def run(self):
-        parsed_args = self._parse_args()
-        if parsed_args.exclude_manually_created and parsed_args.exclude_generated:
-            return ''
-        proxies = None
-        if parsed_args.http_proxy != '' or parsed_args.https_proxy != '':
-            proxies = {"http": parsed_args.http_proxy, "https": parsed_args.https_proxy}
-        cookies = parsed_args.cookies
-        transcripts = []
-        exceptions = []
-        for video_id in parsed_args.video_ids:
-            try:
-                transcripts.append(self._fetch_transcript(parsed_args, proxies, cookies, video_id))
-            except Exception as exception:
-                exceptions.append(exception)
-        return '\n\n'.join(
-            [str(exception) for exception in exceptions]
-            + ([FormatterLoader().load(parsed_args.format).format_transcripts(transcripts)] if transcripts else [])
-        )
-    def _fetch_transcript(self, parsed_args, proxies, cookies, video_id):
-        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id, proxies=proxies, cookies=cookies)
-        if parsed_args.list_transcripts:
-            return str(transcript_list)
-        if parsed_args.exclude_manually_created:
-            transcript = transcript_list.find_generated_transcript(parsed_args.languages)
-        elif parsed_args.exclude_generated:
-            transcript = transcript_list.find_manually_created_transcript(parsed_args.languages)
-        else:
-            transcript = transcript_list.find_transcript(parsed_args.languages)
-        if parsed_args.translate:
-            transcript = transcript.translate(parsed_args.translate)
-        return transcript.fetch()
-    def _parse_args(self):
-        parser = argparse.ArgumentParser(
-            description=(
-                'This is an python API which allows you to get the transcripts/subtitles for a given YouTube video. '
-                'It also works for automatically generated subtitles and it does not require a headless browser, like '
-                'other selenium based solutions do!'
-            )
-        )
-        parser.add_argument(
-            '--list-transcripts',
-            action='store_const',
-            const=True,
-            default=False,
-            help='This will list the languages in which the given videos are available in.',
-        )
-        parser.add_argument('video_ids', nargs='+', type=str, help='List of YouTube video IDs.')
-        parser.add_argument(
-            '--languages',
-            nargs='*',
-            default=['en',],
-            type=str,
-            help=(
-                'A list of language codes in a descending priority. For example, if this is set to "de en" it will '
-                'first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails '
-                'to do so. As I can\'t provide a complete list of all working language codes with full certainty, you '
-                'may have to play around with the language codes a bit, to find the one which is working for you!'
-            ),
-        )
-        parser.add_argument(
-            '--exclude-generated',
-            action='store_const',
-            const=True,
-            default=False,
-            help='If this flag is set transcripts which have been generated by YouTube will not be retrieved.',
-        )
-        parser.add_argument(
-            '--exclude-manually-created',
-            action='store_const',
-            const=True,
-            default=False,
-            help='If this flag is set transcripts which have been manually created will not be retrieved.',
-        )
-        parser.add_argument(
-            '--format',
-            type=str,
-            default='pretty',
-            choices=tuple(FormatterLoader.TYPES.keys()),
-        )
-        parser.add_argument(
-            '--translate',
-            default='',
-            help=(
-                'The language code for the language you want this transcript to be translated to. Use the '
-                '--list-transcripts feature to find out which languages are translatable and which translation '
-                'languages are available.'
-            )
-        )
-        parser.add_argument(
-            '--http-proxy',
-            default='',
-            metavar='URL',
-            help='Use the specified HTTP proxy.'
-        )
-        parser.add_argument(
-            '--https-proxy',
-            default='',
-            metavar='URL',
-            help='Use the specified HTTPS proxy.'
-        )
-        parser.add_argument(
-            '--cookies',
-            default=None,
-            help='The cookie file that will be used for authorization with youtube.'
-        )
-        return self._sanitize_video_ids(parser.parse_args(self._args))
-    def _sanitize_video_ids(self, args):
-        args.video_ids = [video_id.replace('\\', '') for video_id in args.video_ids]
-        return args

youtube_transcript_api2/_errors.py DELETED Viewed

@@ -1,112 +0,0 @@
-from ._settings import WATCH_URL
-class CouldNotRetrieveTranscript(Exception):
-    """
-    Raised if a transcript could not be retrieved.
-    """
-    ERROR_MESSAGE = '\nCould not retrieve a transcript for the video {video_url}!'
-    CAUSE_MESSAGE_INTRO = ' This is most likely caused by:\n\n{cause}'
-    CAUSE_MESSAGE = ''
-    GITHUB_REFERRAL = (
-        '\n\nIf you are sure that the described cause is not responsible for this error '
-        'and that a transcript should be retrievable, please create an issue at '
-        'https://github.com/jdepoix/youtube-transcript-api/issues. '
-        'Please add which version of youtube_transcript_api you are using '
-        'and provide the information needed to replicate the error. '
-        'Also make sure that there are no open issues which already describe your problem!'
-    )
-    def __init__(self, video_id):
-        self.video_id = video_id
-        super(CouldNotRetrieveTranscript, self).__init__(self._build_error_message())
-    def _build_error_message(self):
-        cause = self.cause
-        error_message = self.ERROR_MESSAGE.format(video_url=WATCH_URL.format(video_id=self.video_id))
-        if cause:
-            error_message += self.CAUSE_MESSAGE_INTRO.format(cause=cause) + self.GITHUB_REFERRAL
-        return error_message
-    @property
-    def cause(self):
-        return self.CAUSE_MESSAGE
-class YouTubeRequestFailed(CouldNotRetrieveTranscript):
-    CAUSE_MESSAGE = 'Request to YouTube failed: {reason}'
-    def __init__(self, video_id, http_error):
-        self.reason = str(http_error)
-        super(YouTubeRequestFailed, self).__init__(video_id)
-    @property
-    def cause(self):
-        return self.CAUSE_MESSAGE.format(
-            reason=self.reason,
-        )
-class VideoUnavailable(CouldNotRetrieveTranscript):
-    CAUSE_MESSAGE = 'The video is no longer available'
-class TooManyRequests(CouldNotRetrieveTranscript):
-    CAUSE_MESSAGE = (
-        'YouTube is receiving too many requests from this IP and now requires solving a captcha to continue. '
-        'One of the following things can be done to work around this:\n\
-        - Manually solve the captcha in a browser and export the cookie. '
-        'Read here how to use that cookie with '
-        'youtube-transcript-api: https://github.com/jdepoix/youtube-transcript-api#cookies\n\
-        - Use a different IP address\n\
-        - Wait until the ban on your IP has been lifted'
-    )
-class TranscriptsDisabled(CouldNotRetrieveTranscript):
-    CAUSE_MESSAGE = 'Subtitles are disabled for this video'
-class NoTranscriptAvailable(CouldNotRetrieveTranscript):
-    CAUSE_MESSAGE = 'No transcripts are available for this video'
-class NotTranslatable(CouldNotRetrieveTranscript):
-    CAUSE_MESSAGE = 'The requested language is not translatable'
-class TranslationLanguageNotAvailable(CouldNotRetrieveTranscript):
-    CAUSE_MESSAGE = 'The requested translation language is not available'
-class CookiePathInvalid(CouldNotRetrieveTranscript):
-    CAUSE_MESSAGE = 'The provided cookie file was unable to be loaded'
-class CookiesInvalid(CouldNotRetrieveTranscript):
-    CAUSE_MESSAGE = 'The cookies provided are not valid (may have expired)'
-class FailedToCreateConsentCookie(CouldNotRetrieveTranscript):
-    CAUSE_MESSAGE = 'Failed to automatically give consent to saving cookies'
-class NoTranscriptFound(CouldNotRetrieveTranscript):
-    CAUSE_MESSAGE = (
-        'No transcripts were found for any of the requested language codes: {requested_language_codes}\n\n'
-        '{transcript_data}'
-    )
-    def __init__(self, video_id, requested_language_codes, transcript_data):
-        self._requested_language_codes = requested_language_codes
-        self._transcript_data = transcript_data
-        super(NoTranscriptFound, self).__init__(video_id)
-    @property
-    def cause(self):
-        return self.CAUSE_MESSAGE.format(
-            requested_language_codes=self._requested_language_codes,
-            transcript_data=str(self._transcript_data),
-        )

youtube_transcript_api2/_html_unescaping.py DELETED Viewed

@@ -1,21 +0,0 @@
-import sys
-# This can only be tested by using different python versions, therefore it is not covered by coverage.py
-if sys.version_info.major == 3 and sys.version_info.minor >= 4: # pragma: no cover
-    # Python 3.4+
-    from html import unescape
-else: # pragma: no cover
-    if sys.version_info.major <= 2:
-        # Python 2
-        import HTMLParser
-        html_parser = HTMLParser.HTMLParser()
-    else:
-        # Python 3.0 - 3.3
-        import html.parser
-        html_parser = html.parser.HTMLParser()
-    def unescape(string):
-        return html_parser.unescape(string)

youtube_transcript_api2/_settings.py DELETED Viewed

@@ -1 +0,0 @@
-WATCH_URL = 'https://www.youtube.com/watch?v={video_id}'

youtube_transcript_api2/_transcripts.py DELETED Viewed

@@ -1,332 +0,0 @@
-import sys
-# This can only be tested by using different python versions, therefore it is not covered by coverage.py
-if sys.version_info.major == 2: # pragma: no cover
-    reload(sys)
-    sys.setdefaultencoding('utf-8')
-import json
-from xml.etree import ElementTree
-import re
-from requests import HTTPError
-from ._html_unescaping import unescape
-from ._errors import (
-    VideoUnavailable,
-    TooManyRequests,
-    YouTubeRequestFailed,
-    NoTranscriptFound,
-    TranscriptsDisabled,
-    NotTranslatable,
-    TranslationLanguageNotAvailable,
-    NoTranscriptAvailable,
-    FailedToCreateConsentCookie,
-)
-from ._settings import WATCH_URL
-def _raise_http_errors(response, video_id):
-    """
-    Passes a response through unchanged unless `raise_for_status()` reports an HTTP error, in which case the error
-    is re-raised as a YouTubeRequestFailed for the given video.
-    :param response: the response returned by the http client
-    :param video_id: the id of the video the request was made for
-    :type video_id: str
-    :return: the response that was passed in
-    :raises: YouTubeRequestFailed
-    """
-    try:
-        response.raise_for_status()
-        return response
-    except HTTPError as error:
-        # YouTubeRequestFailed.__init__ takes (video_id, http_error); passing them the other way round formats the
-        # error text into the watch URL and reports the video id as the failure reason.
-        raise YouTubeRequestFailed(video_id, error)
-class TranscriptListFetcher(object):
-    def __init__(self, http_client):
-        self._http_client = http_client
-    def fetch(self, video_id):
-        return TranscriptList.build(
-            self._http_client,
-            video_id,
-            self._extract_captions_json(self._fetch_video_html(video_id), video_id)
-        )
-    def _extract_captions_json(self, html, video_id):
-        splitted_html = html.split('"captions":')
-        if len(splitted_html) <= 1:
-            if 'class="g-recaptcha"' in html:
-                raise TooManyRequests(video_id)
-            if '"playabilityStatus":' not in html:
-                raise VideoUnavailable(video_id)
-            raise TranscriptsDisabled(video_id)
-        captions_json = json.loads(
-            splitted_html[1].split(',"videoDetails')[0].replace('\n', '')
-        ).get('playerCaptionsTracklistRenderer')
-        if captions_json is None:
-            raise TranscriptsDisabled(video_id)
-        if 'captionTracks' not in captions_json:
-            raise NoTranscriptAvailable(video_id)
-        return captions_json
-    def _create_consent_cookie(self, html, video_id):
-        match = re.search('name="v" value="(.*?)"', html)
-        if match is None:
-            raise FailedToCreateConsentCookie(video_id)
-        self._http_client.cookies.set('CONSENT', 'YES+' + match.group(1), domain='.youtube.com')
-    def _fetch_video_html(self, video_id):
-        html = self._fetch_html(video_id)
-        if 'action="https://consent.youtube.com/s"' in html:
-            self._create_consent_cookie(html, video_id)
-            html = self._fetch_html(video_id)
-            if 'action="https://consent.youtube.com/s"' in html:
-                raise FailedToCreateConsentCookie(video_id)
-        return html
-    def _fetch_html(self, video_id):
-        response = self._http_client.get(WATCH_URL.format(video_id=video_id))
-        return unescape(_raise_http_errors(response, video_id).text)
-class TranscriptList(object):
-    """
-    This object represents a list of transcripts. It can be iterated over to list all transcripts which are available
-    for a given YouTube video. Also it provides functionality to search for a transcript in a given language.
-    """
-    def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages):
-        """
-        The constructor is only for internal use. Use the static build method instead.
-        :param video_id: the id of the video this TranscriptList is for
-        :type video_id: str
-        :param manually_created_transcripts: dict mapping language codes to the manually created transcripts
-        :type manually_created_transcripts: dict[str, Transcript]
-        :param generated_transcripts: dict mapping language codes to the generated transcripts
-        :type generated_transcripts: dict[str, Transcript]
-        :param translation_languages: list of languages which can be used for translatable languages
-        :type translation_languages: list[dict[str, str]]
-        """
-        self.video_id = video_id
-        self._manually_created_transcripts = manually_created_transcripts
-        self._generated_transcripts = generated_transcripts
-        self._translation_languages = translation_languages
-    @staticmethod
-    def build(http_client, video_id, captions_json):
-        """
-        Factory method for TranscriptList.
-        :param http_client: http client which is used to make the transcript retrieving http calls
-        :type http_client: requests.Session
-        :param video_id: the id of the video this TranscriptList is for
-        :type video_id: str
-        :param captions_json: the JSON parsed from the YouTube pages static HTML
-        :type captions_json: dict
-        :return: the created TranscriptList
-        :rtype TranscriptList:
-        """
-        # 'translationLanguages' is read with a default, so a payload without that key yields an empty list of
-        # translation languages instead of raising a KeyError
-        translation_languages = [
-            {
-                'language': translation_language['languageName']['simpleText'],
-                'language_code': translation_language['languageCode'],
-            } for translation_language in captions_json.get('translationLanguages', [])
-        ]
-        manually_created_transcripts = {}
-        generated_transcripts = {}
-        for caption in captions_json['captionTracks']:
-            # tracks whose 'kind' is 'asr' are stored as generated, all others as manually created
-            is_generated = caption.get('kind', '') == 'asr'
-            transcript_dict = generated_transcripts if is_generated else manually_created_transcripts
-            transcript_dict[caption['languageCode']] = Transcript(
-                http_client,
-                video_id,
-                caption['baseUrl'],
-                caption['name']['simpleText'],
-                caption['languageCode'],
-                is_generated,
-                # only tracks flagged as translatable get the list of translation languages
-                translation_languages if caption.get('isTranslatable', False) else []
-            )
-        return TranscriptList(
-            video_id,
-            manually_created_transcripts,
-            generated_transcripts,
-            translation_languages,
-        )
-    def __iter__(self):
-        return iter(list(self._manually_created_transcripts.values()) + list(self._generated_transcripts.values()))
-    def find_transcript(self, language_codes):
-        """
-        Finds a transcript for a given language code. Manually created transcripts are returned first and only if none
-        are found, generated transcripts are used. If you only want generated transcripts use
-        `find_generated_transcript` instead.
-        :param language_codes: A list of language codes in a descending priority. For example, if this is set to
-        ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
-        it fails to do so.
-        :type language_codes: list[str]
-        :return: the found Transcript
-        :rtype Transcript:
-        :raises: NoTranscriptFound
-        """
-        return self._find_transcript(language_codes, [self._manually_created_transcripts, self._generated_transcripts])
-    def find_generated_transcript(self, language_codes):
-        """
-        Finds an automatically generated transcript for a given language code.
-        :param language_codes: A list of language codes in a descending priority. For example, if this is set to
-        ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
-        it fails to do so.
-        :type language_codes: list[str]
-        :return: the found Transcript
-        :rtype Transcript:
-        :raises: NoTranscriptFound
-        """
-        return self._find_transcript(language_codes, [self._generated_transcripts,])
-    def find_manually_created_transcript(self, language_codes):
-        """
-        Finds a manually created transcript for a given language code.
-        :param language_codes: A list of language codes in a descending priority. For example, if this is set to
-        ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
-        it fails to do so.
-        :type language_codes: list[str]
-        :return: the found Transcript
-        :rtype Transcript:
-        :raises: NoTranscriptFound
-        """
-        return self._find_transcript(language_codes, [self._manually_created_transcripts,])
-    def _find_transcript(self, language_codes, transcript_dicts):
-        for language_code in language_codes:
-            for transcript_dict in transcript_dicts:
-                if language_code in transcript_dict:
-                    return transcript_dict[language_code]
-        raise NoTranscriptFound(
-            self.video_id,
-            language_codes,
-            self
-        )
-    def __str__(self):
-        return (
-            'For this video ({video_id}) transcripts are available in the following languages:\n\n'
-            '(MANUALLY CREATED)\n'
-            '{available_manually_created_transcript_languages}\n\n'
-            '(GENERATED)\n'
-            '{available_generated_transcripts}\n\n'
-            '(TRANSLATION LANGUAGES)\n'
-            '{available_translation_languages}'
-        ).format(
-            video_id=self.video_id,
-            available_manually_created_transcript_languages=self._get_language_description(
-                str(transcript) for transcript in self._manually_created_transcripts.values()
-            ),
-            available_generated_transcripts=self._get_language_description(
-                str(transcript) for transcript in self._generated_transcripts.values()
-            ),
-            available_translation_languages=self._get_language_description(
-                '{language_code} ("{language}")'.format(
-                    language=translation_language['language'],
-                    language_code=translation_language['language_code'],
-                ) for translation_language in self._translation_languages
-            )
-        )
-    def _get_language_description(self, transcript_strings):
-        description = '\n'.join(' - {transcript}'.format(transcript=transcript) for transcript in transcript_strings)
-        return description if description else 'None'
-class Transcript(object):
-    def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages):
-        """
-        You probably don't want to initialize this directly. Usually you'll access Transcript objects using a
-        TranscriptList.
-        :param http_client: http client which is used to make the transcript retrieving http calls
-        :type http_client: requests.Session
-        :param video_id: the id of the video this Transcript is for
-        :type video_id: str
-        :param url: the url which needs to be called to fetch the transcript
-        :param language: the name of the language this transcript uses
-        :param language_code:
-        :param is_generated:
-        :param translation_languages:
-        """
-        self._http_client = http_client
-        self.video_id = video_id
-        self._url = url
-        self.language = language
-        self.language_code = language_code
-        self.is_generated = is_generated
-        self.translation_languages = translation_languages
-        self._translation_languages_dict = {
-            translation_language['language_code']: translation_language['language']
-            for translation_language in translation_languages
-        }
-    def fetch(self):
-        """
-        Loads the actual transcript data.
-        :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
-        :rtype [{'text': str, 'start': float, 'duration': float}]:
-        """
-        response = self._http_client.get(self._url)
-        return _TranscriptParser().parse(
-            _raise_http_errors(response, self.video_id).text,
-        )
-    def __str__(self):
-        return '{language_code} ("{language}"){translation_description}'.format(
-            language=self.language,
-            language_code=self.language_code,
-            translation_description='[TRANSLATABLE]' if self.is_translatable else ''
-        )
-    @property
-    def is_translatable(self):
-        return len(self.translation_languages) > 0
-    def translate(self, language_code):
-        if not self.is_translatable:
-            raise NotTranslatable(self.video_id)
-        if language_code not in self._translation_languages_dict:
-            raise TranslationLanguageNotAvailable(self.video_id)
-        return Transcript(
-            self._http_client,
-            self.video_id,
-            '{url}&tlang={language_code}'.format(url=self._url, language_code=language_code),
-            self._translation_languages_dict[language_code],
-            language_code,
-            True,
-            [],
-        )
-class _TranscriptParser(object):
-    HTML_TAG_REGEX = re.compile(r'<[^>]*>', re.IGNORECASE)
-    def parse(self, plain_data):
-        return [
-            {
-                'text': re.sub(self.HTML_TAG_REGEX, '', unescape(xml_element.text)),
-                'start': float(xml_element.attrib['start']),
-                'duration': float(xml_element.attrib.get('dur', '0.0')),
-            }
-            for xml_element in ElementTree.fromstring(plain_data)
-            if xml_element.text is not None
-        ]

youtube_transcript_api2/formatters.py DELETED Viewed

@@ -1,165 +0,0 @@
-import json
-import pprint
-class Formatter(object):
-    """Abstract base for transcript formatters.
-    Subclasses override .format_transcript() and .format_transcripts(); both
-    methods raise NotImplementedError here. The concrete formatters in this
-    module treat a transcript as a list of dictionaries.
-    """
-    def format_transcript(self, transcript, **kwargs):
-        """Formats a single transcript; has to be overridden by subclasses."""
-        message = 'A subclass of Formatter must implement their own .format_transcript() method.'
-        raise NotImplementedError(message)
-    def format_transcripts(self, transcripts, **kwargs):
-        """Formats a list of transcripts; has to be overridden by subclasses."""
-        message = 'A subclass of Formatter must implement their own .format_transcripts() method.'
-        raise NotImplementedError(message)
-class PrettyPrintFormatter(Formatter):
-    """Formatter that renders transcripts with pprint.pformat."""
-    def format_transcript(self, transcript, **kwargs):
-        """Pretty prints a transcript.
-        :param transcript: the transcript to render
-        :param kwargs: forwarded unchanged to pprint.pformat
-        :return: the string produced by pprint.pformat
-        """
-        rendered = pprint.pformat(transcript, **kwargs)
-        return rendered
-    def format_transcripts(self, transcripts, **kwargs):
-        """Pretty prints a list of transcripts.
-        :param transcripts: the transcripts to render as one object
-        :param kwargs: forwarded unchanged to .format_transcript()
-        :return: the string produced for the whole list
-        """
-        # The list is handed over as a single object rather than item by item.
-        rendered = self.format_transcript(transcripts, **kwargs)
-        return rendered
-class JSONFormatter(Formatter):
-    """Formatter that serializes transcripts with json.dumps."""
-    def format_transcript(self, transcript, **kwargs):
-        """Converts a transcript into a JSON string.
-        :param transcript: the transcript to serialize
-        :param kwargs: forwarded unchanged to json.dumps
-        :return: the JSON document as a string
-        """
-        serialized = json.dumps(transcript, **kwargs)
-        return serialized
-    def format_transcripts(self, transcripts, **kwargs):
-        """Converts a list of transcripts into a JSON string.
-        :param transcripts: the transcripts to serialize as one object
-        :param kwargs: forwarded unchanged to .format_transcript()
-        :return: the JSON document as a string
-        """
-        # The list is handed over as a single object rather than item by item.
-        serialized = self.format_transcript(transcripts, **kwargs)
-        return serialized
-class TextFormatter(Formatter):
-    """Formatter that keeps only the text of every transcript entry."""
-    def format_transcript(self, transcript, **kwargs):
-        """Converts a transcript into plain text with no timestamps.
-        :param transcript: iterable of entries providing a 'text' key
-        :return: the 'text' values joined by single newlines
-        """
-        text_lines = [entry['text'] for entry in transcript]
-        return '\n'.join(text_lines)
-    def format_transcripts(self, transcripts, **kwargs):
-        """Converts a list of transcripts into plain text with no timestamps.
-        :param transcripts: iterable of transcripts
-        :param kwargs: forwarded unchanged to .format_transcript()
-        :return: the formatted transcripts joined by three newlines
-        """
-        rendered = (self.format_transcript(item, **kwargs) for item in transcripts)
-        return '\n\n\n'.join(rendered)
-class WebVTTFormatter(Formatter):
-    """Formatter that renders transcripts as WEBVTT caption text."""
-    def _seconds_to_timestamp(self, time):
-        """Helper that converts `time` into a transcript cue timestamp.
-        The value is turned into whole milliseconds once and then split with
-        integer arithmetic, so no field is rounded independently of the others
-        (formatting the float seconds directly turned 6.93 into '00:00:07.930').
-        :reference: https://www.w3.org/TR/webvtt1/#webvtt-timestamp
-        :param time: time in seconds; anything float() accepts.
-        :type time: float
-        :return: a string formatted as a cue timestamp, 'HH:MM:SS.mmm'
-        :rtype str
-        :example:
-        >>> self._seconds_to_timestamp(6.93)
-        '00:00:06.930'
-        """
-        # round(..., 2) absorbs binary floating point noise (e.g. 929.9999999999998)
-        # before int() truncates to whole milliseconds.
-        total_ms = int(round(float(time) * 1000, 2))
-        total_secs, ms = divmod(total_ms, 1000)
-        total_mins, secs = divmod(total_secs, 60)
-        hours, mins = divmod(total_mins, 60)
-        return "{:02d}:{:02d}:{:02d}.{:03d}".format(hours, mins, secs, ms)
-    def format_transcript(self, transcript, **kwargs):
-        """A basic implementation of WEBVTT formatting.
-        :param transcript: list of entries with the keys 'text', 'start' and 'duration'
-        :return: the WEBVTT document as a string
-        :reference: https://www.w3.org/TR/webvtt1/#introduction-caption
-        """
-        lines = []
-        last_index = len(transcript) - 1
-        for i, line in enumerate(transcript):
-            if i < last_index:
-                # Looks ahead, use next start time since duration value
-                # would create an overlap between start times.
-                end = transcript[i + 1]['start']
-            else:
-                # Reached the end, cannot look ahead, use duration now.
-                end = line['start'] + line['duration']
-            lines.append("{} --> {}\n{}".format(
-                self._seconds_to_timestamp(line['start']),
-                self._seconds_to_timestamp(end),
-                line['text'],
-            ))
-        return "WEBVTT\n\n" + "\n\n".join(lines) + "\n"
-    def format_transcripts(self, transcripts, **kwargs):
-        """A basic implementation of WEBVTT formatting for a list of transcripts.
-        :param transcripts: iterable of transcripts
-        :param kwargs: forwarded unchanged to .format_transcript()
-        :return: the formatted transcripts joined by three newlines
-        :reference: https://www.w3.org/TR/webvtt1/#introduction-caption
-        """
-        return '\n\n\n'.join([self.format_transcript(transcript, **kwargs) for transcript in transcripts])
-class FormatterLoader(object):
-    """Looks up a Formatter class by its type name and instantiates it."""
-    # Supported type names and the formatter class each one stands for.
-    TYPES = {
-        'json': JSONFormatter,
-        'pretty': PrettyPrintFormatter,
-        'text': TextFormatter,
-        'webvtt': WebVTTFormatter,
-    }
-    class UnknownFormatterType(Exception):
-        """Raised by load() when the requested type is not a key of TYPES."""
-        def __init__(self, formatter_type):
-            supported = ', '.join(FormatterLoader.TYPES.keys())
-            template = (
-                'The format \'{formatter_type}\' is not supported. '
-                'Choose one of the following formats: {supported_formatter_types}'
-            )
-            message = template.format(
-                formatter_type=formatter_type,
-                supported_formatter_types=supported,
-            )
-            super(FormatterLoader.UnknownFormatterType, self).__init__(message)
-    def load(self, formatter_type='pretty'):
-        """
-        Loads the Formatter for the given formatter type.
-        :param formatter_type: one of the keys of TYPES, 'pretty' by default
-        :return: a new instance of the matching Formatter class
-        :raises FormatterLoader.UnknownFormatterType: if the type is not a key of TYPES
-        """
-        registry = FormatterLoader.TYPES
-        if formatter_type in registry.keys():
-            formatter_class = registry[formatter_type]
-            return formatter_class()
-        raise FormatterLoader.UnknownFormatterType(formatter_type)