dwb2023 committed
Commit 554c0b5 · verified · 1 Parent(s): 0c669fb

Update app.py

update max length
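The change swaps parsing yt-dlp's "duration_string" (H:M:S text) for its numeric "duration" field, which is already in seconds, and corrects the strftime format from "%HH:%MM:%SS" to "%H:%M:%S". A minimal sketch of the new check in isolation, with a hypothetical info dict standing in for the metadata returned by YoutubeDL().extract_info(...):

import time

YT_LENGTH_LIMIT_S = 4800  # 1 hour 20 minutes

# Hypothetical yt-dlp metadata; "duration" is the video length in seconds.
info = {"duration": 5400}

file_length = info["duration"]
if file_length > YT_LENGTH_LIMIT_S:
    yt_length_limit_hms = time.strftime("%H:%M:%S", time.gmtime(YT_LENGTH_LIMIT_S))
    file_length_hms = time.strftime("%H:%M:%S", time.gmtime(file_length))
    # The app raises gr.Error with this message; printed here for illustration.
    print(f"Maximum YouTube length is {yt_length_limit_hms}, got {file_length_hms}.")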

Files changed (1)
  1. app.py +9 -13
app.py CHANGED
@@ -4,27 +4,29 @@ from transformers import pipeline
 from transformers.pipelines.audio_utils import ffmpeg_read
 import torch
 from huggingface_hub import CommitScheduler
+import spaces
 import tempfile
 import os
 import json
-import spaces
 from datetime import datetime
 from pathlib import Path
 from uuid import uuid4
 
 MODEL_NAME = "openai/whisper-large-v3"
 BATCH_SIZE = 8
-YT_LENGTH_LIMIT_S = 4800  # 1 hour limit
+YT_LENGTH_LIMIT_S = 4800  # 1 hour 20 minutes
 
 device = 0 if torch.cuda.is_available() else "cpu"
 pipe = pipeline(task="automatic-speech-recognition", model=MODEL_NAME, chunk_length_s=30, device=device)
 
+# Define paths and create directory if not exists
 JSON_DATASET_DIR = Path("json_dataset")
 JSON_DATASET_DIR.mkdir(parents=True, exist_ok=True)
 JSON_DATASET_PATH = JSON_DATASET_DIR / f"transcriptions-{uuid4()}.json"
 
+# Initialize CommitScheduler for saving data to Hugging Face Dataset
 scheduler = CommitScheduler(
-    repo_id="transcript-dataset",
+    repo_id="your-huggingface-dataset-repo",
     repo_type="dataset",
     folder_path=JSON_DATASET_DIR,
     path_in_repo="data",
@@ -43,16 +45,10 @@ def download_yt_audio(yt_url, filename):
         info = info_loader.extract_info(yt_url, download=False)
     except youtube_dl.utils.DownloadError as err:
         raise gr.Error(str(err))
-    file_length = info["duration_string"]
-    file_h_m_s = list(map(int, file_length.split(":")))
-    if len(file_h_m_s) == 1:
-        file_h_m_s.insert(0, 0)
-    if len(file_h_m_s) == 2:
-        file_h_m_s.insert(0, 0)
-    file_length_s = sum(x * 60 ** i for i, x in enumerate(reversed(file_h_m_s)))
-    if file_length_s > YT_LENGTH_LIMIT_S:
-        yt_length_limit_hms = time.strftime("%HH:%MM:%SS", time.gmtime(YT_LENGTH_LIMIT_S))
-        file_length_hms = time.strftime("%HH:%MM:%SS", time.gmtime(file_length_s))
+    file_length = info["duration"]
+    if file_length > YT_LENGTH_LIMIT_S:
+        yt_length_limit_hms = time.strftime("%H:%M:%S", time.gmtime(YT_LENGTH_LIMIT_S))
+        file_length_hms = time.strftime("%H:%M:%S", time.gmtime(file_length))
         raise gr.Error(f"Maximum YouTube length is {yt_length_limit_hms}, got {file_length_hms} YouTube video.")
     ydl_opts = {"outtmpl": filename, "format": "bestaudio/best"}
     with youtube_dl.YoutubeDL(ydl_opts) as ydl:
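
For context, a CommitScheduler configured like the one above is typically paired with a small append helper that writes JSON records under the scheduler's lock. A minimal sketch of that pattern; the helper name and record fields are illustrative and not part of this commit, and the repo_id placeholder must point at a real dataset repo:

import json
from datetime import datetime
from pathlib import Path
from uuid import uuid4

from huggingface_hub import CommitScheduler

JSON_DATASET_DIR = Path("json_dataset")
JSON_DATASET_DIR.mkdir(parents=True, exist_ok=True)
JSON_DATASET_PATH = JSON_DATASET_DIR / f"transcriptions-{uuid4()}.json"

scheduler = CommitScheduler(
    repo_id="your-huggingface-dataset-repo",  # placeholder, as in the diff
    repo_type="dataset",
    folder_path=JSON_DATASET_DIR,
    path_in_repo="data",
)

def save_transcription(transcription: str) -> None:
    # scheduler.lock prevents writes while a background commit is in progress.
    with scheduler.lock:
        with JSON_DATASET_PATH.open("a") as f:
            json.dump({"transcription": transcription,
                       "datetime": datetime.now().isoformat()}, f)
            f.write("\n")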