dwb2023 committed
Commit 554c0b5 · verified · 1 Parent(s): 0c669fb

Update app.py

update max length
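The change swaps parsing yt-dlp's "duration_string" (H:M:S text) for its numeric "duration" field, which is already in seconds, and corrects the strftime format from "%HH:%MM:%SS" to "%H:%M:%S". A minimal sketch of the new check in isolation, with a hypothetical info dict standing in for the metadata returned by YoutubeDL().extract_info(...):

import time

YT_LENGTH_LIMIT_S = 4800  # 1 hour 20 minutes

# Hypothetical yt-dlp metadata; "duration" is the video length in seconds.
info = {"duration": 5400}

file_length = info["duration"]
if file_length > YT_LENGTH_LIMIT_S:
    yt_length_limit_hms = time.strftime("%H:%M:%S", time.gmtime(YT_LENGTH_LIMIT_S))
    file_length_hms = time.strftime("%H:%M:%S", time.gmtime(file_length))
    # The app raises gr.Error with this message; printed here for illustration.
    print(f"Maximum YouTube length is {yt_length_limit_hms}, got {file_length_hms}.")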

Files changed (1)
  1. app.py +9 -13
app.py CHANGED
@@ -4,27 +4,29 @@ from transformers import pipeline
 from transformers.pipelines.audio_utils import ffmpeg_read
 import torch
 from huggingface_hub import CommitScheduler
+import spaces
 import tempfile
 import os
 import json
-import spaces
 from datetime import datetime
 from pathlib import Path
 from uuid import uuid4
 
 MODEL_NAME = "openai/whisper-large-v3"
 BATCH_SIZE = 8
-YT_LENGTH_LIMIT_S = 4800  # 1 hour limit
+YT_LENGTH_LIMIT_S = 4800  # 1 hour 20 minutes
 
 device = 0 if torch.cuda.is_available() else "cpu"
 pipe = pipeline(task="automatic-speech-recognition", model=MODEL_NAME, chunk_length_s=30, device=device)
 
+# Define paths and create directory if not exists
 JSON_DATASET_DIR = Path("json_dataset")
 JSON_DATASET_DIR.mkdir(parents=True, exist_ok=True)
 JSON_DATASET_PATH = JSON_DATASET_DIR / f"transcriptions-{uuid4()}.json"
 
+# Initialize CommitScheduler for saving data to Hugging Face Dataset
 scheduler = CommitScheduler(
-    repo_id="transcript-dataset",
+    repo_id="your-huggingface-dataset-repo",
     repo_type="dataset",
     folder_path=JSON_DATASET_DIR,
     path_in_repo="data",
@@ -43,16 +45,10 @@ def download_yt_audio(yt_url, filename):
         info = info_loader.extract_info(yt_url, download=False)
     except youtube_dl.utils.DownloadError as err:
         raise gr.Error(str(err))
-    file_length = info["duration_string"]
-    file_h_m_s = list(map(int, file_length.split(":")))
-    if len(file_h_m_s) == 1:
-        file_h_m_s.insert(0, 0)
-    if len(file_h_m_s) == 2:
-        file_h_m_s.insert(0, 0)
-    file_length_s = sum(x * 60 ** i for i, x in enumerate(reversed(file_h_m_s)))
-    if file_length_s > YT_LENGTH_LIMIT_S:
-        yt_length_limit_hms = time.strftime("%HH:%MM:%SS", time.gmtime(YT_LENGTH_LIMIT_S))
-        file_length_hms = time.strftime("%HH:%MM:%SS", time.gmtime(file_length_s))
+    file_length = info["duration"]
+    if file_length > YT_LENGTH_LIMIT_S:
+        yt_length_limit_hms = time.strftime("%H:%M:%S", time.gmtime(YT_LENGTH_LIMIT_S))
+        file_length_hms = time.strftime("%H:%M:%S", time.gmtime(file_length))
         raise gr.Error(f"Maximum YouTube length is {yt_length_limit_hms}, got {file_length_hms} YouTube video.")
     ydl_opts = {"outtmpl": filename, "format": "bestaudio/best"}
     with youtube_dl.YoutubeDL(ydl_opts) as ydl:
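
For context, a CommitScheduler configured like the one above is typically paired with a small append helper that writes JSON records under the scheduler's lock. A minimal sketch of that pattern; the helper name and record fields are illustrative and not part of this commit, and the repo_id placeholder must point at a real dataset repo:

import json
from datetime import datetime
from pathlib import Path
from uuid import uuid4

from huggingface_hub import CommitScheduler

JSON_DATASET_DIR = Path("json_dataset")
JSON_DATASET_DIR.mkdir(parents=True, exist_ok=True)
JSON_DATASET_PATH = JSON_DATASET_DIR / f"transcriptions-{uuid4()}.json"

scheduler = CommitScheduler(
    repo_id="your-huggingface-dataset-repo",  # placeholder, as in the diff
    repo_type="dataset",
    folder_path=JSON_DATASET_DIR,
    path_in_repo="data",
)

def save_transcription(transcription: str) -> None:
    # scheduler.lock prevents writes while a background commit is in progress.
    with scheduler.lock:
        with JSON_DATASET_PATH.open("a") as f:
            json.dump({"transcription": transcription,
                       "datetime": datetime.now().isoformat()}, f)
            f.write("\n")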