cstr committed on
Commit 4347dae
1 Parent(s): 73225e0
Files changed (1)
  1. app.py +32 -31
app.py CHANGED
@@ -95,37 +95,42 @@ def download_youtube_audio(url, method_choice):
 
 def yt_dlp_method(url):
     """
-    Downloads YouTube audio using yt-dlp.
-
+    Downloads YouTube audio using yt-dlp and saves it to a temporary file.
+
     Args:
         url (str): The YouTube URL.
-
+
     Returns:
         str: Path to the downloaded audio file.
     """
     logging.info("Using yt-dlp method")
+    temp_dir = tempfile.mkdtemp()
+    output_template = os.path.join(temp_dir, '%(id)s.%(ext)s')
     ydl_opts = {
         'format': 'bestaudio/best',
+        'outtmpl': output_template,
         'postprocessors': [{
             'key': 'FFmpegExtractAudio',
             'preferredcodec': 'mp3',
             'preferredquality': '192',
         }],
-        'outtmpl': '%(id)s.%(ext)s',
+        'quiet': True,
+        'no_warnings': True,
     }
     with yt_dlp.YoutubeDL(ydl_opts) as ydl:
         info = ydl.extract_info(url, download=True)
-        output_file = f"{info['id']}.mp3"
+        output_file = ydl.prepare_filename(info)
+        output_file = os.path.splitext(output_file)[0] + '.mp3'
     logging.info(f"Downloaded YouTube audio: {output_file}")
     return output_file
 
 def pytube_method(url):
     """
-    Downloads audio using pytube.
-
+    Downloads audio from a YouTube URL using pytube and saves it to a temporary file.
+
     Args:
         url (str): The YouTube URL.
-
+
     Returns:
         str: Path to the downloaded audio file.
     """
@@ -133,7 +138,8 @@ def pytube_method(url):
     from pytube import YouTube
     yt = YouTube(url)
     audio_stream = yt.streams.filter(only_audio=True).first()
-    out_file = audio_stream.download()
+    temp_dir = tempfile.mkdtemp()
+    out_file = audio_stream.download(output_path=temp_dir)
     base, ext = os.path.splitext(out_file)
     new_file = base + '.mp3'
     os.rename(out_file, new_file)
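Both helpers now download into a directory created with tempfile.mkdtemp(), so the caller is responsible for removing that directory once the audio file is no longer needed. A minimal usage sketch (not part of app.py; the URL is a placeholder and the cleanup strategy is only one option), assuming yt_dlp_method from above is in scope:

import os
import shutil

audio_file = yt_dlp_method("https://www.youtube.com/watch?v=EXAMPLE_ID")  # placeholder URL
print(f"Downloaded to {audio_file}")
# ... use the file, e.g. feed it to transcribe_audio() ...
shutil.rmtree(os.path.dirname(audio_file), ignore_errors=True)  # remove the temp directory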
@@ -396,10 +402,10 @@ loaded_models = {}
 
 def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, download_method, start_time=None, end_time=None, verbose=False):
     """
-    Transcribes audio from a given source using the specified pipeline and model.
+    Transcribes audio from a given URL using the specified pipeline and model.
 
     Args:
-        input_source (str or file): URL of audio, path to local file, or uploaded file object.
+        input_source (str): URL of the audio.
         pipeline_type (str): Type of pipeline to use ('faster-batched', 'faster-sequenced', or 'transformers').
         model_id (str): The ID of the model to use.
         dtype (str): Data type for model computations ('int8', 'float16', or 'float32').
@@ -424,32 +430,22 @@ def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, d
     if verbose:
         yield verbose_messages, "", None
 
-    # Determine if input_source is a URL or file
-    audio_path = None
-    is_temp_file = False
-
-    if isinstance(input_source, str) and (input_source.startswith('http://') or input_source.startswith('https://')):
-        # Input source is a URL
-        audio_path, is_temp_file = download_audio(input_source, download_method)
-        if not audio_path or audio_path.startswith("Error"):
-            yield f"Error downloading audio: {audio_path}", "", None
-            return
-    elif isinstance(input_source, str) and os.path.exists(input_source):
-        # Input source is a local file path
-        audio_path = input_source
-        is_temp_file = False
-    elif hasattr(input_source, 'name'):
-        # Input source is an uploaded file object
-        audio_path = input_source.name
-        is_temp_file = False
-    else:
-        yield "No valid audio source provided.", "", None
+    # Input source is expected to be a URL
+    if not input_source or not input_source.strip():
+        yield "No audio URL provided.", "", None
+        return
+
+    # Download the audio from the URL
+    audio_path, is_temp_file = download_audio(input_source, download_method)
+    if not audio_path or audio_path.startswith("Error"):
+        yield f"Error downloading audio: {audio_path}", "", None
         return
 
     # Convert start_time and end_time to float or None
     start_time = float(start_time) if start_time else None
     end_time = float(end_time) if end_time else None
 
+    # Trim the audio if start or end times are provided
     if start_time is not None or end_time is not None:
         audio_path = trim_audio(audio_path, start_time, end_time)
         is_temp_file = True  # The trimmed audio is a temporary file
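trim_audio() itself is defined elsewhere in app.py and is not shown in this diff. For orientation only, a hypothetical sketch of such a helper built on the ffmpeg CLI (which the yt-dlp postprocessor above already requires); this is an assumption, not the app's actual implementation:

import os
import subprocess
import tempfile

def trim_audio_sketch(audio_path, start_time=None, end_time=None):
    # Write the trimmed clip to a fresh temporary file, mirroring is_temp_file=True above.
    fd, out_path = tempfile.mkstemp(suffix=".mp3")
    os.close(fd)
    cmd = ["ffmpeg", "-y", "-i", audio_path]
    if start_time is not None:
        cmd += ["-ss", str(start_time)]
    if end_time is not None:
        cmd += ["-to", str(end_time)]
    cmd += ["-c", "copy", out_path]
    subprocess.run(cmd, check=True, capture_output=True)
    return out_path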
@@ -463,6 +459,7 @@ def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, d
         model_or_pipeline = loaded_models[model_key]
         logging.info("Loaded model from cache")
     else:
+        # Load the appropriate model or pipeline based on the pipeline type
         if pipeline_type == "faster-batched":
             model = WhisperModel(model_id, device=device, compute_type=dtype)
             model_or_pipeline = BatchedInferencePipeline(model=model)
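The cache above keeps each loaded model or pipeline in the module-level loaded_models dict, so repeated requests with the same settings skip the expensive load. A condensed sketch of the pattern for the 'faster-batched' case; the exact makeup of model_key in app.py is assumed here to be the pipeline type, model ID and dtype:

from faster_whisper import WhisperModel, BatchedInferencePipeline

loaded_models = {}

def get_or_load_batched(model_id, dtype, device="auto"):
    model_key = ("faster-batched", model_id, dtype)  # assumed cache key
    if model_key not in loaded_models:
        model = WhisperModel(model_id, device=device, compute_type=dtype)
        loaded_models[model_key] = BatchedInferencePipeline(model=model)
    return loaded_models[model_key]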
@@ -495,6 +492,7 @@ def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, d
             raise ValueError("Invalid pipeline type")
         loaded_models[model_key] = model_or_pipeline  # Cache the model or pipeline
 
+    # Perform the transcription
     start_time_perf = time.time()
     if pipeline_type == "faster-batched":
         segments, info = model_or_pipeline.transcribe(audio_path, batch_size=batch_size)
@@ -505,6 +503,7 @@ def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, d
         segments = result["chunks"]
     end_time_perf = time.time()
 
+    # Calculate metrics
     transcription_time = end_time_perf - start_time_perf
     audio_file_size = os.path.getsize(audio_path) / (1024 * 1024)
 
@@ -516,6 +515,7 @@ def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, d
     if verbose:
         yield verbose_messages + metrics_output, "", None
 
+    # Compile the transcription text
     transcription = ""
 
     for segment in segments:
@@ -527,6 +527,7 @@ def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, d
     if verbose:
         yield verbose_messages + metrics_output, transcription, None
 
+    # Save the transcription to a file
     transcription_file = save_transcription(transcription)
     yield verbose_messages + metrics_output, transcription, transcription_file
 
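transcribe_audio() is a generator: it yields (status_text, transcription, transcription_file) tuples as work progresses, and the last yield carries the final transcription and the saved file path. A hypothetical caller sketch; the argument values are illustrative and the "yt-dlp" method name is an assumption about the download_method strings accepted by the app:

result = None
for status, text, file_path in transcribe_audio(
        "https://www.youtube.com/watch?v=EXAMPLE_ID",  # placeholder URL
        pipeline_type="faster-batched",
        model_id="large-v3",
        dtype="float16",
        batch_size=8,
        download_method="yt-dlp",  # assumed method name
        verbose=True,
):
    result = (status, text, file_path)
    print(status)

status_log, transcription, transcription_file = result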